Added Japanese and Bulgarian addressing

This commit is contained in:
2026-01-30 21:34:24 +02:00
parent 38a986658d
commit 7368ef0cd5

View File

@@ -40,15 +40,16 @@ async def is_miku_addressed(message) -> bool:
except Exception as e: except Exception as e:
logger.warning(f"Could not fetch referenced message: {e}") logger.warning(f"Could not fetch referenced message: {e}")
cleaned = message.content.strip().lower() cleaned = message.content.strip()
cleaned_lower = cleaned.lower()
# Base names for Miku in different scripts # Base names for Miku in different scripts
base_names = [ base_names = [
'miku', 'мику', 'みく', 'ミク', '未来' 'miku', 'мику', 'みく', 'ミク', '未来'
] ]
# Japanese honorifics - all scripts combined for simpler matching # Japanese honorifics - all scripts combined
honorifics_all_scripts = [ honorifics = [
# Latin # Latin
'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika', 'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei', 'senpai', 'jou', 'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei', 'senpai', 'jou',
@@ -59,51 +60,74 @@ async def is_miku_addressed(message) -> bool:
'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン', 'ヘイカ', 'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン', 'ヘイカ',
'デンカ', 'カッカ', '', 'チャマ', 'キュン', 'ドノ', 'センセイ', 'センパイ', 'ジョウ', 'デンカ', 'カッカ', '', 'チャマ', 'キュン', 'ドノ', 'センセイ', 'センパイ', 'ジョウ',
# Cyrillic # Cyrillic
'чан', 'сан', 'кун', 'ньян', 'химе', 'тан', 'чин', 'хэйка', 'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин', 'хейка', 'хеика',
'дэнка', 'какка', 'си', 'чама', 'кюн', 'доно', 'сэнсэй', 'сэнпай', 'жо' 'денка', 'какка', 'си', 'чама', 'кюн', 'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо'
] ]
# Optional o- prefix in different scripts # o- prefix variants
o_prefixes = ['o-', 'о-', '', ''] o_prefixes = ['o-', 'о-', '', '']
# Strategy: Just check if any base name appears (case insensitive for latin/cyrillic) # Build all possible name variations to check
# Then allow any honorific to optionally follow name_patterns = []
for base in base_names: for base in base_names:
base_lower = base.lower() base_lower = base.lower()
base_escaped = re.escape(base_lower)
# Check for just the base name # Base name alone
if re.search(r'(?<![a-zа-яa-я\w])' + re.escape(base_lower) + r'(?![a-zа-яa-я\w])', cleaned): name_patterns.append(base_escaped)
return True
# Check with optional o- prefix # With honorifics (allows optional dash/space between)
for prefix in o_prefixes: for honorific in honorifics:
prefix_pattern = prefix.lower() if prefix != '' and prefix != '' else prefix
pattern = r'(?<![a-zа-яa-я\w])' + re.escape(prefix_pattern) + r'\s*' + re.escape(base_lower) + r'(?![a-zа-яa-я\w])'
if re.search(pattern, cleaned):
return True
# Check base name followed by any honorific (no spacing requirement to catch mixed script)
for honorific in honorifics_all_scripts:
honorific_lower = honorific.lower() honorific_lower = honorific.lower()
# Allow optional dash, space, or no separator between name and honorific honorific_escaped = re.escape(honorific_lower)
pattern = (r'(?<![a-zа-яa-я\w])' + re.escape(base_lower) + # Build pattern: base + optional [dash or space] + honorific
r'[-\s]*' + re.escape(honorific_lower) + name_patterns.append(base_escaped + r'[\-\s]*' + honorific_escaped)
r'(?![a-zа-яa-я\w])')
if re.search(pattern, cleaned):
return True
# Check with o- prefix + base + honorific # With o- prefix
for prefix in o_prefixes: for prefix in o_prefixes:
prefix_lower = prefix.lower() if prefix != '' and prefix != '' else prefix prefix_lower = prefix.lower()
for honorific in honorifics_all_scripts: prefix_escaped = re.escape(prefix_lower)
# o-prefix + optional space + base
name_patterns.append(prefix_escaped + r'\s*' + base_escaped)
# With o- prefix + honorific
for honorific in honorifics:
honorific_lower = honorific.lower() honorific_lower = honorific.lower()
pattern = (r'(?<![a-zа-яa-я\w])' + re.escape(prefix_lower) + honorific_escaped = re.escape(honorific_lower)
r'[-\s]*' + re.escape(base_lower) + # o-prefix + space + base + dash/space + honorific
r'[-\s]*' + re.escape(honorific_lower) + name_patterns.append(prefix_escaped + r'\s*' + base_escaped + r'[\-\s]*' + honorific_escaped)
r'(?![a-zа-яa-я\w])')
if re.search(pattern, cleaned): # Check all patterns - she must be "addressed" not just mentioned
return True for pattern in name_patterns:
try:
# Pattern 1: Start of message + punctuation/end
# "Miku, ..." or "みく!" or "ミクちゃん、..."
start_p = r'^' + pattern + r'(?:[,,、!?.。\s]+|$)'
if re.search(start_p, cleaned_lower, re.IGNORECASE):
return True
# Pattern 2: End of message (optionally preceded by punctuation)
# "..., Miku" or "...みく" or "...ミクちゃん!"
end_p = r'(?:[,,、!?.。\s]+|^)' + pattern + r'[!?.。\s]*$'
if re.search(end_p, cleaned_lower, re.IGNORECASE):
return True
# Pattern 3: Middle (surrounded by punctuation)
# "..., Miku, ..." or "...、ミク、..."
middle_p = r'[,,、!?.。\s]+' + pattern + r'[,,、!?.。\s]+'
if re.search(middle_p, cleaned_lower, re.IGNORECASE):
return True
# Pattern 4: Just the name alone
# "Miku" or "みく!" or "ミクちゃん"
alone_p = r'^\s*' + pattern + r'[!?.。]*\s*$'
if re.search(alone_p, cleaned_lower, re.IGNORECASE):
return True
except re.error as e:
# Log the problematic pattern and skip it
logger.error(f"REGEX ERROR - Pattern: '{pattern}' | Start regex: '{start_p}' | Error: {e}")
continue
return False return False