Files
miku-discord/bot/utils/core.py
koko210Serve a226bc41df Rewrite is_miku_addressed() to only trigger when addressed, not mentioned
- Pre-compile 393 name variants into 4 regex patterns at module load
  (was 7,300+ raw re.search() calls per message)
- Strict addressing detection using punctuation context:
  START:  name at beginning + punctuation (Miku, ... / みく!...)
  END:    comma + name at end (..., Miku / ...、ミク)
  MIDDLE: commas on both sides - vocative (..., Miku, ...)
  ALONE:  name is the entire message (Miku! / ミクちゃん)
- Rejects mere mentions: 'I like Miku' / 'Miku is cool' no longer trigger
- Script-family-aware pattern generation (Latin, Cyrillic, Japanese)
  eliminates nonsensical cross-script combos (e.g. o-みく)
- Word boundary enforcement prevents substring matches (mikumiku)
- Fixes regex 'unbalanced parenthesis' errors from old implementation
- Add comprehensive test suite (94 cases, all passing)
2026-03-03 12:42:33 +02:00

166 lines
6.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# utils/core.py
#
# Detects whether a Discord message is **addressed to** Miku
# (as opposed to merely mentioning her).
import re
from utils.logger import get_logger
logger = get_logger('core')
# ────────────────────────────────────────────────────────────────────
# Pre-compiled Miku addressing patterns
# Built once at module load; is_miku_addressed() then runs at most 4 .search() calls per message
# ────────────────────────────────────────────────────────────────────
def _build_name_variants(bases, honorifics, prefixes, connector, prefix_connector):
    """Return regex fragments for every name+honorific+prefix combo
    within a single script family.

    Args:
        bases: Literal base names; each one is passed through re.escape().
        honorifics: Literal suffixes that may follow a base; re.escape()d.
        prefixes: Literal prefixes that may precede a base; re.escape()d.
        connector: Regex fragment (used verbatim, NOT escaped) placed
            between a base and an honorific.
        prefix_connector: Regex fragment (used verbatim, NOT escaped)
            placed between a prefix and a base.

    Returns:
        A list of unique regex fragments in first-seen order.  For each
        base: the bare base, then base+honorific forms, then for each
        prefix the prefixed base followed by prefixed base+honorific forms.

    Empty strings in ``bases``, ``honorifics`` or ``prefixes`` are ignored.
    An empty honorific would otherwise emit a dangling ``base + connector``
    fragment, an empty prefix would re-emit every un-prefixed form, and an
    empty base would emit a fragment that matches the empty string.
    """
    # Pre-compute the pieces that do not depend on the base.
    honorific_tails = [connector + re.escape(h) for h in honorifics if h]
    prefix_heads = [re.escape(p) + prefix_connector for p in prefixes if p]

    fragments = []
    for base in bases:
        if not base:
            continue
        core = re.escape(base)
        fragments.append(core)
        fragments.extend(core + tail for tail in honorific_tails)
        for head in prefix_heads:
            fragments.append(head + core)
            fragments.extend(head + core + tail for tail in honorific_tails)

    # dict.fromkeys drops repeats while preserving insertion order, so the
    # caller's alternation does not carry redundant branches.
    return list(dict.fromkeys(fragments))
def _compile_addressing_patterns():
    """Build and compile the four regexes used to detect addressing.

    Returns a 4-tuple ``(start, end, middle, alone)`` of compiled,
    case-insensitive patterns:

    START   name at the beginning, followed by punctuation
            "Miku, how are you?"  "みく!聞いて"
    END     comma, then the name at the end
            "how are you, Miku?"  "教えて、ミク"
    MIDDLE  name with a comma on both sides (vocative)
            "On the contrary, Miku, I think…"
    ALONE   the name is the entire message
            "Miku"  "みく!"  "ミクちゃん"
    """
    # One keyword-argument set per script family.  Keeping the families
    # separate means prefixes/honorifics are only combined with bases of
    # the same script.
    # NOTE(review): the empty-string entries in the Japanese lists and the
    # repeated characters in the punctuation classes below may be characters
    # lost in transcription — confirm against the repository copy.
    script_families = (
        dict(  # Latin
            bases=['miku'],
            honorifics=[
                'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
                'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei',
                'senpai', 'jou',
            ],
            prefixes=['o-'],
            connector=r'[\-\s]?',
            prefix_connector=r'\s?',
        ),
        dict(  # Cyrillic
            bases=['мику'],
            honorifics=[
                'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин',
                'хейка', 'хеика', 'денка', 'какка', 'си', 'чама', 'кюн',
                'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо',
            ],
            prefixes=['о-'],
            connector=r'[\-\s]?',
            prefix_connector=r'\s?',
        ),
        dict(  # Japanese
            bases=['みく', 'ミク', '未来'],
            honorifics=[
                # Hiragana
                'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん',
                'へいか', 'でんか', 'かっか', '', 'ちゃま', 'きゅん', 'どの',
                'せんせい', 'せんぱい', 'じょう',
                # Katakana
                'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン',
                'ヘイカ', 'デンカ', 'カッカ', '', 'チャマ', 'キュン', 'ドノ',
                'センセイ', 'センパイ', 'ジョウ',
            ],
            prefixes=['', ''],
            connector=r'[-]?',
            prefix_connector=r'',
        ),
    )

    name_fragments = []
    for family in script_families:
        name_fragments.extend(_build_name_variants(**family))
    # Longest fragment first so the alternation tries the most specific
    # form before its shorter prefixes (the sort is stable).
    name_fragments.sort(key=len, reverse=True)

    # The name must sit on word boundaries so it cannot match inside a
    # longer word.
    name = r'\b(?:' + '|'.join(name_fragments) + r')\b'
    after_name = r'[,,、:!?.。]'       # addressing punctuation after the name
    comma = r'[,,、]'                  # comma variants (before name / vocative)
    end_tail = r'[!?.。~]*'            # optional trailing marks at message end
    alone_tail = r'[!?.。~~♪♡❤]*'      # optional trailing marks, name-only message

    pattern_sources = (
        r'^\s*' + name + r'\s*' + after_name,                      # START
        comma + r'\s*' + name + r'\s*' + end_tail + r'\s*$',       # END
        comma + r'\s*' + name + r'\s*' + comma,                    # MIDDLE
        r'^\s*' + name + r'\s*' + alone_tail + r'\s*$',            # ALONE
    )
    compiled = tuple(re.compile(src, re.IGNORECASE) for src in pattern_sources)

    logger.info(f"Miku addressing: {len(name_fragments)} name variants compiled into 4 patterns")
    return compiled
# Compile once at import time.  The four globals start out as None and are
# only overwritten when compilation succeeds, so a failure here leaves the
# module importable with the patterns marked as unavailable.
_START_RE = _END_RE = _MIDDLE_RE = _ALONE_RE = None
try:
    _START_RE, _END_RE, _MIDDLE_RE, _ALONE_RE = _compile_addressing_patterns()
except Exception as e:
    logger.error(f"Failed to compile addressing patterns: {e}")
# ────────────────────────────────────────────────────────────────────
async def is_miku_addressed(message) -> bool:
    """Return True only when the message is directed *at* Miku,
    not merely mentioning her.

    Always responds to: DMs, @mentions, replies to Miku's messages.
    For normal messages checks whether Miku's name (in any supported
    script / honorific combination) appears in an "addressing" position:
        • Start   "Miku, how are you?"
        • End     "how are you, Miku?"
        • Middle  "On the contrary, Miku, I think…"
        • Alone   "Miku!" / "ミクちゃん"
    Does NOT trigger on mere mentions:
        "I like Miku" / "Miku is cool" / "told miku about it"

    Args:
        message: Incoming message object.  This function reads its
            ``guild`` (and ``guild.me``), ``mentions``, ``reference``,
            ``channel``, ``author`` and ``content`` attributes.

    Returns:
        True when the bot should treat the message as addressed to it,
        False otherwise (including when the guild data is unusable or the
        addressing patterns failed to compile).
    """
    guild = message.guild

    # DMs always respond
    if guild is None:
        return True

    if not guild or not guild.me:
        logger.warning(f"Invalid guild/guild.me for message from {message.author}")
        return False
    me = guild.me

    # @mention
    if me in message.mentions:
        return True

    # Reply to Miku
    reference = message.reference
    if reference:
        # Prefer the referenced message already attached to the reference:
        # it answers "who wrote the replied-to message?" without an extra
        # API round trip on every reply.
        resolved = getattr(reference, "resolved", None)
        replied_author = getattr(resolved, "author", None)
        if replied_author is not None:
            if replied_author == me:
                return True
        else:
            # Nothing usable was resolved (attribute missing, None, or an
            # object without an author) — fall back to fetching.  This is
            # best-effort: a failed fetch is logged and the name-based
            # checks below still run.
            try:
                ref = await message.channel.fetch_message(reference.message_id)
                if ref.author == me:
                    return True
            except Exception as e:
                logger.warning(f"Could not fetch referenced message: {e}")

    # Regex addressing (4 pre-compiled patterns)
    if _START_RE is None:
        logger.error("Addressing patterns not compiled skipping pattern check")
        return False

    text = message.content.strip()
    return bool(
        _START_RE.search(text)
        or _END_RE.search(text)
        or _MIDDLE_RE.search(text)
        or _ALONE_RE.search(text)
    )