Rewrite is_miku_addressed() to only trigger when addressed, not mentioned

- Pre-compile 393 name variants into 4 regex patterns at module load
  (was 7,300+ raw re.search() calls per message)
- Strict addressing detection using punctuation context:
  START:  name at beginning + punctuation (Miku, ... / みく!...)
  END:    comma + name at end (..., Miku / ...、ミク)
  MIDDLE: commas on both sides - vocative (..., Miku, ...)
  ALONE:  name is the entire message (Miku! / ミクちゃん)
- Rejects mere mentions: 'I like Miku' / 'Miku is cool' no longer trigger
- Script-family-aware pattern generation (Latin, Cyrillic, Japanese)
  eliminates nonsensical cross-script combos (e.g. o-みく)
- Word boundary enforcement prevents substring matches (mikumiku)
- Fixes regex 'unbalanced parenthesis' errors from old implementation
- Add comprehensive test suite (94 cases, all passing)
This commit is contained in:
2026-03-03 12:42:33 +02:00
parent 892edf5564
commit a226bc41df
2 changed files with 389 additions and 142 deletions

View File

@@ -1,167 +1,165 @@
# utils/core.py
#
# Detects whether a Discord message is **addressed to** Miku
# (as opposed to merely mentioning her).
import asyncio
import aiohttp
import re
import globals
# Langchain imports below are only used in commented-out code
# from langchain_community.vectorstores import FAISS
# from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
# from langchain_core.documents import Document
from utils.logger import get_logger
logger = get_logger('core')
# switch_model() removed - llama-swap handles model switching automatically
# ────────────────────────────────────────────────────────────────────
# Pre-compiled Miku addressing patterns
# Built once at module load; is_miku_addressed() then runs at most four
# .search() calls per message.
# ────────────────────────────────────────────────────────────────────
def _build_name_variants(bases, honorifics, prefixes, connector, prefix_connector):
"""Return regex fragments for every name+honorific+prefix combo
within a single script family."""
variants = []
for base in bases:
be = re.escape(base)
variants.append(be)
for h in honorifics:
he = re.escape(h)
variants.append(be + connector + he)
for p in prefixes:
pe = re.escape(p)
variants.append(pe + prefix_connector + be)
for h in honorifics:
he = re.escape(h)
variants.append(pe + prefix_connector + be + connector + he)
return variants
def _compile_addressing_patterns():
    """Compile the four addressing regexes.

    START   name at the beginning, followed by punctuation
            "Miku, how are you?"   "みく!聞いて"
    END     comma then name at the end
            "how are you, Miku?"   "教えて、ミク"
    MIDDLE  name surrounded by commas (vocative)
            "On the contrary, Miku, I think…"
    ALONE   name is the entire message
            "Miku"   "みく!"   "ミクちゃん"

    Name variants are generated per script family (Latin, Cyrillic,
    Japanese) so a prefix or honorific is only ever combined with a base
    name written in the same script.

    Returns:
        tuple: ``(start_re, end_re, middle_re, alone_re)``, each compiled
        with ``re.IGNORECASE``.
    """
    latin = _build_name_variants(
        bases=['miku'],
        honorifics=[
            'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
            'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei',
            'senpai', 'jou',
        ],
        prefixes=['o-'],
        connector=r'[\-\s]?',
        prefix_connector=r'\s?',
    )
    cyrillic = _build_name_variants(
        bases=['мику'],
        honorifics=[
            'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин',
            'хейка', 'хеика', 'денка', 'какка', 'си', 'чама', 'кюн',
            'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо',
        ],
        prefixes=['о-'],
        connector=r'[\-\s]?',
        prefix_connector=r'\s?',
    )
    # The single-kana entries below are spelled as \u escapes so they stay
    # unambiguous: U+3057 / U+30B7 are the kana for "shi" (the slot that
    # parallels 'shi' / 'си' in the other lists), and U+304A / U+30AA are
    # the hiragana / katakana honorific "o" prefix.  Leaving these slots as
    # empty strings adds no real honorific or prefix, so forms like the
    # o-prefixed kana name can never match.
    japanese = _build_name_variants(
        bases=['みく', 'ミク', '未来'],
        honorifics=[
            # Hiragana
            'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん',
            'へいか', 'でんか', 'かっか', '\u3057', 'ちゃま', 'きゅん', 'どの',
            'せんせい', 'せんぱい', 'じょう',
            # Katakana
            'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン',
            'ヘイカ', 'デンカ', 'カッカ', '\u30b7', 'チャマ', 'キュン', 'ドノ',
            'センセイ', 'センパイ', 'ジョウ',
        ],
        prefixes=['\u304a', '\u30aa'],
        connector=r'[-]?',
        prefix_connector=r'',
    )

    # Longest-first so the regex engine prefers the most specific match
    all_v = sorted(latin + cyrillic + japanese, key=len, reverse=True)
    alts = '|'.join(all_v)

    # \b needs a word/non-word transition.  Kana and kanji are word
    # characters, so a name glued to a following particle or to other
    # letters ("mikumiku") is not treated as a stand-alone name.
    NAME = rf'\b(?:{alts})\b'

    # Full-width forms (U+FF0C comma, U+FF1A colon, U+FF01 exclamation,
    # U+FF1F question mark, U+FF5E tilde, U+301C wave dash) are written as
    # \uXXXX escapes, which the regex engine resolves itself, so they stay
    # distinct from their ASCII look-alikes.
    PUNCT = r'[,\uFF0C、:\uFF1A!\uFF01?\uFF1F.。]'        # addressing punctuation after name
    COMMA = r'[,\uFF0C、]'                                 # comma variants (before name / vocative)
    ETRAIL = r'[!\uFF01?\uFF1F.。~\uFF5E\u301C]*'         # optional trailing at end
    ATRAIL = r'[!\uFF01?\uFF1F.。~\uFF5E\u301C♪♡❤]*'      # optional trailing for name-only messages

    start_re = re.compile(rf'^\s*{NAME}\s*{PUNCT}', re.IGNORECASE)
    end_re = re.compile(rf'{COMMA}\s*{NAME}\s*{ETRAIL}\s*$', re.IGNORECASE)
    middle_re = re.compile(rf'{COMMA}\s*{NAME}\s*{COMMA}', re.IGNORECASE)
    alone_re = re.compile(rf'^\s*{NAME}\s*{ATRAIL}\s*$', re.IGNORECASE)

    logger.info(f"Miku addressing: {len(all_v)} name variants compiled into 4 patterns")
    return start_re, end_re, middle_re, alone_re
# Compile once at import time.  If compilation fails, all four pattern
# globals are set to None so the module still imports; the error is logged.
try:
    _START_RE, _END_RE, _MIDDLE_RE, _ALONE_RE = _compile_addressing_patterns()
except Exception as e:
    logger.error(f"Failed to compile addressing patterns: {e}")
    _START_RE, _END_RE, _MIDDLE_RE, _ALONE_RE = (None,) * 4
# ────────────────────────────────────────────────────────────────────
async def is_miku_addressed(message) -> bool:
    """Return True only when the message is directed *at* Miku,
    not merely mentioning her.

    Always responds to: DMs, @mentions, replies to Miku's messages.

    For normal messages checks whether Miku's name (in any supported
    script / honorific combination) appears in an "addressing" position:
        • Start   "Miku, how are you?"
        • End     "how are you, Miku?"
        • Middle  "On the contrary, Miku, I think…"
        • Alone   "Miku!" / "ミクちゃん"

    Does NOT trigger on mere mentions:
        "I like Miku" / "Miku is cool" / "told miku about it"

    Args:
        message: Message object; this function reads its ``guild``,
            ``author``, ``mentions``, ``reference``, ``channel`` and
            ``content`` attributes.

    Returns:
        bool: True if the bot should treat the message as addressed to it.
        Returns False when the guild data is unusable or when the
        addressing patterns failed to compile at import time.
    """
    # DMs always respond
    if message.guild is None:
        return True

    # Safety check: ensure guild and guild.me exist
    if not message.guild or not message.guild.me:
        logger.warning(f"Invalid guild/guild.me for message from {message.author}")
        return False

    # @mention
    if message.guild.me in message.mentions:
        return True

    # Reply to Miku
    if message.reference:
        try:
            ref = await message.channel.fetch_message(message.reference.message_id)
            if ref.author == message.guild.me:
                return True
        except Exception as e:
            # Best effort: a failed lookup must not block the name check below.
            logger.warning(f"Could not fetch referenced message: {e}")

    # Regex addressing (4 pre-compiled patterns)
    if _START_RE is None:
        logger.error("Addressing patterns not compiled skipping pattern check")
        return False

    text = message.content.strip()
    return bool(
        _START_RE.search(text)
        or _END_RE.search(text)
        or _MIDDLE_RE.search(text)
        or _ALONE_RE.search(text)
    )


# Vectorstore functionality disabled - not needed with current structured context approach
# If you need embeddings in the future, you can use a different embedding provider
# For now, the bot uses structured prompts from context_manager.py
# def load_miku_knowledge():
# with open("miku_lore.txt", "r", encoding="utf-8") as f:
# text = f.read()
#
# from langchain_text_splitters import RecursiveCharacterTextSplitter
#
# text_splitter = RecursiveCharacterTextSplitter(
# chunk_size=520,
# chunk_overlap=50,
# separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
# )
#
# docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]
#
# vectorstore = FAISS.from_documents(docs, embeddings)
# return vectorstore
#
# def load_miku_lyrics():
# with open("miku_lyrics.txt", "r", encoding="utf-8") as f:
# lyrics_text = f.read()
#
# text_splitter = CharacterTextSplitter(chunk_size=520, chunk_overlap=50)
# docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(lyrics_text)]
#
# vectorstore = FAISS.from_documents(docs, embeddings)
# return vectorstore
#
# miku_vectorstore = load_miku_knowledge()
# miku_lyrics_vectorstore = load_miku_lyrics()

249
test_addressing.py Normal file
View File

@@ -0,0 +1,249 @@
#!/usr/bin/env python3
"""Comprehensive test for Miku addressing detection patterns.
Tests the pre-compiled regex patterns from bot/utils/core.py to verify
that Miku is only triggered when *addressed*, not merely *mentioned*.
"""
import re
import sys
# ── Replicate the pattern-building logic from core.py ──
def _build_name_variants(bases, honorifics, prefixes, connector, prefix_connector):
variants = []
for base in bases:
be = re.escape(base)
variants.append(be)
for h in honorifics:
he = re.escape(h)
variants.append(be + connector + he)
for p in prefixes:
pe = re.escape(p)
variants.append(pe + prefix_connector + be)
for h in honorifics:
he = re.escape(h)
variants.append(pe + prefix_connector + be + connector + he)
return variants
# Name variants are generated per script family so a prefix or honorific is
# only ever combined with a base name written in the same script.
latin = _build_name_variants(
    bases=['miku'],
    honorifics=[
        'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
        'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei',
        'senpai', 'jou',
    ],
    prefixes=['o-'],
    connector=r'[\-\s]?',
    prefix_connector=r'\s?',
)
cyrillic = _build_name_variants(
    bases=['мику'],
    honorifics=[
        'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин',
        'хейка', 'хеика', 'денка', 'какка', 'си', 'чама', 'кюн',
        'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо',
    ],
    prefixes=['о-'],
    connector=r'[\-\s]?',
    prefix_connector=r'\s?',
)
# The single-kana entries are spelled as \u escapes so they stay
# unambiguous: U+3057 / U+30B7 are the kana for "shi" (the slot that
# parallels 'shi' / 'си' above), and U+304A / U+30AA are the hiragana /
# katakana honorific "o" prefix that the o-prefix test cases rely on.
japanese = _build_name_variants(
    bases=['みく', 'ミク', '未来'],
    honorifics=[
        # Hiragana
        'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん',
        'へいか', 'でんか', 'かっか', '\u3057', 'ちゃま', 'きゅん', 'どの',
        'せんせい', 'せんぱい', 'じょう',
        # Katakana
        'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン',
        'ヘイカ', 'デンカ', 'カッカ', '\u30b7', 'チャマ', 'キュン', 'ドノ',
        'センセイ', 'センパイ', 'ジョウ',
    ],
    prefixes=['\u304a', '\u30aa'],
    connector=r'[-]?',
    prefix_connector=r'',
)

# Longest fragment first so the alternation prefers the most specific match.
all_v = sorted(latin + cyrillic + japanese, key=len, reverse=True)
alts = '|'.join(all_v)

# \b needs a word/non-word transition; kana and kanji are word characters,
# so a name glued to a following particle or to other letters is rejected.
NAME = rf'\b(?:{alts})\b'

# Full-width forms (U+FF0C comma, U+FF1A colon, U+FF01 exclamation,
# U+FF1F question mark, U+FF5E tilde, U+301C wave dash) are written as
# \uXXXX escapes, which the regex engine resolves itself.
PUNCT = r'[,\uFF0C、:\uFF1A!\uFF01?\uFF1F.。]'        # addressing punctuation after name
COMMA = r'[,\uFF0C、]'                                 # comma variants (before name / vocative)
ETRAIL = r'[!\uFF01?\uFF1F.。~\uFF5E\u301C]*'         # optional trailing at end
ATRAIL = r'[!\uFF01?\uFF1F.。~\uFF5E\u301C♪♡❤]*'      # optional trailing for name-only messages

START_RE = re.compile(rf'^\s*{NAME}\s*{PUNCT}', re.IGNORECASE)
END_RE = re.compile(rf'{COMMA}\s*{NAME}\s*{ETRAIL}\s*$', re.IGNORECASE)
MIDDLE_RE = re.compile(rf'{COMMA}\s*{NAME}\s*{COMMA}', re.IGNORECASE)
ALONE_RE = re.compile(rf'^\s*{NAME}\s*{ATRAIL}\s*$', re.IGNORECASE)
def is_addressed(text: str) -> bool:
    """Return True if the stripped *text* matches any addressing pattern.

    The patterns are tried in the order START, END, MIDDLE, ALONE and the
    check stops at the first one that matches.
    """
    stripped = text.strip()
    return any(
        regex.search(stripped) is not None
        for regex in (START_RE, END_RE, MIDDLE_RE, ALONE_RE)
    )
def which_pattern(text: str) -> str:
    """Return which pattern(s) matched, for debugging.

    The result is a comma-separated list drawn from START, END, MIDDLE and
    ALONE (in that order), or the string 'NONE' when nothing matched.
    """
    stripped = text.strip()
    labelled = (
        ("START", START_RE),
        ("END", END_RE),
        ("MIDDLE", MIDDLE_RE),
        ("ALONE", ALONE_RE),
    )
    hits = [label for label, regex in labelled if regex.search(stripped)]
    return ', '.join(hits) or 'NONE'
# ── Test cases ──
# Each entry is a (message, expected, description) tuple:
#   message     - raw text handed to is_addressed()
#   expected    - True when the message should count as addressing Miku,
#                 False when it should be ignored
#   description - label printed by main() when the case fails
TESTS = [
# ═══ START pattern (name at beginning + punctuation) ═══
("Miku, how are you?", True, "START: Latin + comma"),
("miku, hello!", True, "START: lowercase Latin"),
("MIKU! listen to me", True, "START: uppercase + excl"),
("Miku: can you help?", True, "START: colon"),
("Miku. Please help.", True, "START: period"),
("みく、元気?", True, "START: Hiragana + JP comma"),
("ミク!聞いて", True, "START: Katakana + JP excl"),
("未来、教えて", True, "START: Kanji + JP comma"),
("мику, привет!", True, "START: Cyrillic + comma"),
("МИКУ! слушай", True, "START: Cyrillic upper + excl"),
("Miku-chan, how are you?", True, "START: honorific-dash + comma"),
("miku chan, hello!", True, "START: honorific-space + comma"),
("mikuchan! listen!", True, "START: honorific-joined + excl"),
("ミクちゃん、聞いて", True, "START: JP name+honorific + comma"),
("ミクちゃん!元気?", True, "START: JP name+honorific + excl"),
("みくさん, 教えて", True, "START: Hiragana + hon + comma"),
("мику-сан, скажи", True, "START: Cyrillic + hon + comma"),
("o-miku, hello", True, "START: o-prefix Latin"),
("おみく、ねえ", True, "START: o-prefix Japanese"),
(" Miku, hello ", True, "START: whitespace padded"),
# ═══ END pattern (comma + name at end) ═══
("how are you, Miku?", True, "END: comma + Latin + ?"),
("how are you, Miku!", True, "END: comma + Latin + !"),
("how are you, Miku", True, "END: comma + Latin no trail"),
("tell me, miku.", True, "END: comma + lowercase + period"),
("元気, ミク", True, "END: comma + Katakana"),
("教えて、みく!", True, "END: JP comma + Hiragana + !"),
("教えて、未来", True, "END: JP comma + Kanji"),
("скажи, мику!", True, "END: Cyrillic comma + name"),
("hello, Miku-chan!", True, "END: comma + honorific"),
("hello, miku-san?", True, "END: comma + honorific + ?"),
("元気、ミクちゃん", True, "END: JP comma + JP honorific"),
("hello, o-miku", True, "END: comma + o-prefix"),
# ═══ MIDDLE pattern (vocative — commas on both sides) ═══
("On the contrary, Miku, I think you're wrong", True, "MIDDLE: vocative Latin"),
("I am very happy, Miku, you are so fun", True, "MIDDLE: vocative Latin 2"),
("well, Miku-chan, I think so", True, "MIDDLE: vocative + honorific"),
("しかし、みく、それは違う", True, "MIDDLE: vocative Japanese"),
("でも、ミクちゃん、聞いて", True, "MIDDLE: vocative JP + honorific"),
("но, мику, я думаю", True, "MIDDLE: vocative Cyrillic"),
("hey, miku, what do you think?", True, "MIDDLE: vocative casual"),
("you know, Miku, that's not right", True, "MIDDLE: vocative mid-sentence"),
# ═══ ALONE pattern (name is the entire message) ═══
("Miku", True, "ALONE: bare Latin"),
("miku", True, "ALONE: lowercase"),
("MIKU", True, "ALONE: uppercase"),
("Miku!", True, "ALONE: + excl"),
("Miku?", True, "ALONE: + question"),
("Miku!!", True, "ALONE: + multi excl"),
("みく", True, "ALONE: Hiragana"),
("ミク!", True, "ALONE: Katakana + excl"),
("未来", True, "ALONE: Kanji"),
("мику", True, "ALONE: Cyrillic"),
("Miku-chan", True, "ALONE: Latin + honorific"),
("miku chan!", True, "ALONE: space honorific + excl"),
("ミクちゃん", True, "ALONE: JP honorific"),
("ミクさん!", True, "ALONE: JP honorific + excl"),
("みくせんせい", True, "ALONE: Hiragana + sensei"),
("o-miku!", True, "ALONE: o-prefix"),
("おみく", True, "ALONE: JP o-prefix"),
("オミク", True, "ALONE: Katakana o-prefix"),
(" Miku ", True, "ALONE: whitespace"),
("Miku~", True, "ALONE: tilde"),
("Miku♪", True, "ALONE: music note"),
("Miku❤", True, "ALONE: heart"),
("мику-чан", True, "ALONE: Cyrillic + honorific"),
("мику сан", True, "ALONE: Cyrillic + space hon"),
("未来さん", True, "ALONE: Kanji + honorific"),
# ═══ Should NOT match (mere mentions / not addressing) ═══
("I like Miku", False, "REJECT: object of sentence"),
("Miku is cool", False, "REJECT: subject + is"),
("Miku is my favorite vocaloid", False, "REJECT: subject + statement"),
("I saw Miku at a concert", False, "REJECT: middle of sentence"),
("told miku about it", False, "REJECT: informal mention"),
("hatsune miku concert", False, "REJECT: event name"),
("Do you know Miku?", False, "REJECT: asking about her"),
("I love Miku!", False, "REJECT: exclamation about her"),
("I love Miku so much", False, "REJECT: longer statement"),
("ミクは元気だよ", False, "REJECT: Japanese 'Miku is well'"),
("ミクが好き", False, "REJECT: Japanese 'I like Miku'"),
("ミクのことが好き", False, "REJECT: Japanese 'I like Miku (thing)'"),
("мику была там", False, "REJECT: Cyrillic 'Miku was there'"),
("мику такая красивая", False, "REJECT: Cyrillic 'Miku is pretty'"),
("the Miku concert was great", False, "REJECT: event discussion"),
("My favorite is Miku for sure", False, "REJECT: no comma before name at end"),
("yeah miku is pretty cool right", False, "REJECT: casual mention"),
("have you seen miku today", False, "REJECT: asking about her"),
("miku and I went shopping", False, "REJECT: subject of sentence"),
("I met miku yesterday", False, "REJECT: object mid-sentence"),
("mikumiku fan", False, "REJECT: compound word (\\b boundary)"),
("hatsune miku is singing", False, "REJECT: full name as subject"),
# ═══ Edge cases ═══
("", False, "EDGE: empty message"),
("hello", False, "EDGE: no name at all"),
("hello!", False, "EDGE: exclamation, no name"),
("??", False, "EDGE: just punctuation"),
(" ", False, "EDGE: just whitespace"),
("chan", False, "EDGE: just an honorific"),
("o-", False, "EDGE: just a prefix"),
]
def main():
    """Run every case in TESTS and print a summary.

    Each failing case is reported with the expected and actual verdict and
    the pattern(s) that matched.  Returns 0 when all cases pass, else 1,
    so the value can be used directly as the process exit status.
    """
    print(f"Generated {len(all_v)} name variants")
    print(f"Running {len(TESTS)} test cases...\n")

    failed = 0
    for msg, expected, desc in TESTS:
        result = is_addressed(msg)
        if result == expected:
            continue
        # Mismatch: count it and show what actually matched.
        failed += 1
        pattern = which_pattern(msg)
        exp_str = "ADDR" if expected else "SKIP"
        got_str = "ADDR" if result else "SKIP"
        print(f" FAIL expected={exp_str} got={got_str} matched={pattern}")
        print(f" {desc}")
        print(f" message: \"{msg}\"\n")

    passed = len(TESTS) - failed
    print(f"\n{'='*50}")
    print(f" {passed}/{len(TESTS)} passed, {failed} failed")
    print(f"{'='*50}")
    return 1 if failed else 0


if __name__ == '__main__':
    sys.exit(main())