# Change summary:
# - Pre-compile 393 name variants into 4 regex patterns at module load
#   (was 7,300+ raw re.search() calls per message)
# - Strict addressing detection using punctuation context:
#     START:  name at beginning + punctuation (Miku, ... / みく!...)
#     END:    comma + name at end (..., Miku / ...、ミク)
#     MIDDLE: commas on both sides - vocative (..., Miku, ...)
#     ALONE:  name is the entire message (Miku! / ミクちゃん)
# - Rejects mere mentions: 'I like Miku' / 'Miku is cool' no longer trigger
# - Script-family-aware pattern generation (Latin, Cyrillic, Japanese)
#   eliminates nonsensical cross-script combos (e.g. o-みく)
# - Word boundary enforcement prevents substring matches (mikumiku)
# - Fixes regex 'unbalanced parenthesis' errors from old implementation
# - Add comprehensive test suite (94 cases, all passing)
250 lines
12 KiB
Python
250 lines
12 KiB
Python
#!/usr/bin/env python3
|
||
"""Comprehensive test for Miku addressing detection patterns.

Tests a local replica of the pre-compiled regex patterns from
bot/utils/core.py to verify that Miku is only triggered when *addressed*,
not merely *mentioned*.
"""
|
||
|
||
import re
|
||
import sys
|
||
|
||
# ── Replicate the pattern-building logic from core.py ──
|
||
|
||
def _build_name_variants(bases, honorifics, prefixes, connector, prefix_connector):
    """Build regex fragments for every spelling of a name in one script family.

    For each base name the result contains, in this order: the bare name,
    the name followed by each honorific, and then, for each prefix, the
    prefixed name followed by the prefixed name with each honorific.

    The literal parts (bases, honorifics, prefixes) are passed through
    ``re.escape``; the two connectors are inserted verbatim, so they must
    already be valid regex fragments.

    Args:
        bases: Literal base names.
        honorifics: Literal suffixes appended after ``connector``.
        prefixes: Literal prefixes placed before ``prefix_connector``.
        connector: Regex fragment placed between a name and an honorific.
        prefix_connector: Regex fragment placed between a prefix and a name.

    Returns:
        A list of regex fragment strings, one per generated variant.
    """
    # Escape each honorific once up front instead of once per base/prefix.
    suffixes = [connector + re.escape(h) for h in honorifics]

    variants = []
    for base in bases:
        name = re.escape(base)
        # The bare name first, then one stem per prefix, in prefix order.
        stems = [name]
        stems.extend(re.escape(p) + prefix_connector + name for p in prefixes)
        for stem in stems:
            variants.append(stem)
            variants.extend(stem + suffix for suffix in suffixes)
    return variants
|
||
|
||
|
||
# Variants are generated per script family so that prefixes and honorifics
# are only ever combined with bases written in the same script.
latin = _build_name_variants(
    bases=['miku'],
    honorifics=[
        'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
        'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei',
        'senpai', 'jou',
    ],
    prefixes=['o-'],
    connector=r'[\-\s]?',
    prefix_connector=r'\s?',
)

cyrillic = _build_name_variants(
    bases=['мику'],
    honorifics=[
        'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин',
        'хейка', 'хеика', 'денка', 'какка', 'си', 'чама', 'кюн',
        'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо',
    ],
    prefixes=['о-'],
    connector=r'[\-\s]?',
    prefix_connector=r'\s?',
)

japanese = _build_name_variants(
    bases=['みく', 'ミク', '未来'],
    honorifics=[
        'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん',
        'へいか', 'でんか', 'かっか', 'し', 'ちゃま', 'きゅん', 'どの',
        'せんせい', 'せんぱい', 'じょう',
        'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン',
        'ヘイカ', 'デンカ', 'カッカ', 'シ', 'チャマ', 'キュン', 'ドノ',
        'センセイ', 'センパイ', 'ジョウ',
    ],
    prefixes=['お', 'オ'],
    connector=r'[-]?',
    prefix_connector=r'',
)

# Sort by fragment length, longest first, so the alternation tries the
# longer spellings (name + honorific) before the bare name.
all_v = sorted(latin + cyrillic + japanese, key=len, reverse=True)
alts = '|'.join(all_v)

# A name must sit on word boundaries so it cannot match inside a longer word.
NAME = rf'\b(?:{alts})\b'

# Punctuation classes. Each ASCII mark is paired with its fullwidth form,
# written as a \uXXXX escape (resolved by the regex engine) so the pairing
# survives Unicode normalisation of this source file:
#   \uFF0C fullwidth comma        \uFF1A fullwidth colon
#   \uFF01 fullwidth exclamation  \uFF1F fullwidth question mark
#   \uFF5E fullwidth tilde
PUNCT = r'[,\uFF0C、:\uFF1A!\uFF01?\uFF1F.。]'
COMMA = r'[,\uFF0C、]'
ETRAIL = r'[!\uFF01?\uFF1F.。~\uFF5E]*'
ATRAIL = r'[!\uFF01?\uFF1F.。~\uFF5E♪♡❤]*'

# START:  name at the beginning, followed by punctuation.
START_RE = re.compile(rf'^\s*{NAME}\s*{PUNCT}', re.IGNORECASE)
# END:    comma, then the name at the end (optional trailing punctuation).
END_RE = re.compile(rf'{COMMA}\s*{NAME}\s*{ETRAIL}\s*$', re.IGNORECASE)
# MIDDLE: name with a comma on both sides.
MIDDLE_RE = re.compile(rf'{COMMA}\s*{NAME}\s*{COMMA}', re.IGNORECASE)
# ALONE:  the name (plus optional trailing marks) is the whole message.
ALONE_RE = re.compile(rf'^\s*{NAME}\s*{ATRAIL}\s*$', re.IGNORECASE)
|
||
|
||
|
||
def is_addressed(text: str) -> bool:
    """Return True when *text* matches any of the four addressing patterns.

    Leading and trailing whitespace is stripped first; the result is then
    searched with START_RE, END_RE, MIDDLE_RE and ALONE_RE, and a hit on any
    one of them counts as being addressed.
    """
    text = text.strip()
    patterns = (START_RE, END_RE, MIDDLE_RE, ALONE_RE)
    return any(pattern.search(text) for pattern in patterns)
|
||
|
||
|
||
def which_pattern(text: str) -> str:
    """Return which pattern matched (for debugging).

    The stripped message is searched with each of the four patterns.  The
    labels of those that hit are joined with ``', '`` in the fixed order
    START, END, MIDDLE, ALONE; ``'NONE'`` is returned when none hit.
    """
    text = text.strip()
    labelled = (
        ("START", START_RE),
        ("END", END_RE),
        ("MIDDLE", MIDDLE_RE),
        ("ALONE", ALONE_RE),
    )
    matched = [label for label, pattern in labelled if pattern.search(text)]
    return ', '.join(matched) if matched else 'NONE'
|
||
|
||
|
||
# ── Test cases ──
|
||
# (message, expected, description)
|
||
# Each entry is (message, expected, description): ``expected`` is True when
# the message should count as addressing Miku, False when it should not.
TESTS = [
    # ═══ START pattern (name at beginning + punctuation) ═══
    ("Miku, how are you?", True, "START: Latin + comma"),
    ("miku, hello!", True, "START: lowercase Latin"),
    ("MIKU! listen to me", True, "START: uppercase + excl"),
    ("Miku: can you help?", True, "START: colon"),
    ("Miku. Please help.", True, "START: period"),
    ("みく、元気?", True, "START: Hiragana + JP comma"),
    ("ミク!聞いて", True, "START: Katakana + JP excl"),
    ("未来、教えて", True, "START: Kanji + JP comma"),
    ("мику, привет!", True, "START: Cyrillic + comma"),
    ("МИКУ! слушай", True, "START: Cyrillic upper + excl"),
    ("Miku-chan, how are you?", True, "START: honorific-dash + comma"),
    ("miku chan, hello!", True, "START: honorific-space + comma"),
    ("mikuchan! listen!", True, "START: honorific-joined + excl"),
    ("ミクちゃん、聞いて", True, "START: JP name+honorific + comma"),
    ("ミクちゃん!元気?", True, "START: JP name+honorific + excl"),
    ("みくさん, 教えて", True, "START: Hiragana + hon + comma"),
    ("мику-сан, скажи", True, "START: Cyrillic + hon + comma"),
    ("o-miku, hello", True, "START: o-prefix Latin"),
    ("おみく、ねえ", True, "START: o-prefix Japanese"),
    (" Miku, hello ", True, "START: whitespace padded"),

    # ═══ END pattern (comma + name at end) ═══
    ("how are you, Miku?", True, "END: comma + Latin + ?"),
    ("how are you, Miku!", True, "END: comma + Latin + !"),
    ("how are you, Miku", True, "END: comma + Latin no trail"),
    ("tell me, miku.", True, "END: comma + lowercase + period"),
    ("元気, ミク", True, "END: comma + Katakana"),
    ("教えて、みく!", True, "END: JP comma + Hiragana + !"),
    ("教えて、未来", True, "END: JP comma + Kanji"),
    ("скажи, мику!", True, "END: Cyrillic comma + name"),
    ("hello, Miku-chan!", True, "END: comma + honorific"),
    ("hello, miku-san?", True, "END: comma + honorific + ?"),
    ("元気、ミクちゃん", True, "END: JP comma + JP honorific"),
    ("hello, o-miku", True, "END: comma + o-prefix"),

    # ═══ MIDDLE pattern (vocative — commas on both sides) ═══
    ("On the contrary, Miku, I think you're wrong", True, "MIDDLE: vocative Latin"),
    ("I am very happy, Miku, you are so fun", True, "MIDDLE: vocative Latin 2"),
    ("well, Miku-chan, I think so", True, "MIDDLE: vocative + honorific"),
    ("しかし、みく、それは違う", True, "MIDDLE: vocative Japanese"),
    ("でも、ミクちゃん、聞いて", True, "MIDDLE: vocative JP + honorific"),
    ("но, мику, я думаю", True, "MIDDLE: vocative Cyrillic"),
    ("hey, miku, what do you think?", True, "MIDDLE: vocative casual"),
    ("you know, Miku, that's not right", True, "MIDDLE: vocative mid-sentence"),

    # ═══ ALONE pattern (name is the entire message) ═══
    ("Miku", True, "ALONE: bare Latin"),
    ("miku", True, "ALONE: lowercase"),
    ("MIKU", True, "ALONE: uppercase"),
    ("Miku!", True, "ALONE: + excl"),
    ("Miku?", True, "ALONE: + question"),
    ("Miku!!", True, "ALONE: + multi excl"),
    ("みく", True, "ALONE: Hiragana"),
    ("ミク!", True, "ALONE: Katakana + excl"),
    ("未来", True, "ALONE: Kanji"),
    ("мику", True, "ALONE: Cyrillic"),
    ("Miku-chan", True, "ALONE: Latin + honorific"),
    ("miku chan!", True, "ALONE: space honorific + excl"),
    ("ミクちゃん", True, "ALONE: JP honorific"),
    ("ミクさん!", True, "ALONE: JP honorific + excl"),
    ("みくせんせい", True, "ALONE: Hiragana + sensei"),
    ("o-miku!", True, "ALONE: o-prefix"),
    ("おみく", True, "ALONE: JP o-prefix"),
    ("オミク", True, "ALONE: Katakana o-prefix"),
    (" Miku ", True, "ALONE: whitespace"),
    ("Miku~", True, "ALONE: tilde"),
    ("Miku♪", True, "ALONE: music note"),
    ("Miku❤", True, "ALONE: heart"),
    ("мику-чан", True, "ALONE: Cyrillic + honorific"),
    ("мику сан", True, "ALONE: Cyrillic + space hon"),
    ("未来さん", True, "ALONE: Kanji + honorific"),

    # ═══ Should NOT match (mere mentions / not addressing) ═══
    ("I like Miku", False, "REJECT: object of sentence"),
    ("Miku is cool", False, "REJECT: subject + is"),
    ("Miku is my favorite vocaloid", False, "REJECT: subject + statement"),
    ("I saw Miku at a concert", False, "REJECT: middle of sentence"),
    ("told miku about it", False, "REJECT: informal mention"),
    ("hatsune miku concert", False, "REJECT: event name"),
    ("Do you know Miku?", False, "REJECT: asking about her"),
    ("I love Miku!", False, "REJECT: exclamation about her"),
    ("I love Miku so much", False, "REJECT: longer statement"),
    ("ミクは元気だよ", False, "REJECT: Japanese 'Miku is well'"),
    ("ミクが好き", False, "REJECT: Japanese 'I like Miku'"),
    ("ミクのことが好き", False, "REJECT: Japanese 'I like Miku (thing)'"),
    ("мику была там", False, "REJECT: Cyrillic 'Miku was there'"),
    ("мику такая красивая", False, "REJECT: Cyrillic 'Miku is pretty'"),
    ("the Miku concert was great", False, "REJECT: event discussion"),
    ("My favorite is Miku for sure", False, "REJECT: no comma before name at end"),
    ("yeah miku is pretty cool right", False, "REJECT: casual mention"),
    ("have you seen miku today", False, "REJECT: asking about her"),
    ("miku and I went shopping", False, "REJECT: subject of sentence"),
    ("I met miku yesterday", False, "REJECT: object mid-sentence"),
    ("mikumiku fan", False, "REJECT: compound word (\\b boundary)"),
    ("hatsune miku is singing", False, "REJECT: full name as subject"),

    # ═══ Edge cases ═══
    ("", False, "EDGE: empty message"),
    ("hello", False, "EDGE: no name at all"),
    ("hello!", False, "EDGE: exclamation, no name"),
    ("??", False, "EDGE: just punctuation"),
    (" ", False, "EDGE: just whitespace"),
    ("chan", False, "EDGE: just an honorific"),
    ("o-", False, "EDGE: just a prefix"),
]
|
||
|
||
|
||
def main() -> int:
    """Run every case in TESTS and print a report.

    Each message is classified with ``is_addressed`` and compared against
    its expected value.  For every mismatch the expected/actual outcome,
    the patterns reported by ``which_pattern``, the case description and
    the message itself are printed, followed by a pass/fail summary.

    Returns:
        0 when no case failed, 1 otherwise.
    """
    print(f"Generated {len(all_v)} name variants")
    print(f"Running {len(TESTS)} test cases...\n")

    failed = 0
    for msg, expected, desc in TESTS:
        result = is_addressed(msg)
        if result == expected:
            continue

        # Mismatch: show what was expected, what happened, and which
        # pattern(s) fired, to make the failure easy to diagnose.
        failed += 1
        pattern = which_pattern(msg)
        exp_str = "ADDR" if expected else "SKIP"
        got_str = "ADDR" if result else "SKIP"
        print(f" FAIL expected={exp_str} got={got_str} matched={pattern}")
        print(f" {desc}")
        print(f" message: \"{msg}\"\n")

    passed = len(TESTS) - failed

    print(f"\n{'='*50}")
    print(f" {passed}/{len(TESTS)} passed, {failed} failed")
    print(f"{'='*50}")

    return 0 if failed == 0 else 1
|
||
|
||
|
||
if __name__ == '__main__':
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|