miku-discord/test_addressing.py

#!/usr/bin/env python3
"""Comprehensive test for Miku addressing detection patterns.

Exercises a local replica of the pre-compiled regex patterns from
bot/utils/core.py (the pattern-building logic is copied into this file,
not imported) to verify that Miku is only triggered when *addressed*,
not merely *mentioned*.
"""

import re
import sys

# ── Replicate the pattern-building logic from core.py ──

def _build_name_variants(bases, honorifics, prefixes, connector, prefix_connector):
    """Expand base names into a flat list of regex fragments.

    For every base name the result contains, in this order:

    1. the bare name,
    2. the name followed by each honorific,
    3. for each prefix: the prefixed name, then the prefixed name followed
       by each honorific.

    Args:
        bases: Literal base names; each is passed through ``re.escape``.
        honorifics: Literal suffixes; each is passed through ``re.escape``.
        prefixes: Literal prefixes; each is passed through ``re.escape``.
        connector: Regex source inserted verbatim between name and honorific.
        prefix_connector: Regex source inserted verbatim between prefix
            and name.

    Returns:
        A list of regex source strings, one per variant.
    """
    variants = []
    for base in bases:
        stem = re.escape(base)

        # Bare name, then name + honorific.
        variants.append(stem)
        variants.extend(
            stem + connector + re.escape(suffix) for suffix in honorifics
        )

        # Same two shapes again behind each prefix.
        for prefix in prefixes:
            prefixed_stem = re.escape(prefix) + prefix_connector + stem
            variants.append(prefixed_stem)
            variants.extend(
                prefixed_stem + connector + re.escape(suffix)
                for suffix in honorifics
            )
    return variants


# Connector settings shared by the Latin and Cyrillic tables: an optional
# hyphen or whitespace character before the honorific, and optional
# whitespace after the prefix.
_SPACED_CONNECTORS = {
    'connector': r'[\-\s]?',
    'prefix_connector': r'\s?',
}

# ── Latin script ──
_latin_honorifics = [
    'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
    'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei',
    'senpai', 'jou',
]
latin = _build_name_variants(
    bases=['miku'],
    honorifics=_latin_honorifics,
    prefixes=['o-'],
    **_SPACED_CONNECTORS,
)

# ── Cyrillic script ──
_cyrillic_honorifics = [
    'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин',
    'хейка', 'хеика', 'денка', 'какка', 'си', 'чама', 'кюн',
    'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо',
]
cyrillic = _build_name_variants(
    bases=['мику'],
    honorifics=_cyrillic_honorifics,
    prefixes=['о-'],
    **_SPACED_CONNECTORS,
)

# ── Japanese scripts ──
# Honorifics are listed in hiragana first, then katakana; the concatenation
# below keeps that order.
_hiragana_honorifics = [
    'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん',
    'へいか', 'でんか', 'かっか', 'し', 'ちゃま', 'きゅん', 'どの',
    'せんせい', 'せんぱい', 'じょう',
]
_katakana_honorifics = [
    'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン',
    'ヘイカ', 'デンカ', 'カッカ', 'シ', 'チャマ', 'キュン', 'ドノ',
    'センセイ', 'センパイ', 'ジョウ',
]
japanese = _build_name_variants(
    bases=['みく', 'ミク', '未来'],
    honorifics=_hiragana_honorifics + _katakana_honorifics,
    prefixes=['お', 'オ'],
    # Only an optional hyphen may separate name and honorific, and the
    # prefix attaches directly to the name.
    connector=r'[-]?',
    prefix_connector=r'',
)

# Order alternatives by the length of their regex source, longest first.
# `re` tries alternatives left to right, so longer variants are attempted
# before the bare names. The sort is stable, so equal-length variants keep
# their latin -> cyrillic -> japanese order.
all_v = latin + cyrillic + japanese
all_v.sort(key=len, reverse=True)
alts = '|'.join(all_v)

# Building blocks (regex source strings).
NAME = r'\b(?:' + alts + r')\b'      # any name variant, on word boundaries
PUNCT = r'[,，、:：!！?？.。]'        # punctuation accepted right after a leading name
COMMA = r'[,，、]'                    # comma that sets the name off from the sentence
ETRAIL = r'[!！?？.。~～]*'           # trailing marks allowed after a name at the end
ATRAIL = r'[!！?？.。~～♪♡❤]*'        # trailing marks allowed after a lone name

# Compiled matchers; all are case-insensitive.
START_RE = re.compile(
    r'^\s*' + NAME + r'\s*' + PUNCT,
    flags=re.IGNORECASE,
)
END_RE = re.compile(
    COMMA + r'\s*' + NAME + r'\s*' + ETRAIL + r'\s*$',
    flags=re.IGNORECASE,
)
MIDDLE_RE = re.compile(
    COMMA + r'\s*' + NAME + r'\s*' + COMMA,
    flags=re.IGNORECASE,
)
ALONE_RE = re.compile(
    r'^\s*' + NAME + r'\s*' + ATRAIL + r'\s*$',
    flags=re.IGNORECASE,
)


def is_addressed(text: str) -> bool:
    """Return True if *text* matches any of the four addressing patterns.

    Surrounding whitespace is stripped first. The patterns are tried in the
    order START, END, MIDDLE, ALONE and evaluation stops at the first hit.
    """
    stripped = text.strip()
    return any(
        pattern.search(stripped) is not None
        for pattern in (START_RE, END_RE, MIDDLE_RE, ALONE_RE)
    )


def which_pattern(text: str) -> str:
    """Name every addressing pattern that matches *text* (debugging aid).

    Returns the matching labels joined by ``', '`` in the fixed order
    START, END, MIDDLE, ALONE, or the string ``'NONE'`` when nothing matches.
    Surrounding whitespace is stripped before matching.
    """
    stripped = text.strip()
    labelled_patterns = (
        ("START", START_RE),
        ("END", END_RE),
        ("MIDDLE", MIDDLE_RE),
        ("ALONE", ALONE_RE),
    )
    hits = [
        label
        for label, pattern in labelled_patterns
        if pattern.search(stripped)
    ]
    if not hits:
        return 'NONE'
    return ', '.join(hits)


# ── Test cases ──
# Each entry is a 3-tuple: (message, expected, description)
#   message     – the raw text handed to is_addressed()
#   expected    – the boolean is_addressed(message) should return
#   description – label printed by main() when the case fails
TESTS = [
    # ═══ START pattern (name at beginning + punctuation) ═══
    # The name must open the message and be followed by one of the PUNCT
    # characters (optionally separated by whitespace).
    ("Miku, how are you?",            True,  "START: Latin + comma"),
    ("miku, hello!",                   True,  "START: lowercase Latin"),
    ("MIKU! listen to me",            True,  "START: uppercase + excl"),
    ("Miku: can you help?",           True,  "START: colon"),
    ("Miku. Please help.",            True,  "START: period"),
    ("みく、元気？",                   True,  "START: Hiragana + JP comma"),
    ("ミク！聞いて",                   True,  "START: Katakana + JP excl"),
    ("未来、教えて",                   True,  "START: Kanji + JP comma"),
    ("мику, привет!",                 True,  "START: Cyrillic + comma"),
    ("МИКУ! слушай",                  True,  "START: Cyrillic upper + excl"),
    ("Miku-chan, how are you?",       True,  "START: honorific-dash + comma"),
    ("miku chan, hello!",             True,  "START: honorific-space + comma"),
    ("mikuchan! listen!",             True,  "START: honorific-joined + excl"),
    ("ミクちゃん、聞いて",             True,  "START: JP name+honorific + comma"),
    ("ミクちゃん！元気？",             True,  "START: JP name+honorific + excl"),
    ("みくさん, 教えて",               True,  "START: Hiragana + hon + comma"),
    ("мику-сан, скажи",              True,  "START: Cyrillic + hon + comma"),
    ("o-miku, hello",                 True,  "START: o-prefix Latin"),
    ("おみく、ねえ",                   True,  "START: o-prefix Japanese"),
    ("  Miku, hello  ",              True,  "START: whitespace padded"),

    # ═══ END pattern (comma + name at end) ═══
    # A COMMA character must precede the name; only ETRAIL characters and
    # whitespace may follow it.
    ("how are you, Miku?",            True,  "END: comma + Latin + ?"),
    ("how are you, Miku!",            True,  "END: comma + Latin + !"),
    ("how are you, Miku",             True,  "END: comma + Latin no trail"),
    ("tell me, miku.",                True,  "END: comma + lowercase + period"),
    ("元気, ミク",                     True,  "END: comma + Katakana"),
    ("教えて、みく！",                 True,  "END: JP comma + Hiragana + !"),
    ("教えて、未来",                   True,  "END: JP comma + Kanji"),
    ("скажи, мику!",                  True,  "END: Cyrillic comma + name"),
    ("hello, Miku-chan!",             True,  "END: comma + honorific"),
    ("hello, miku-san?",             True,  "END: comma + honorific + ?"),
    ("元気、ミクちゃん",               True,  "END: JP comma + JP honorific"),
    ("hello, o-miku",                 True,  "END: comma + o-prefix"),

    # ═══ MIDDLE pattern (vocative — commas on both sides) ═══
    # The name sits between two COMMA characters anywhere in the message.
    ("On the contrary, Miku, I think you're wrong",     True, "MIDDLE: vocative Latin"),
    ("I am very happy, Miku, you are so fun",           True, "MIDDLE: vocative Latin 2"),
    ("well, Miku-chan, I think so",                      True, "MIDDLE: vocative + honorific"),
    ("しかし、みく、それは違う",                          True, "MIDDLE: vocative Japanese"),
    ("でも、ミクちゃん、聞いて",                          True, "MIDDLE: vocative JP + honorific"),
    ("но, мику, я думаю",                               True, "MIDDLE: vocative Cyrillic"),
    ("hey, miku, what do you think?",                    True, "MIDDLE: vocative casual"),
    ("you know, Miku, that's not right",                 True, "MIDDLE: vocative mid-sentence"),

    # ═══ ALONE pattern (name is the entire message) ═══
    # Nothing but the name, optional ATRAIL characters and whitespace.
    ("Miku",                          True,  "ALONE: bare Latin"),
    ("miku",                          True,  "ALONE: lowercase"),
    ("MIKU",                          True,  "ALONE: uppercase"),
    ("Miku!",                         True,  "ALONE: + excl"),
    ("Miku?",                         True,  "ALONE: + question"),
    ("Miku!!",                        True,  "ALONE: + multi excl"),
    ("みく",                           True,  "ALONE: Hiragana"),
    ("ミク！",                         True,  "ALONE: Katakana + excl"),
    ("未来",                           True,  "ALONE: Kanji"),
    ("мику",                           True,  "ALONE: Cyrillic"),
    ("Miku-chan",                      True,  "ALONE: Latin + honorific"),
    ("miku chan!",                     True,  "ALONE: space honorific + excl"),
    ("ミクちゃん",                     True,  "ALONE: JP honorific"),
    ("ミクさん！",                     True,  "ALONE: JP honorific + excl"),
    ("みくせんせい",                   True,  "ALONE: Hiragana + sensei"),
    ("o-miku!",                       True,  "ALONE: o-prefix"),
    ("おみく",                         True,  "ALONE: JP o-prefix"),
    ("オミク",                         True,  "ALONE: Katakana o-prefix"),
    ("  Miku  ",                      True,  "ALONE: whitespace"),
    ("Miku~",                         True,  "ALONE: tilde"),
    ("Miku♪",                         True,  "ALONE: music note"),
    ("Miku❤",                         True,  "ALONE: heart"),
    ("мику-чан",                      True,  "ALONE: Cyrillic + honorific"),
    ("мику сан",                      True,  "ALONE: Cyrillic + space hon"),
    ("未来さん",                       True,  "ALONE: Kanji + honorific"),

    # ═══ Should NOT match (mere mentions / not addressing) ═══
    # The name appears, but not in any of the four addressed positions, so
    # is_addressed() is expected to return False.
    ("I like Miku",                    False, "REJECT: object of sentence"),
    ("Miku is cool",                   False, "REJECT: subject + is"),
    ("Miku is my favorite vocaloid",   False, "REJECT: subject + statement"),
    ("I saw Miku at a concert",        False, "REJECT: middle of sentence"),
    ("told miku about it",             False, "REJECT: informal mention"),
    ("hatsune miku concert",           False, "REJECT: event name"),
    ("Do you know Miku?",             False, "REJECT: asking about her"),
    ("I love Miku!",                   False, "REJECT: exclamation about her"),
    ("I love Miku so much",           False, "REJECT: longer statement"),
    ("ミクは元気だよ",                 False, "REJECT: Japanese 'Miku is well'"),
    ("ミクが好き",                     False, "REJECT: Japanese 'I like Miku'"),
    ("ミクのことが好き",               False, "REJECT: Japanese 'I like Miku (thing)'"),
    ("мику была там",                  False, "REJECT: Cyrillic 'Miku was there'"),
    ("мику такая красивая",            False, "REJECT: Cyrillic 'Miku is pretty'"),
    ("the Miku concert was great",     False, "REJECT: event discussion"),
    ("My favorite is Miku for sure",   False, "REJECT: no comma before name at end"),
    ("yeah miku is pretty cool right", False, "REJECT: casual mention"),
    ("have you seen miku today",       False, "REJECT: asking about her"),
    ("miku and I went shopping",       False, "REJECT: subject of sentence"),
    ("I met miku yesterday",           False, "REJECT: object mid-sentence"),
    # NAME is wrapped in \b…\b, so the name glued to more word characters
    # must not count.
    ("mikumiku fan",                   False, "REJECT: compound word (\\b boundary)"),
    ("hatsune miku is singing",        False, "REJECT: full name as subject"),

    # ═══ Edge cases ═══
    # is_addressed() strips the message before matching, so whitespace-only
    # input is matched as the empty string.
    ("",                               False, "EDGE: empty message"),
    ("hello",                          False, "EDGE: no name at all"),
    ("hello!",                         False, "EDGE: exclamation, no name"),
    ("??",                             False, "EDGE: just punctuation"),
    ("   ",                            False, "EDGE: just whitespace"),
    ("chan",                            False, "EDGE: just an honorific"),
    ("o-",                             False, "EDGE: just a prefix"),
]


def main():
    """Run every case in TESTS and report the outcome on stdout.

    For each case whose ``is_addressed`` result differs from the expected
    value, prints the expected/actual verdicts, the patterns that matched,
    the case description and the message. Ends with a pass/fail summary.

    Returns:
        0 when every case passed, 1 otherwise (used as the exit status).
    """
    def verdict(flag):
        # Label used in failure reports: addressed vs. skipped.
        return "ADDR" if flag else "SKIP"

    total = len(TESTS)
    print(f"Generated {len(all_v)} name variants")
    print(f"Running {total} test cases...\n")

    failures = 0
    for message, want, description in TESTS:
        got = is_addressed(message)
        if got == want:
            continue

        failures += 1
        hit = which_pattern(message)
        print(f"  FAIL  expected={verdict(want)}  got={verdict(got)}  matched={hit}")
        print(f"        {description}")
        print(f"        message: \"{message}\"\n")

    successes = total - failures
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"  {successes}/{total} passed, {failures} failed")
    print(banner)

    return 1 if failures else 0


if __name__ == '__main__':
    # Use main()'s return value (0 = all passed, 1 = failures) as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)