# utils/core.py
#
# Detects whether a Discord message is **addressed to** Miku
# (as opposed to merely mentioning her).
#
# The second half of this module is the addressing self-test
# (test_addressing.py).  It exercises the *same* compiled patterns the bot
# uses instead of a hand-copied replica, so the two cannot drift apart.

import re
import sys

try:
    from utils.logger import get_logger
except ImportError:
    # Outside the bot package (e.g. when running the self-test directly)
    # fall back to the standard library so the module stays importable.
    from logging import getLogger as get_logger

logger = get_logger('core')


# ────────────────────────────────────────────────────────────────────
# Pre-compiled Miku addressing patterns
# Built once at module load; is_miku_addressed() runs only 4 .search()
# ────────────────────────────────────────────────────────────────────

def _build_name_variants(bases, honorifics, prefixes, connector, prefix_connector):
    """Return regex fragments for every name+honorific+prefix combo
    within a single script family.

    Args:
        bases: literal spellings of the name (escaped here).
        honorifics: literal honorific suffixes (escaped here).
        prefixes: literal honorific prefixes such as ``o-`` (escaped here).
        connector: regex fragment allowed between name and honorific.
        prefix_connector: regex fragment allowed between prefix and name.

    Returns:
        A list of regex source fragments: for each base, the bare name,
        name+honorific, prefix+name and prefix+name+honorific.
    """
    # Escape the honorifics once instead of once per base/prefix.
    escaped_honorifics = [re.escape(h) for h in honorifics]

    variants = []
    for base in bases:
        be = re.escape(base)
        variants.append(be)
        for he in escaped_honorifics:
            variants.append(be + connector + he)
        for p in prefixes:
            prefixed = re.escape(p) + prefix_connector + be
            variants.append(prefixed)
            for he in escaped_honorifics:
                variants.append(prefixed + connector + he)
    return variants


def _collect_name_variants():
    """Return every name-variant regex fragment (Latin, Cyrillic, Japanese),
    sorted longest-first."""
    latin = _build_name_variants(
        bases=['miku'],
        honorifics=[
            'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
            'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei',
            'senpai', 'jou',
        ],
        prefixes=['o-'],
        connector=r'[\-\s]?',
        prefix_connector=r'\s?',
    )

    cyrillic = _build_name_variants(
        bases=['мику'],
        honorifics=[
            'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин',
            'хейка', 'хеика', 'денка', 'какка', 'си', 'чама', 'кюн',
            'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо',
        ],
        prefixes=['о-'],
        connector=r'[\-\s]?',
        prefix_connector=r'\s?',
    )

    japanese = _build_name_variants(
        bases=['みく', 'ミク', '未来'],
        honorifics=[
            # Hiragana
            'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん',
            'へいか', 'でんか', 'かっか', 'し', 'ちゃま', 'きゅん', 'どの',
            'せんせい', 'せんぱい', 'じょう',
            # Katakana
            'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン',
            'ヘイカ', 'デンカ', 'カッカ', 'シ', 'チャマ', 'キュン', 'ドノ',
            'センセイ', 'センパイ', 'ジョウ',
        ],
        prefixes=['お', 'オ'],
        connector=r'[-]?',
        prefix_connector=r'',
    )

    # Longest-first so the regex engine prefers the most specific match
    return sorted(latin + cyrillic + japanese, key=len, reverse=True)


def _compile_addressing_patterns():
    """Compile the four addressing regexes.

    START  – name at the beginning, followed by punctuation
             "Miku, how are you?"   "みく!聞いて"
    END    – comma then name at the end
             "how are you, Miku?"   "教えて、ミク"
    MIDDLE – name surrounded by commas (vocative)
             "On the contrary, Miku, I think…"
    ALONE  – name is the entire message
             "Miku"   "みく!"   "ミクちゃん"

    Returns:
        Tuple ``(start_re, end_re, middle_re, alone_re)`` of compiled,
        case-insensitive patterns.
    """
    all_v = _collect_name_variants()
    alts = '|'.join(all_v)

    # Full-width forms are written as \uXXXX escapes (interpreted by the
    # regex engine) so they cannot be folded into their ASCII twins by an
    # editor or a Unicode-normalising tool:
    #   FF0C ,  FF1A :  FF01 !  FF1F ?  FF5E ~
    NAME = rf'\b(?:{alts})\b'
    PUNCT = r'[,\uFF0C、:\uFF1A!\uFF01?\uFF1F.。]'   # addressing punctuation after name
    COMMA = r'[,\uFF0C、]'                            # comma variants (before name / vocative)
    ETRAIL = r'[!\uFF01?\uFF1F.。~\uFF5E]*'           # optional trailing at end
    ATRAIL = r'[!\uFF01?\uFF1F.。~\uFF5E♪♡❤]*'        # optional trailing for name-only messages

    start_re = re.compile(rf'^\s*{NAME}\s*{PUNCT}', re.IGNORECASE)
    end_re = re.compile(rf'{COMMA}\s*{NAME}\s*{ETRAIL}\s*$', re.IGNORECASE)
    middle_re = re.compile(rf'{COMMA}\s*{NAME}\s*{COMMA}', re.IGNORECASE)
    alone_re = re.compile(rf'^\s*{NAME}\s*{ATRAIL}\s*$', re.IGNORECASE)

    logger.info(f"Miku addressing: {len(all_v)} name variants compiled into 4 patterns")
    return start_re, end_re, middle_re, alone_re


# A compile failure must not prevent the module from importing: DMs,
# @mentions and replies are still handled without the patterns.
try:
    _START_RE, _END_RE, _MIDDLE_RE, _ALONE_RE = _compile_addressing_patterns()
except Exception as e:
    logger.error(f"Failed to compile addressing patterns: {e}")
    _START_RE = _END_RE = _MIDDLE_RE = _ALONE_RE = None

# (label, pattern) pairs shared by the bot entry point and the self-test.
_PATTERNS = (
    ("START", _START_RE),
    ("END", _END_RE),
    ("MIDDLE", _MIDDLE_RE),
    ("ALONE", _ALONE_RE),
)


def is_addressed(text: str) -> bool:
    """Return True when *text* puts Miku's name in an addressing position.

    Pure text check (no Discord objects).  Returns False when the patterns
    failed to compile.
    """
    if _START_RE is None:
        return False
    text = text.strip()
    return any(rx.search(text) for _, rx in _PATTERNS)


def which_pattern(text: str) -> str:
    """Return which pattern(s) matched, comma-separated, or 'NONE' (for debugging)."""
    if _START_RE is None:
        return 'NONE'
    text = text.strip()
    matched = [label for label, rx in _PATTERNS if rx.search(text)]
    return ', '.join(matched) if matched else 'NONE'


async def _replied_author(message):
    """Return the author of the message that *message* replies to, or None.

    Uses the reference's already-resolved message when one is attached and
    only falls back to an API fetch otherwise.  Fetch failures are logged
    and reported as None (best effort).
    """
    ref = message.reference
    # NOTE(review): relies on discord.py exposing the replied-to message as
    # `reference.resolved`; anything without an `.author` falls through to
    # the fetch below – confirm against the discord.py version in use.
    author = getattr(getattr(ref, 'resolved', None), 'author', None)
    if author is not None:
        return author
    try:
        fetched = await message.channel.fetch_message(ref.message_id)
        return fetched.author
    except Exception as e:
        logger.warning(f"Could not fetch referenced message: {e}")
        return None


# ────────────────────────────────────────────────────────────────────
async def is_miku_addressed(message) -> bool:
    """Return True only when the message is directed *at* Miku,
    not merely mentioning her.

    Always responds to: DMs, @mentions, replies to Miku's messages.

    For normal messages checks whether Miku's name (in any supported
    script / honorific combination) appears in an "addressing" position:
      • Start  – "Miku, how are you?"
      • End    – "how are you, Miku?"
      • Middle – "On the contrary, Miku, I think…"
      • Alone  – "Miku!" / "ミクちゃん"

    Does NOT trigger on mere mentions:
      • "I like Miku" / "Miku is cool" / "told miku about it"
    """
    # DMs – always respond
    if message.guild is None:
        return True

    if not message.guild or not message.guild.me:
        logger.warning(f"Invalid guild/guild.me for message from {message.author}")
        return False

    me = message.guild.me

    # @mention
    if me in message.mentions:
        return True

    # Reply to Miku
    if message.reference:
        author = await _replied_author(message)
        if author is not None and author == me:
            return True

    # Regex addressing (4 pre-compiled patterns)
    if _START_RE is None:
        logger.error("Addressing patterns not compiled – skipping pattern check")
        return False

    return is_addressed(message.content)


# ────────────────────────────────────────────────────────────────────
# Self-test (test_addressing.py)
#
# Verifies that Miku is only triggered when *addressed*, not merely
# *mentioned*.  Run this module as a script to execute it.
# ────────────────────────────────────────────────────────────────────

# (message, expected, description)
TESTS = [
    # ═══ START pattern (name at beginning + punctuation) ═══
    ("Miku, how are you?", True, "START: Latin + comma"),
    ("miku, hello!", True, "START: lowercase Latin"),
    ("MIKU! listen to me", True, "START: uppercase + excl"),
    ("Miku: can you help?", True, "START: colon"),
    ("Miku. Please help.", True, "START: period"),
    ("みく、元気?", True, "START: Hiragana + JP comma"),
    ("ミク!聞いて", True, "START: Katakana + JP excl"),
    ("未来、教えて", True, "START: Kanji + JP comma"),
    ("мику, привет!", True, "START: Cyrillic + comma"),
    ("МИКУ! слушай", True, "START: Cyrillic upper + excl"),
    ("Miku-chan, how are you?", True, "START: honorific-dash + comma"),
    ("miku chan, hello!", True, "START: honorific-space + comma"),
    ("mikuchan! listen!", True, "START: honorific-joined + excl"),
    ("ミクちゃん、聞いて", True, "START: JP name+honorific + comma"),
    ("ミクちゃん!元気?", True, "START: JP name+honorific + excl"),
    ("みくさん, 教えて", True, "START: Hiragana + hon + comma"),
    ("мику-сан, скажи", True, "START: Cyrillic + hon + comma"),
    ("o-miku, hello", True, "START: o-prefix Latin"),
    ("おみく、ねえ", True, "START: o-prefix Japanese"),
    (" Miku, hello ", True, "START: whitespace padded"),

    # ═══ END pattern (comma + name at end) ═══
    ("how are you, Miku?", True, "END: comma + Latin + ?"),
    ("how are you, Miku!", True, "END: comma + Latin + !"),
    ("how are you, Miku", True, "END: comma + Latin no trail"),
    ("tell me, miku.", True, "END: comma + lowercase + period"),
    ("元気, ミク", True, "END: comma + Katakana"),
    ("教えて、みく!", True, "END: JP comma + Hiragana + !"),
    ("教えて、未来", True, "END: JP comma + Kanji"),
    ("скажи, мику!", True, "END: Cyrillic comma + name"),
    ("hello, Miku-chan!", True, "END: comma + honorific"),
    ("hello, miku-san?", True, "END: comma + honorific + ?"),
    ("元気、ミクちゃん", True, "END: JP comma + JP honorific"),
    ("hello, o-miku", True, "END: comma + o-prefix"),

    # ═══ MIDDLE pattern (vocative — commas on both sides) ═══
    ("On the contrary, Miku, I think you're wrong", True, "MIDDLE: vocative Latin"),
    ("I am very happy, Miku, you are so fun", True, "MIDDLE: vocative Latin 2"),
    ("well, Miku-chan, I think so", True, "MIDDLE: vocative + honorific"),
    ("しかし、みく、それは違う", True, "MIDDLE: vocative Japanese"),
    ("でも、ミクちゃん、聞いて", True, "MIDDLE: vocative JP + honorific"),
    ("но, мику, я думаю", True, "MIDDLE: vocative Cyrillic"),
    ("hey, miku, what do you think?", True, "MIDDLE: vocative casual"),
    ("you know, Miku, that's not right", True, "MIDDLE: vocative mid-sentence"),

    # ═══ ALONE pattern (name is the entire message) ═══
    ("Miku", True, "ALONE: bare Latin"),
    ("miku", True, "ALONE: lowercase"),
    ("MIKU", True, "ALONE: uppercase"),
    ("Miku!", True, "ALONE: + excl"),
    ("Miku?", True, "ALONE: + question"),
    ("Miku!!", True, "ALONE: + multi excl"),
    ("みく", True, "ALONE: Hiragana"),
    ("ミク!", True, "ALONE: Katakana + excl"),
    ("未来", True, "ALONE: Kanji"),
    ("мику", True, "ALONE: Cyrillic"),
    ("Miku-chan", True, "ALONE: Latin + honorific"),
    ("miku chan!", True, "ALONE: space honorific + excl"),
    ("ミクちゃん", True, "ALONE: JP honorific"),
    ("ミクさん!", True, "ALONE: JP honorific + excl"),
    ("みくせんせい", True, "ALONE: Hiragana + sensei"),
    ("o-miku!", True, "ALONE: o-prefix"),
    ("おみく", True, "ALONE: JP o-prefix"),
    ("オミク", True, "ALONE: Katakana o-prefix"),
    (" Miku ", True, "ALONE: whitespace"),
    ("Miku~", True, "ALONE: tilde"),
    ("Miku♪", True, "ALONE: music note"),
    ("Miku❤", True, "ALONE: heart"),
    ("мику-чан", True, "ALONE: Cyrillic + honorific"),
    ("мику сан", True, "ALONE: Cyrillic + space hon"),
    ("未来さん", True, "ALONE: Kanji + honorific"),

    # ═══ Full-width punctuation (escaped so it survives normalisation) ═══
    ("ミク\uFF01聞いて", True, "START: Katakana + full-width excl"),
    ("Miku\uFF1A help", True, "START: full-width colon"),
    ("教えて\uFF0Cミク\uFF1F", True, "END: full-width comma + full-width ?"),
    ("でも\uFF0Cミク\uFF0C聞いて", True, "MIDDLE: full-width commas"),
    ("Miku\uFF5E", True, "ALONE: full-width tilde"),

    # ═══ Should NOT match (mere mentions / not addressing) ═══
    ("I like Miku", False, "REJECT: object of sentence"),
    ("Miku is cool", False, "REJECT: subject + is"),
    ("Miku is my favorite vocaloid", False, "REJECT: subject + statement"),
    ("I saw Miku at a concert", False, "REJECT: middle of sentence"),
    ("told miku about it", False, "REJECT: informal mention"),
    ("hatsune miku concert", False, "REJECT: event name"),
    ("Do you know Miku?", False, "REJECT: asking about her"),
    ("I love Miku!", False, "REJECT: exclamation about her"),
    ("I love Miku so much", False, "REJECT: longer statement"),
    ("ミクは元気だよ", False, "REJECT: Japanese 'Miku is well'"),
    ("ミクが好き", False, "REJECT: Japanese 'I like Miku'"),
    ("ミクのことが好き", False, "REJECT: Japanese 'I like Miku (thing)'"),
    ("мику была там", False, "REJECT: Cyrillic 'Miku was there'"),
    ("мику такая красивая", False, "REJECT: Cyrillic 'Miku is pretty'"),
    ("the Miku concert was great", False, "REJECT: event discussion"),
    ("My favorite is Miku for sure", False, "REJECT: no comma before name at end"),
    ("yeah miku is pretty cool right", False, "REJECT: casual mention"),
    ("have you seen miku today", False, "REJECT: asking about her"),
    ("miku and I went shopping", False, "REJECT: subject of sentence"),
    ("I met miku yesterday", False, "REJECT: object mid-sentence"),
    ("mikumiku fan", False, "REJECT: compound word (\\b boundary)"),
    ("hatsune miku is singing", False, "REJECT: full name as subject"),

    # ═══ Edge cases ═══
    ("", False, "EDGE: empty message"),
    ("hello", False, "EDGE: no name at all"),
    ("hello!", False, "EDGE: exclamation, no name"),
    ("??", False, "EDGE: just punctuation"),
    (" ", False, "EDGE: just whitespace"),
    ("chan", False, "EDGE: just an honorific"),
    ("o-", False, "EDGE: just a prefix"),
]


def main():
    """Run every case in TESTS; print failures and a summary.

    Returns:
        0 when all cases pass, 1 otherwise (suitable for ``sys.exit``).
    """
    print(f"Generated {len(_collect_name_variants())} name variants")
    print(f"Running {len(TESTS)} test cases...\n")

    passed = 0
    failed = 0

    for msg, expected, desc in TESTS:
        result = is_addressed(msg)
        if result == expected:
            passed += 1
            continue

        failed += 1
        pattern = which_pattern(msg)
        exp_str = "ADDR" if expected else "SKIP"
        got_str = "ADDR" if result else "SKIP"
        print(f" FAIL expected={exp_str} got={got_str} matched={pattern}")
        print(f" {desc}")
        print(f" message: \"{msg}\"\n")

    print(f"\n{'='*50}")
    print(f" {passed}/{len(TESTS)} passed, {failed} failed")
    print(f"{'='*50}")

    return 0 if failed == 0 else 1


if __name__ == '__main__':
    sys.exit(main())