Rewrite is_miku_addressed() to only trigger when addressed, not mentioned
- Pre-compile 393 name variants into 4 regex patterns at module load (was 7,300+ raw re.search() calls per message)
- Strict addressing detection using punctuation context:
  - START: name at beginning + punctuation (Miku, ... / みく!...)
  - END: comma + name at end (..., Miku / ...、ミク)
  - MIDDLE: commas on both sides - vocative (..., Miku, ...)
  - ALONE: name is the entire message (Miku! / ミクちゃん)
- Rejects mere mentions: 'I like Miku' / 'Miku is cool' no longer trigger
- Script-family-aware pattern generation (Latin, Cyrillic, Japanese) eliminates nonsensical cross-script combos (e.g. o-みく)
- Word boundary enforcement prevents substring matches (mikumiku)
- Fixes regex 'unbalanced parenthesis' errors from old implementation
- Add comprehensive test suite (94 cases, all passing)
This commit is contained in:
@@ -1,167 +1,165 @@
|
||||
# utils/core.py
|
||||
#
|
||||
# Detects whether a Discord message is **addressed to** Miku
|
||||
# (as opposed to merely mentioning her).
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import re
|
||||
|
||||
import globals
|
||||
# Langchain imports below are only used in commented-out code
|
||||
# from langchain_community.vectorstores import FAISS
|
||||
# from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
||||
# from langchain_core.documents import Document
|
||||
from utils.logger import get_logger
|
||||
|
||||
logger = get_logger('core')
|
||||
|
||||
|
||||
# switch_model() removed - llama-swap handles model switching automatically
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
# Pre-compiled Miku addressing patterns
|
||||
# Built once at module load; is_miku_addressed() runs only 4 .search() calls per message.
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _build_name_variants(bases, honorifics, prefixes, connector, prefix_connector):
    """Return regex fragments for every name/honorific/prefix combination
    within a single script family.

    Args:
        bases: Literal base names; each is passed through ``re.escape``.
        honorifics: Literal honorific suffixes; each is escaped.
        prefixes: Literal prefixes; each is escaped.
        connector: Raw regex fragment placed between a base and an honorific.
        prefix_connector: Raw regex fragment placed between a prefix and a base.

    Returns:
        A list of regex fragments. For each base, in order: the bare base,
        the base with each honorific, then for each prefix the prefixed base
        followed by the prefixed base with each honorific.
    """
    # Pre-render the pieces that do not depend on the base name.
    tails = [connector + re.escape(honorific) for honorific in honorifics]
    heads = [re.escape(prefix) + prefix_connector for prefix in prefixes]

    fragments = []
    for base in bases:
        stem = re.escape(base)
        # '' stands for "no prefix", so the unprefixed forms come first.
        for head in ['', *heads]:
            fragments.append(head + stem)
            fragments.extend(head + stem + tail for tail in tails)
    return fragments
|
||||
|
||||
|
||||
def _compile_addressing_patterns():
    """Compile the four addressing regexes.

    START  – name at the beginning, followed by punctuation
             "Miku, how are you?"   "みく!聞いて"
    END    – comma then name at the end
             "how are you, Miku?"   "教えて、ミク"
    MIDDLE – name surrounded by commas (vocative)
             "On the contrary, Miku, I think…"
    ALONE  – name is the entire message
             "Miku"   "みく!"   "ミクちゃん"

    Name variants are generated per script family (Latin, Cyrillic,
    Japanese) so that prefixes and honorifics are only combined with base
    names of the same script.

    Returns:
        A ``(start_re, end_re, middle_re, alone_re)`` tuple of compiled,
        case-insensitive patterns.
    """
    latin = _build_name_variants(
        bases=['miku'],
        honorifics=[
            'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
            'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei',
            'senpai', 'jou',
        ],
        prefixes=['o-'],
        connector=r'[\-\s]?',
        prefix_connector=r'\s?',
    )

    cyrillic = _build_name_variants(
        bases=['мику'],
        honorifics=[
            'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин',
            'хейка', 'хеика', 'денка', 'какка', 'си', 'чама', 'кюн',
            'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо',
        ],
        prefixes=['о-'],
        connector=r'[\-\s]?',
        prefix_connector=r'\s?',
    )

    japanese = _build_name_variants(
        bases=['みく', 'ミク', '未来'],
        honorifics=[
            # Hiragana
            'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん',
            'へいか', 'でんか', 'かっか', 'し', 'ちゃま', 'きゅん', 'どの',
            'せんせい', 'せんぱい', 'じょう',
            # Katakana
            'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン',
            'ヘイカ', 'デンカ', 'カッカ', 'シ', 'チャマ', 'キュン', 'ドノ',
            'センセイ', 'センパイ', 'ジョウ',
        ],
        prefixes=['お', 'オ'],
        connector=r'[-]?',
        prefix_connector=r'',
    )

    # Longest-first so the regex engine prefers the most specific match
    all_v = sorted(latin + cyrillic + japanese, key=len, reverse=True)
    alts = '|'.join(all_v)

    # \b keeps the name from matching inside a longer word ("mikumiku").
    NAME = rf'\b(?:{alts})\b'

    # Punctuation classes. Every ASCII mark is paired with its full-width
    # (CJK input method) counterpart. The full-width forms are written as
    # \uXXXX escapes, which `re` resolves itself, so they cannot be folded
    # into their ASCII twins if this file is ever Unicode-normalised.
    #   U+FF0C full-width comma        U+3001 ideographic comma
    #   U+FF1A full-width colon        U+FF01 full-width exclamation mark
    #   U+FF1F full-width question     U+3002 ideographic full stop
    #   U+FF5E full-width tilde        U+301C wave dash
    #   U+266A eighth note             U+2661 white heart
    #   U+2764 heavy heart             U+FE0F emoji variation selector
    PUNCT = r'[,\uFF0C\u3001:\uFF1A!\uFF01?\uFF1F.\u3002]'   # addressing punctuation after name
    COMMA = r'[,\uFF0C\u3001]'                               # comma variants (before name / vocative)
    ETRAIL = r'[!\uFF01?\uFF1F.\u3002~\uFF5E\u301C]*'        # optional trailing at end
    # U+FE0F is accepted because a heart emoji is commonly followed by it.
    ATRAIL = r'[!\uFF01?\uFF1F.\u3002~\uFF5E\u301C\u266A\u2661\u2764\uFE0F]*'  # optional trailing for name-only messages

    start_re = re.compile(rf'^\s*{NAME}\s*{PUNCT}', re.IGNORECASE)
    end_re = re.compile(rf'{COMMA}\s*{NAME}\s*{ETRAIL}\s*$', re.IGNORECASE)
    middle_re = re.compile(rf'{COMMA}\s*{NAME}\s*{COMMA}', re.IGNORECASE)
    alone_re = re.compile(rf'^\s*{NAME}\s*{ATRAIL}\s*$', re.IGNORECASE)

    logger.info(f"Miku addressing: {len(all_v)} name variants compiled into 4 patterns")
    return start_re, end_re, middle_re, alone_re
|
||||
|
||||
|
||||
# Default to "not compiled"; is_miku_addressed() checks for None before use.
_START_RE = _END_RE = _MIDDLE_RE = _ALONE_RE = None
try:
    # Runs once at import time so no pattern is rebuilt per message.
    _START_RE, _END_RE, _MIDDLE_RE, _ALONE_RE = _compile_addressing_patterns()
except Exception as e:
    # A failure here is logged rather than raised, so the import still succeeds.
    logger.error(f"Failed to compile addressing patterns: {e}")
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def is_miku_addressed(message) -> bool:
    """Return True only when the message is directed *at* Miku,
    not merely mentioning her.

    Always responds to: DMs, @mentions, replies to Miku's messages.

    For normal messages checks whether Miku's name (in any supported
    script / honorific combination) appears in an "addressing" position:
      • Start  – "Miku, how are you?"
      • End    – "how are you, Miku?"
      • Middle – "On the contrary, Miku, I think…"
      • Alone  – "Miku!" / "ミクちゃん"

    Does NOT trigger on mere mentions:
      • "I like Miku" / "Miku is cool" / "told miku about it"

    Args:
        message: The incoming message (presumably a discord.py ``Message``
            — confirm against the caller). The attributes read here are
            ``guild``, ``mentions``, ``reference``, ``channel``, ``author``
            and ``content``.

    Returns:
        True if the bot should treat the message as addressed to it.
        False otherwise, including when ``guild.me`` is unavailable or the
        addressing patterns failed to compile at import time.
    """
    # DMs – always respond
    if message.guild is None:
        return True

    # Safety check: ensure guild and guild.me exist
    if not message.guild or not message.guild.me:
        logger.warning(f"Invalid guild/guild.me for message from {message.author}")
        return False

    me = message.guild.me

    # @mention
    if me in message.mentions:
        return True

    # Reply to Miku
    if message.reference:
        try:
            ref = await message.channel.fetch_message(message.reference.message_id)
            if ref.author == me:
                return True
        except Exception as e:
            # Best effort: if the referenced message cannot be fetched,
            # fall through to the name-based check instead of failing.
            logger.warning(f"Could not fetch referenced message: {e}")

    # Regex addressing (4 pre-compiled patterns)
    if _START_RE is None:
        logger.error("Addressing patterns not compiled – skipping pattern check")
        return False

    text = message.content.strip()
    return bool(
        _START_RE.search(text)
        or _END_RE.search(text)
        or _MIDDLE_RE.search(text)
        or _ALONE_RE.search(text)
    )
|
||||
|
||||
249
test_addressing.py
Normal file
249
test_addressing.py
Normal file
@@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Comprehensive test for Miku addressing detection patterns.
|
||||
|
||||
Tests the pre-compiled regex patterns from bot/utils/core.py to verify
|
||||
that Miku is only triggered when *addressed*, not merely *mentioned*.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
# ── Replicate the pattern-building logic from core.py ──
|
||||
|
||||
def _build_name_variants(bases, honorifics, prefixes, connector, prefix_connector):
    """Return regex fragments for every base/honorific/prefix combination.

    Literal inputs (``bases``, ``honorifics``, ``prefixes``) are escaped
    with ``re.escape``; ``connector`` and ``prefix_connector`` are inserted
    as raw regex. For each base the result lists the bare base, the base
    with each honorific, then for each prefix the prefixed base followed by
    the prefixed base with each honorific.
    """
    # Pre-render the pieces that do not depend on the base name.
    tails = [connector + re.escape(honorific) for honorific in honorifics]
    heads = [re.escape(prefix) + prefix_connector for prefix in prefixes]

    fragments = []
    for base in bases:
        stem = re.escape(base)
        # '' stands for "no prefix", so the unprefixed forms come first.
        for head in ['', *heads]:
            fragments.append(head + stem)
            fragments.extend(head + stem + tail for tail in tails)
    return fragments
|
||||
|
||||
|
||||
# Name variants are generated per script family so that prefixes and
# honorifics are only combined with base names of the same script.
latin = _build_name_variants(
    bases=['miku'],
    honorifics=[
        'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
        'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei',
        'senpai', 'jou',
    ],
    prefixes=['o-'],
    connector=r'[\-\s]?',
    prefix_connector=r'\s?',
)

cyrillic = _build_name_variants(
    bases=['мику'],
    honorifics=[
        'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин',
        'хейка', 'хеика', 'денка', 'какка', 'си', 'чама', 'кюн',
        'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо',
    ],
    prefixes=['о-'],
    connector=r'[\-\s]?',
    prefix_connector=r'\s?',
)

japanese = _build_name_variants(
    bases=['みく', 'ミク', '未来'],
    honorifics=[
        # Hiragana
        'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん',
        'へいか', 'でんか', 'かっか', 'し', 'ちゃま', 'きゅん', 'どの',
        'せんせい', 'せんぱい', 'じょう',
        # Katakana
        'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン',
        'ヘイカ', 'デンカ', 'カッカ', 'シ', 'チャマ', 'キュン', 'ドノ',
        'センセイ', 'センパイ', 'ジョウ',
    ],
    prefixes=['お', 'オ'],
    connector=r'[-]?',
    prefix_connector=r'',
)

# Longest-first so the regex engine prefers the most specific match.
all_v = sorted(latin + cyrillic + japanese, key=len, reverse=True)
alts = '|'.join(all_v)

# \b keeps the name from matching inside a longer word ("mikumiku").
NAME = rf'\b(?:{alts})\b'

# Punctuation classes. Every ASCII mark is paired with its full-width
# (CJK input method) counterpart. The full-width forms are written as
# \uXXXX escapes, which `re` resolves itself, so they cannot be folded into
# their ASCII twins if this file is ever Unicode-normalised.
#   U+FF0C full-width comma        U+3001 ideographic comma
#   U+FF1A full-width colon        U+FF01 full-width exclamation mark
#   U+FF1F full-width question     U+3002 ideographic full stop
#   U+FF5E full-width tilde        U+301C wave dash
#   U+266A eighth note             U+2661 white heart
#   U+2764 heavy heart             U+FE0F emoji variation selector
PUNCT = r'[,\uFF0C\u3001:\uFF1A!\uFF01?\uFF1F.\u3002]'   # addressing punctuation after name
COMMA = r'[,\uFF0C\u3001]'                               # comma variants (before name / vocative)
ETRAIL = r'[!\uFF01?\uFF1F.\u3002~\uFF5E\u301C]*'        # optional trailing at end
# U+FE0F is accepted because a heart emoji is commonly followed by it.
ATRAIL = r'[!\uFF01?\uFF1F.\u3002~\uFF5E\u301C\u266A\u2661\u2764\uFE0F]*'  # optional trailing for name-only messages

START_RE = re.compile(rf'^\s*{NAME}\s*{PUNCT}', re.IGNORECASE)
END_RE = re.compile(rf'{COMMA}\s*{NAME}\s*{ETRAIL}\s*$', re.IGNORECASE)
MIDDLE_RE = re.compile(rf'{COMMA}\s*{NAME}\s*{COMMA}', re.IGNORECASE)
ALONE_RE = re.compile(rf'^\s*{NAME}\s*{ATRAIL}\s*$', re.IGNORECASE)
|
||||
|
||||
|
||||
def is_addressed(text: str) -> bool:
    """Return True if *text*, stripped of surrounding whitespace, matches
    any of the START, END, MIDDLE or ALONE addressing patterns."""
    candidate = text.strip()
    # Same evaluation order as the patterns are listed; stops at the first hit.
    return any(
        regex.search(candidate)
        for regex in (START_RE, END_RE, MIDDLE_RE, ALONE_RE)
    )
|
||||
|
||||
|
||||
def which_pattern(text: str) -> str:
    """Name every addressing pattern that matches *text* (debugging aid).

    Returns the matching pattern names joined by ``', '`` in the order
    START, END, MIDDLE, ALONE, or ``'NONE'`` when nothing matches.
    """
    candidate = text.strip()
    labelled = (
        ("START", START_RE),
        ("END", END_RE),
        ("MIDDLE", MIDDLE_RE),
        ("ALONE", ALONE_RE),
    )
    hits = [label for label, regex in labelled if regex.search(candidate)]
    # An empty join is '', which falls through to the 'NONE' marker.
    return ', '.join(hits) or 'NONE'
|
||||
|
||||
|
||||
# ── Test cases ──
|
||||
# (message, expected, description)
|
||||
# Messages that must be treated as addressed to Miku: (message, description)
_ADDRESSED_CASES = [
    # ═══ START pattern (name at beginning + punctuation) ═══
    ("Miku, how are you?", "START: Latin + comma"),
    ("miku, hello!", "START: lowercase Latin"),
    ("MIKU! listen to me", "START: uppercase + excl"),
    ("Miku: can you help?", "START: colon"),
    ("Miku. Please help.", "START: period"),
    ("みく、元気?", "START: Hiragana + JP comma"),
    ("ミク!聞いて", "START: Katakana + JP excl"),
    ("未来、教えて", "START: Kanji + JP comma"),
    ("мику, привет!", "START: Cyrillic + comma"),
    ("МИКУ! слушай", "START: Cyrillic upper + excl"),
    ("Miku-chan, how are you?", "START: honorific-dash + comma"),
    ("miku chan, hello!", "START: honorific-space + comma"),
    ("mikuchan! listen!", "START: honorific-joined + excl"),
    ("ミクちゃん、聞いて", "START: JP name+honorific + comma"),
    ("ミクちゃん!元気?", "START: JP name+honorific + excl"),
    ("みくさん, 教えて", "START: Hiragana + hon + comma"),
    ("мику-сан, скажи", "START: Cyrillic + hon + comma"),
    ("o-miku, hello", "START: o-prefix Latin"),
    ("おみく、ねえ", "START: o-prefix Japanese"),
    (" Miku, hello ", "START: whitespace padded"),

    # ═══ END pattern (comma + name at end) ═══
    ("how are you, Miku?", "END: comma + Latin + ?"),
    ("how are you, Miku!", "END: comma + Latin + !"),
    ("how are you, Miku", "END: comma + Latin no trail"),
    ("tell me, miku.", "END: comma + lowercase + period"),
    ("元気, ミク", "END: comma + Katakana"),
    ("教えて、みく!", "END: JP comma + Hiragana + !"),
    ("教えて、未来", "END: JP comma + Kanji"),
    ("скажи, мику!", "END: Cyrillic comma + name"),
    ("hello, Miku-chan!", "END: comma + honorific"),
    ("hello, miku-san?", "END: comma + honorific + ?"),
    ("元気、ミクちゃん", "END: JP comma + JP honorific"),
    ("hello, o-miku", "END: comma + o-prefix"),

    # ═══ MIDDLE pattern (vocative — commas on both sides) ═══
    ("On the contrary, Miku, I think you're wrong", "MIDDLE: vocative Latin"),
    ("I am very happy, Miku, you are so fun", "MIDDLE: vocative Latin 2"),
    ("well, Miku-chan, I think so", "MIDDLE: vocative + honorific"),
    ("しかし、みく、それは違う", "MIDDLE: vocative Japanese"),
    ("でも、ミクちゃん、聞いて", "MIDDLE: vocative JP + honorific"),
    ("но, мику, я думаю", "MIDDLE: vocative Cyrillic"),
    ("hey, miku, what do you think?", "MIDDLE: vocative casual"),
    ("you know, Miku, that's not right", "MIDDLE: vocative mid-sentence"),

    # ═══ ALONE pattern (name is the entire message) ═══
    ("Miku", "ALONE: bare Latin"),
    ("miku", "ALONE: lowercase"),
    ("MIKU", "ALONE: uppercase"),
    ("Miku!", "ALONE: + excl"),
    ("Miku?", "ALONE: + question"),
    ("Miku!!", "ALONE: + multi excl"),
    ("みく", "ALONE: Hiragana"),
    ("ミク!", "ALONE: Katakana + excl"),
    ("未来", "ALONE: Kanji"),
    ("мику", "ALONE: Cyrillic"),
    ("Miku-chan", "ALONE: Latin + honorific"),
    ("miku chan!", "ALONE: space honorific + excl"),
    ("ミクちゃん", "ALONE: JP honorific"),
    ("ミクさん!", "ALONE: JP honorific + excl"),
    ("みくせんせい", "ALONE: Hiragana + sensei"),
    ("o-miku!", "ALONE: o-prefix"),
    ("おみく", "ALONE: JP o-prefix"),
    ("オミク", "ALONE: Katakana o-prefix"),
    (" Miku ", "ALONE: whitespace"),
    ("Miku~", "ALONE: tilde"),
    ("Miku♪", "ALONE: music note"),
    ("Miku❤", "ALONE: heart"),
    ("мику-чан", "ALONE: Cyrillic + honorific"),
    ("мику сан", "ALONE: Cyrillic + space hon"),
    ("未来さん", "ALONE: Kanji + honorific"),
]

# Messages that must NOT trigger a response: (message, description)
_IGNORED_CASES = [
    # ═══ Should NOT match (mere mentions / not addressing) ═══
    ("I like Miku", "REJECT: object of sentence"),
    ("Miku is cool", "REJECT: subject + is"),
    ("Miku is my favorite vocaloid", "REJECT: subject + statement"),
    ("I saw Miku at a concert", "REJECT: middle of sentence"),
    ("told miku about it", "REJECT: informal mention"),
    ("hatsune miku concert", "REJECT: event name"),
    ("Do you know Miku?", "REJECT: asking about her"),
    ("I love Miku!", "REJECT: exclamation about her"),
    ("I love Miku so much", "REJECT: longer statement"),
    ("ミクは元気だよ", "REJECT: Japanese 'Miku is well'"),
    ("ミクが好き", "REJECT: Japanese 'I like Miku'"),
    ("ミクのことが好き", "REJECT: Japanese 'I like Miku (thing)'"),
    ("мику была там", "REJECT: Cyrillic 'Miku was there'"),
    ("мику такая красивая", "REJECT: Cyrillic 'Miku is pretty'"),
    ("the Miku concert was great", "REJECT: event discussion"),
    ("My favorite is Miku for sure", "REJECT: no comma before name at end"),
    ("yeah miku is pretty cool right", "REJECT: casual mention"),
    ("have you seen miku today", "REJECT: asking about her"),
    ("miku and I went shopping", "REJECT: subject of sentence"),
    ("I met miku yesterday", "REJECT: object mid-sentence"),
    ("mikumiku fan", "REJECT: compound word (\\b boundary)"),
    ("hatsune miku is singing", "REJECT: full name as subject"),

    # ═══ Edge cases ═══
    ("", "EDGE: empty message"),
    ("hello", "EDGE: no name at all"),
    ("hello!", "EDGE: exclamation, no name"),
    ("??", "EDGE: just punctuation"),
    (" ", "EDGE: just whitespace"),
    ("chan", "EDGE: just an honorific"),
    ("o-", "EDGE: just a prefix"),
]

# (message, expected, description) — addressed cases first, then ignored ones.
TESTS = (
    [(msg, True, desc) for msg, desc in _ADDRESSED_CASES]
    + [(msg, False, desc) for msg, desc in _IGNORED_CASES]
)
|
||||
|
||||
|
||||
def main():
    """Run every case in TESTS and print a report.

    Each failing case is printed with its expected and actual verdict and
    the patterns that matched; a pass/fail summary follows.

    Returns:
        0 when every case produced the expected result, otherwise 1
        (suitable as a process exit status).
    """
    print(f"Generated {len(all_v)} name variants")
    print(f"Running {len(TESTS)} test cases...\n")

    verdicts = ("SKIP", "ADDR")  # indexed by bool: False -> SKIP, True -> ADDR
    passed = failed = 0

    for msg, expected, desc in TESTS:
        result = is_addressed(msg)
        if result == expected:
            passed += 1
            continue

        failed += 1
        matched = which_pattern(msg)
        wanted = verdicts[bool(expected)]
        got = verdicts[bool(result)]
        print(f" FAIL expected={wanted} got={got} matched={matched}")
        print(f" {desc}")
        print(f" message: \"{msg}\"\n")

    rule = '=' * 50
    print(f"\n{rule}")
    print(f" {passed}/{len(TESTS)} passed, {failed} failed")
    print(rule)

    return 1 if failed else 0


if __name__ == '__main__':
    sys.exit(main())
|
||||
Reference in New Issue
Block a user