Rewrite is_miku_addressed() to only trigger when addressed, not mentioned
- Pre-compile 393 name variants into 4 regex patterns at module load (was 7,300+ raw re.search() calls per message)
- Strict addressing detection using punctuation context:
  - START: name at beginning + punctuation ("Miku, ..." / "みく!...")
  - END: comma + name at end ("..., Miku" / "...、ミク")
  - MIDDLE: commas on both sides — vocative ("..., Miku, ...")
  - ALONE: name is the entire message ("Miku!" / "ミクちゃん")
- Rejects mere mentions: "I like Miku" / "Miku is cool" no longer trigger
- Script-family-aware pattern generation (Latin, Cyrillic, Japanese) eliminates nonsensical cross-script combos (e.g. "o-みく")
- Word-boundary enforcement prevents substring matches ("mikumiku")
- Fixes regex "unbalanced parenthesis" errors from the old implementation
- Adds a comprehensive test suite (94 cases, all passing)
This commit is contained in:
@@ -1,167 +1,165 @@
|
||||
# utils/core.py
|
||||
#
|
||||
# Detects whether a Discord message is **addressed to** Miku
|
||||
# (as opposed to merely mentioning her).
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import re
|
||||
|
||||
import globals
|
||||
# Langchain imports below are only used in commented-out code
|
||||
# from langchain_community.vectorstores import FAISS
|
||||
# from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
||||
# from langchain_core.documents import Document
|
||||
from utils.logger import get_logger
|
||||
|
||||
# Module-wide logger for addressing diagnostics (provided by utils.logger).
logger = get_logger('core')
|
||||
|
||||
|
||||
# switch_model() removed - llama-swap handles model switching automatically
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
# Pre-compiled Miku addressing patterns
|
||||
# Built once at module load; is_miku_addressed() runs only 4 .search()
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _build_name_variants(bases, honorifics, prefixes, connector, prefix_connector):
|
||||
"""Return regex fragments for every name+honorific+prefix combo
|
||||
within a single script family."""
|
||||
variants = []
|
||||
for base in bases:
|
||||
be = re.escape(base)
|
||||
variants.append(be)
|
||||
for h in honorifics:
|
||||
he = re.escape(h)
|
||||
variants.append(be + connector + he)
|
||||
for p in prefixes:
|
||||
pe = re.escape(p)
|
||||
variants.append(pe + prefix_connector + be)
|
||||
for h in honorifics:
|
||||
he = re.escape(h)
|
||||
variants.append(pe + prefix_connector + be + connector + he)
|
||||
return variants
|
||||
|
||||
|
||||
def _compile_addressing_patterns():
    """Compile the four addressing regexes.

    START  – name at the beginning, followed by punctuation
             "Miku, how are you?"  "みく!聞いて"
    END    – comma then name at the end
             "how are you, Miku?"  "教えて、ミク"
    MIDDLE – name surrounded by commas (vocative)
             "On the contrary, Miku, I think…"
    ALONE  – name is the entire message
             "Miku"  "みく!"  "ミクちゃん"
    """
    # One spec per script family, so nonsensical cross-script combos
    # (e.g. "o-みく") are never generated.
    family_specs = [
        dict(
            bases=['miku'],
            honorifics=[
                'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
                'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei',
                'senpai', 'jou',
            ],
            prefixes=['o-'],
            connector=r'[\-\s]?',
            prefix_connector=r'\s?',
        ),
        dict(
            bases=['мику'],
            honorifics=[
                'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин',
                'хейка', 'хеика', 'денка', 'какка', 'си', 'чама', 'кюн',
                'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо',
            ],
            prefixes=['о-'],
            connector=r'[\-\s]?',
            prefix_connector=r'\s?',
        ),
        dict(
            bases=['みく', 'ミク', '未来'],
            honorifics=[
                # Hiragana
                'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん',
                'へいか', 'でんか', 'かっか', 'し', 'ちゃま', 'きゅん', 'どの',
                'せんせい', 'せんぱい', 'じょう',
                # Katakana
                'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン',
                'ヘイカ', 'デンカ', 'カッカ', 'シ', 'チャマ', 'キュン', 'ドノ',
                'センセイ', 'センパイ', 'ジョウ',
            ],
            prefixes=['お', 'オ'],
            connector=r'[-]?',
            prefix_connector=r'',
        ),
    ]

    variants = []
    for spec in family_specs:
        variants.extend(_build_name_variants(**spec))
    # Longest-first so the regex engine prefers the most specific match
    # (Timsort is stable, so equal-length variants keep family order).
    variants.sort(key=len, reverse=True)

    name = rf'\b(?:{"|".join(variants)})\b'
    punct = r'[,,、::!!??.。]'    # addressing punctuation after name
    comma = r'[,,、]'             # comma variants (before name / vocative)
    etrail = r'[!!??.。~~]*'      # optional trailing at end
    atrail = r'[!!??.。~~♪♡❤]*'   # optional trailing for name-only messages

    start_re = re.compile(rf'^\s*{name}\s*{punct}', re.IGNORECASE)
    end_re = re.compile(rf'{comma}\s*{name}\s*{etrail}\s*$', re.IGNORECASE)
    middle_re = re.compile(rf'{comma}\s*{name}\s*{comma}', re.IGNORECASE)
    alone_re = re.compile(rf'^\s*{name}\s*{atrail}\s*$', re.IGNORECASE)

    logger.info(f"Miku addressing: {len(variants)} name variants compiled into 4 patterns")
    return start_re, end_re, middle_re, alone_re
|
||||
|
||||
|
||||
# Compile the addressing patterns once at import time so that
# is_miku_addressed() only has to run four pre-compiled .search() calls
# per message.
try:
    _START_RE, _END_RE, _MIDDLE_RE, _ALONE_RE = _compile_addressing_patterns()
except Exception as e:
    # If compilation ever fails (e.g. a variant producing an invalid
    # regex), log the error and leave all four patterns as None so
    # is_miku_addressed() can detect the failure instead of the module
    # crashing at import time.
    logger.error(f"Failed to compile addressing patterns: {e}")
    _START_RE = _END_RE = _MIDDLE_RE = _ALONE_RE = None
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def is_miku_addressed(message) -> bool:
    """Return True only when the message is directed *at* Miku,
    not merely mentioning her.

    Always responds to: DMs, @mentions, replies to Miku's messages.

    For normal messages checks whether Miku's name (in any supported
    script / honorific combination) appears in an "addressing" position:
        • Start  – "Miku, how are you?"
        • End    – "how are you, Miku?"
        • Middle – "On the contrary, Miku, I think…"
        • Alone  – "Miku!" / "ミクちゃん"

    Does NOT trigger on mere mentions:
        • "I like Miku" / "Miku is cool" / "told miku about it"
    """
    # DMs – always respond
    if message.guild is None:
        return True

    # Safety check: ensure guild and guild.me exist
    if not message.guild or not message.guild.me:
        logger.warning(f"Invalid guild/guild.me for message from {message.author}")
        return False

    # @mention
    if message.guild.me in message.mentions:
        return True

    # Reply to Miku
    if message.reference:
        try:
            ref = await message.channel.fetch_message(message.reference.message_id)
            if ref.author == message.guild.me:
                return True
        except Exception as e:
            # Best-effort: an unfetchable reference simply falls through
            # to the name-pattern check below.
            logger.warning(f"Could not fetch referenced message: {e}")

    # Regex addressing (4 pre-compiled patterns)
    if _START_RE is None:
        logger.error("Addressing patterns not compiled – skipping pattern check")
        return False

    text = message.content.strip()
    return bool(
        _START_RE.search(text)
        or _END_RE.search(text)
        or _MIDDLE_RE.search(text)
        or _ALONE_RE.search(text)
    )


# Vectorstore functionality disabled - not needed with current structured context approach
# If you need embeddings in the future, you can use a different embedding provider
# For now, the bot uses structured prompts from context_manager.py

# def load_miku_knowledge():
#     with open("miku_lore.txt", "r", encoding="utf-8") as f:
#         text = f.read()
#
#     from langchain_text_splitters import RecursiveCharacterTextSplitter
#
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=520,
#         chunk_overlap=50,
#         separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
#     )
#
#     docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]
#
#     vectorstore = FAISS.from_documents(docs, embeddings)
#     return vectorstore
#
# def load_miku_lyrics():
#     with open("miku_lyrics.txt", "r", encoding="utf-8") as f:
#         lyrics_text = f.read()
#
#     text_splitter = CharacterTextSplitter(chunk_size=520, chunk_overlap=50)
#     docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(lyrics_text)]
#
#     vectorstore = FAISS.from_documents(docs, embeddings)
#     return vectorstore
#
# miku_vectorstore = load_miku_knowledge()
# miku_lyrics_vectorstore = load_miku_lyrics()
|
||||
|
||||
Reference in New Issue
Block a user