Files
miku-discord/bot/utils/core.py

167 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# utils/core.py
import asyncio
import aiohttp
import re
import globals
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from utils.logger import get_logger
logger = get_logger('core')
# switch_model() removed - llama-swap handles model switching automatically
# Base names for Miku in different scripts.
_MIKU_BASE_NAMES = (
    'miku', 'мику', 'みく', 'ミク', '未来',
)

# Japanese honorifics - all scripts combined.
# NOTE(review): the empty-string entries in the Hiragana/Katakana rows are kept
# exactly as found; they sit where the kana for "shi" would be expected and may
# be characters lost in transcription - confirm against the intended list.
_MIKU_HONORIFICS = (
    # Latin
    'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
    'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei', 'senpai', 'jou',
    # Hiragana
    'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん', 'へいか',
    'でんか', 'かっか', '', 'ちゃま', 'きゅん', 'どの', 'せんせい', 'せんぱい', 'じょう',
    # Katakana
    'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン', 'ヘイカ',
    'デンカ', 'カッカ', '', 'チャマ', 'キュン', 'ドノ', 'センセイ', 'センパイ', 'ジョウ',
    # Cyrillic
    'чан', 'сан', 'кун', 'нян', 'химе', 'тан', 'чин', 'хейка', 'хеика',
    'денка', 'какка', 'си', 'чама', 'кюн', 'доно', 'сенсэй', 'сенсеи', 'сенпай', 'сенпаи', 'джо',
)

# o- prefix variants.
# NOTE(review): the two empty entries are preserved as found; they may have
# been kana prefixes originally - confirm.
_MIKU_O_PREFIXES = ('o-', 'о-', '', '')

# (text before the name, text after the name) for each way Miku can be
# "addressed" rather than merely mentioned.
# NOTE(review): the punctuation classes contain ',' twice; one of them was
# possibly a full-width comma before the file was normalized - confirm.
_MIKU_ADDRESS_FRAMES = (
    # Start of message + punctuation/end: "Miku, ..." or "ミクちゃん、..."
    (r'^', r'(?:[,,、!?.。\s]+|$)'),
    # End of message (optionally preceded by punctuation): "..., Miku"
    (r'(?:[,,、!?.。\s]+|^)', r'[!?.。\s]*$'),
    # Middle (surrounded by punctuation): "..., Miku, ..."
    (r'[,,、!?.。\s]+', r'[,,、!?.。\s]+'),
    # Just the name alone: "Miku" or "ミクちゃん"
    (r'^\s*', r'[!?.。]*\s*$'),
)


def _literal_alternation(options):
    """Return a non-capturing group matching any of *options* literally."""
    # dict.fromkeys drops duplicates while keeping the first-seen order.
    unique = dict.fromkeys(option.lower() for option in options)
    return '(?:' + '|'.join(re.escape(option) for option in unique) + ')'


def _build_address_regexes():
    """Compile one regex per address frame, covering every name variant.

    A name variant is: optional o-prefix (+ optional whitespace), a base name,
    and an optional honorific separated by any run of dashes/whitespace.
    Expressing the variants as alternations matches exactly the same strings
    as enumerating every prefix/base/honorific combination separately, but
    needs only four compiled patterns in total.
    """
    name = (
        '(?:' + _literal_alternation(_MIKU_O_PREFIXES) + r'\s*)?'
        + _literal_alternation(_MIKU_BASE_NAMES)
        + r'(?:[\-\s]*' + _literal_alternation(_MIKU_HONORIFICS) + ')?'
    )
    return tuple(
        re.compile(before + name + after, re.IGNORECASE)
        for before, after in _MIKU_ADDRESS_FRAMES
    )


# Built once at import time. Every variable fragment goes through re.escape,
# so compilation cannot fail on the name data. Rebuilding thousands of
# patterns per message would overflow the re module's pattern cache and stall
# the event loop.
_MIKU_ADDRESS_REGEXES = _build_address_regexes()


async def is_miku_addressed(message) -> bool:
    """Decide whether *message* is directed at Miku.

    The checks run in this order and the first hit wins:

    1. Direct messages (no guild) always count as addressed.
    2. The bot member (``message.guild.me``) is among ``message.mentions``.
    3. The message is a reply to a message authored by the bot; the referenced
       message is fetched from the channel, and a failed fetch is logged and
       ignored.
    4. The text addresses Miku by name: a base name with optional o-prefix and
       optional honorific at the start, at the end, set off by
       punctuation/whitespace in the middle, or as the whole message.

    Args:
        message: The incoming message object; expected to expose ``guild``,
            ``mentions``, ``reference``, ``channel``, ``content`` and
            ``author``.

    Returns:
        True if Miku should respond to the message, False otherwise.
    """
    # Check if this is a DM (no guild): in DMs, always respond.
    if message.guild is None:
        return True

    # Safety check: ensure guild and guild.me exist.
    if not message.guild or not message.guild.me:
        logger.warning(f"Invalid guild or guild.me in message from {message.author}")
        return False

    # If message contains a ping for Miku, return true.
    if message.guild.me in message.mentions:
        return True

    # If message is a reply, check the referenced message author.
    if message.reference:
        try:
            referenced_msg = await message.channel.fetch_message(message.reference.message_id)
            if referenced_msg.author == message.guild.me:
                return True
        except Exception as e:
            # Best effort: a missing/inaccessible referenced message must not
            # prevent the name-based check below.
            logger.warning(f"Could not fetch referenced message: {e}")

    cleaned_lower = message.content.strip().lower()

    # She must be "addressed", not just mentioned.
    return any(regex.search(cleaned_lower) for regex in _MIKU_ADDRESS_REGEXES)
# Vectorstore functionality disabled - not needed with current structured context approach
# If you need embeddings in the future, you can use a different embedding provider
# For now, the bot uses structured prompts from context_manager.py
# def load_miku_knowledge():
# with open("miku_lore.txt", "r", encoding="utf-8") as f:
# text = f.read()
#
# from langchain_text_splitters import RecursiveCharacterTextSplitter
#
# text_splitter = RecursiveCharacterTextSplitter(
# chunk_size=520,
# chunk_overlap=50,
# separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
# )
#
# docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]
#
# vectorstore = FAISS.from_documents(docs, embeddings)
# return vectorstore
#
# def load_miku_lyrics():
# with open("miku_lyrics.txt", "r", encoding="utf-8") as f:
# lyrics_text = f.read()
#
# text_splitter = CharacterTextSplitter(chunk_size=520, chunk_overlap=50)
# docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(lyrics_text)]
#
# vectorstore = FAISS.from_documents(docs, embeddings)
# return vectorstore
#
# miku_vectorstore = load_miku_knowledge()
# miku_lyrics_vectorstore = load_miku_lyrics()