Untested Phase 1 (Foundation & Resource management) of voice chat integration

2026-01-16 13:01:08 +02:00
parent 353c9c9583
commit 911f11ee9f
9 changed files with 2288 additions and 0 deletions
--- a/bot/bot.py
+++ b/bot/bot.py
@@ -125,6 +125,19 @@ async def on_message(message):
    if message.author == globals.client.user:
        return
    # Check for voice commands first (!miku join, !miku leave, !miku voice-status)
    if not isinstance(message.channel, discord.DMChannel) and message.content.strip().lower().startswith('!miku '):
        from commands.voice import handle_voice_command
        parts = message.content.strip().split()
        if len(parts) >= 2:
            cmd = parts[1].lower()
            args = parts[2:] if len(parts) > 2 else []
            if cmd in ['join', 'leave', 'voice-status']:
                await handle_voice_command(message, cmd, args)
                return
    # Skip processing if a bipolar argument is in progress in this channel
    if not isinstance(message.channel, discord.DMChannel):
        from utils.bipolar_mode import is_argument_in_progress
@@ -196,6 +209,14 @@ async def on_message(message):
                logger.error(f"Failed to fetch replied message for context: {e}")
        async with message.channel.typing():
            # Check if vision model is blocked (voice session active)
            if message.attachments and globals.VISION_MODEL_BLOCKED:
                await message.channel.send(
                    "🎤 I can't look at images or videos right now, I'm talking in voice chat! "
                    "Send it again after I leave the voice channel."
                )
                return
            # If message has an image, video, or GIF attachment
            if message.attachments:
                for attachment in message.attachments:
@@ -504,6 +525,13 @@ async def on_message(message):
            if is_image_request and image_prompt:
                logger.info(f"🎨 Image generation request detected: '{image_prompt}' from {message.author.display_name}")
                # Block image generation during voice sessions
                if globals.IMAGE_GENERATION_BLOCKED:
                    await message.channel.send(globals.IMAGE_GENERATION_BLOCK_MESSAGE)
                    await message.add_reaction('🎤')
                    logger.info("🚫 Image generation blocked - voice session active")
                    return
                # Handle the image generation workflow
                success = await handle_image_generation_request(message, image_prompt)
                if success:
--- a/bot/commands/voice.py
+++ b/bot/commands/voice.py
@@ -0,0 +1,229 @@
 # voice.py
 """
 Voice channel commands for Miku Discord bot.
 Handles joining, leaving, and status commands for voice chat sessions.
 """
 import discord
 from utils.voice_manager import voice_manager
 from utils.logger import get_logger
 logger = get_logger('voice_commands')
 async def handle_voice_command(message, cmd, args):
    """
    Handle voice-related commands.
    Args:
        message: Discord message object
        cmd: Command name (join, leave, voice-status)
        args: Command arguments
    """
    if cmd == 'join':
        await _handle_join(message, args)
    elif cmd == 'leave':
        await _handle_leave(message)
    elif cmd == 'voice-status':
        await _handle_status(message)
    else:
        await message.channel.send(f"❌ Unknown voice command: `{cmd}`")
 async def _handle_join(message, args):
    """
    Handle the `!miku join` command.
    Picks the target voice channel (an explicit `<#id>` channel mention in the
    first argument, otherwise the author's current voice channel), checks that
    the bot may connect and speak there, then asks the voice manager to start
    a session that uses the invoking text channel for prompts.
    Args:
        message: Discord message object that carried the command
        args: Tokens that followed the command name
    """
    mentioned_channel = bool(args) and args[0].startswith('<#')
    if not mentioned_channel:
        # No mention given: follow the author into the channel they are in
        author_voice = message.author.voice
        if not (author_voice and author_voice.channel):
            await message.channel.send(
                "❌ You must be in a voice channel! "
                "Or mention a voice channel like `!miku join #voice-chat`"
            )
            return
        voice_channel = author_voice.channel
    else:
        # Explicit target, e.g. `!miku join #voice-chat` arrives as `<#1234>`
        try:
            target_id = int(args[0][2:-1])  # strip the leading `<#` and trailing `>`
            voice_channel = message.guild.get_channel(target_id)
            if not isinstance(voice_channel, discord.VoiceChannel):
                await message.channel.send("❌ That's not a voice channel!")
                return
        except (ValueError, AttributeError):
            await message.channel.send("❌ Invalid channel!")
            return
    # The bot needs both connect and speak rights in the target channel
    bot_member = message.guild.me
    if not voice_channel.permissions_for(bot_member).connect:
        await message.channel.send(f"❌ I don't have permission to join {voice_channel.mention}!")
        return
    if not voice_channel.permissions_for(bot_member).speak:
        await message.channel.send(f"❌ I don't have permission to speak in {voice_channel.mention}!")
        return
    try:
        await message.channel.send(f"🎤 Joining {voice_channel.mention}...")
        # The invoking text channel becomes the session's prompt channel
        await voice_manager.start_session(message.guild.id, voice_channel, message.channel)
        resource_notes = "\n".join([
            "• Text inference on AMD GPU only",
            "• Vision model disabled",
            "• Image generation disabled",
            "• Other text channels paused",
        ])
        miku_teal = discord.Color.from_rgb(134, 206, 203)
        embed = discord.Embed(
            title="🎤 Voice Chat Active",
            description=f"I've joined {voice_channel.mention}!",
            color=miku_teal
        )
        embed.add_field(
            name="How to use",
            value=f"Send messages in {message.channel.mention} to make me speak!",
            inline=False
        )
        embed.add_field(name="⚠️ Resource Mode", value=resource_notes, inline=False)
        embed.set_footer(text="Use !miku leave to end the session")
        await message.channel.send(embed=embed)
        logger.info(f"Voice session started by {message.author} in {voice_channel.name}")
    except Exception as err:
        # Report the failure to the user and keep the traceback in the log
        await message.channel.send(f"❌ Failed to join voice: {str(err)}")
        logger.error(f"Failed to start voice session: {err}", exc_info=True)
 async def _handle_leave(message):
    """
    Handle the `!miku leave` command.
    Ends the active voice session through the voice manager and confirms the
    released resources to the user. The command is refused when there is no
    session or when the session belongs to a different guild.
    Args:
        message: Discord message object that carried the command
    """
    session = voice_manager.active_session
    if not session:
        await message.channel.send("❌ I'm not in a voice channel!")
        return
    # Only the guild that owns the session may end it
    if session.guild_id != message.guild.id:
        await message.channel.send("❌ I'm in a voice channel in a different server!")
        return
    try:
        # Read the name first: end_session() clears the active session
        left_channel_name = session.voice_channel.name
        await message.channel.send("👋 Leaving voice channel...")
        await voice_manager.end_session()
        released_notes = "\n".join([
            "• Vision model available",
            "• Image generation available",
            "• Text channels resumed",
            "• All features restored",
        ])
        embed = discord.Embed(
            title="👋 Voice Chat Ended",
            description=f"Left {left_channel_name}",
            color=discord.Color.from_rgb(134, 206, 203)
        )
        embed.add_field(name="✅ Resources Released", value=released_notes, inline=False)
        await message.channel.send(embed=embed)
        logger.info(f"Voice session ended by {message.author}")
    except Exception as err:
        await message.channel.send(f"⚠️ Error leaving voice: {str(err)}")
        logger.error(f"Failed to end voice session: {err}", exc_info=True)
 async def _handle_status(message):
    """
    Handle the `!miku voice-status` command.
    Replies with an embed describing the current voice session, a hint on how
    to start one when none is active, or a short notice when the session
    belongs to a different guild.
    Args:
        message: Discord message object that carried the command
    """
    session = voice_manager.active_session
    if not session:
        idle_embed = discord.Embed(
            title="🔇 No Active Voice Session",
            description="I'm not currently in a voice channel.",
            color=discord.Color.greyple()
        )
        idle_embed.add_field(
            name="To start",
            value="Use `!miku join` while in a voice channel",
            inline=False
        )
        await message.channel.send(embed=idle_embed)
        return
    # Session details are only shown inside the guild that owns the session
    if session.guild_id != message.guild.id:
        await message.channel.send("ℹ️ I'm in a voice channel in a different server.")
        return
    allocation_notes = "\n".join([
        "**GPU Usage:**",
        "• AMD RX 6800: Text model + RVC",
        "• GTX 1660: Soprano TTS only",
        "",
        "**Blocked Features:**",
        "• ❌ Vision model",
        "• ❌ Image generation",
        "• ❌ Bipolar mode",
        "• ❌ Profile picture changes",
        "• ⏸️ Autonomous engine",
        "• ⏸️ Scheduled events",
        "• 📦 Other text channels (queued)",
    ])
    status_embed = discord.Embed(
        title="🎤 Voice Session Active",
        description="Currently in voice chat",
        color=discord.Color.from_rgb(134, 206, 203)
    )
    status_embed.add_field(name="Voice Channel", value=session.voice_channel.mention, inline=True)
    status_embed.add_field(name="Prompt Channel", value=session.text_channel.mention, inline=True)
    status_embed.add_field(name="📊 Resource Allocation", value=allocation_notes, inline=False)
    status_embed.set_footer(text="Use !miku leave to end the session")
    await message.channel.send(embed=status_embed)
--- a/bot/globals.py
+++ b/bot/globals.py
@@ -96,3 +96,12 @@ LAST_FULL_PROMPT = ""
 # Persona Dialogue System (conversations between Miku and Evil Miku)
 LAST_PERSONA_DIALOGUE_TIME = 0  # Timestamp of last dialogue for cooldown
 # Voice Chat Session State
 # Set to True by the voice manager when it pauses text channels and back to
 # False when it resumes them.
 VOICE_SESSION_ACTIVE = False
 TEXT_MESSAGE_QUEUE = []  # Queue for messages received during voice session
 # NOTE(review): the voice manager resets this list on pause and discards it on
 # resume; nothing in this change appends to it yet.
 # Feature Blocking Flags (set during voice session)
 # Checked by the message handler before it processes attachments.
 VISION_MODEL_BLOCKED = False
 # Checked by the message handler before it starts image generation.
 IMAGE_GENERATION_BLOCKED = False
 # Reply sent to the user while image generation is blocked; None when it is not.
 IMAGE_GENERATION_BLOCK_MESSAGE = None
--- a/bot/requirements.txt
+++ b/bot/requirements.txt
@@ -20,3 +20,5 @@ numpy
 scikit-learn
 transformers
 torch
 PyNaCl>=1.5.0
 websockets>=12.0
--- a/bot/utils/autonomous.py
+++ b/bot/utils/autonomous.py
@@ -17,12 +17,34 @@ logger = get_logger('autonomous')
 _last_action_execution = {}  # guild_id -> timestamp
 _MIN_ACTION_INTERVAL = 30  # Minimum 30 seconds between autonomous actions
 # Pause state for voice sessions
 _autonomous_paused = False
 def pause_autonomous_system() -> None:
    """
    Pause autonomous message generation (called during voice sessions).
    Sets the module-level `_autonomous_paused` flag; `autonomous_tick_v2`
    returns immediately while the flag is set.
    """
    global _autonomous_paused
    _autonomous_paused = True
    logger.info("Autonomous system paused")
 def resume_autonomous_system() -> None:
    """
    Resume autonomous message generation (called after voice sessions).
    Clears the module-level `_autonomous_paused` flag so that
    `autonomous_tick_v2` runs its normal checks again.
    """
    global _autonomous_paused
    _autonomous_paused = False
    logger.info("Autonomous system resumed")
 async def autonomous_tick_v2(guild_id: int):
    """
    New autonomous tick that uses context-aware decision making.
    Replaces the random 10% chance with intelligent decision.
    """
    # Check if autonomous is paused (voice session)
    if _autonomous_paused:
        logger.debug(f"[V2] Autonomous system paused (voice session active)")
        return
    # Rate limiting check
    now = time.time()
    if guild_id in _last_action_execution:
--- a/bot/utils/bipolar_mode.py
+++ b/bot/utils/bipolar_mode.py
@@ -28,6 +28,31 @@ MIN_EXCHANGES = 4  # Minimum number of back-and-forth exchanges before ending ca
 ARGUMENT_TRIGGER_CHANCE = 0.15  # 15% chance for the other Miku to break through
 DELAY_BETWEEN_MESSAGES = (2.0, 5.0)  # Random delay between argument messages (seconds)
 # Pause state for voice sessions
 _bipolar_interactions_paused = False
 # ============================================================================
 # VOICE SESSION PAUSE/RESUME
 # ============================================================================
 def pause_bipolar_interactions() -> None:
    """
    Pause all bipolar interactions (called during voice sessions).
    Sets the module-level `_bipolar_interactions_paused` flag, which
    `is_bipolar_paused()` reports to callers.
    """
    global _bipolar_interactions_paused
    _bipolar_interactions_paused = True
    logger.info("Bipolar interactions paused")
 def resume_bipolar_interactions() -> None:
    """
    Resume bipolar interactions (called after voice sessions).
    Clears the module-level `_bipolar_interactions_paused` flag.
    """
    global _bipolar_interactions_paused
    _bipolar_interactions_paused = False
    logger.info("Bipolar interactions resumed")
 def is_bipolar_paused() -> bool:
    """
    Check if bipolar interactions are currently paused.
    Returns:
        The current value of the module-level `_bipolar_interactions_paused`
        flag (True between a pause call and the next resume call).
    """
    return _bipolar_interactions_paused
 # ============================================================================
 # STATE PERSISTENCE
 # ============================================================================
@@ -1039,6 +1064,11 @@ async def maybe_trigger_argument(channel: discord.TextChannel, client, context:
    if not globals.BIPOLAR_MODE:
        return False
    # Check if bipolar interactions are paused (voice session)
    if is_bipolar_paused():
        logger.debug("Bipolar argument blocked (voice session active)")
        return False
    if is_argument_in_progress(channel.id):
        return False
--- a/bot/utils/profile_picture_manager.py
+++ b/bot/utils/profile_picture_manager.py
@@ -47,6 +47,17 @@ class ProfilePictureManager:
    def __init__(self):
        """Prepare the picture directories and start with switching unlocked."""
        self._ensure_directories()
        # Toggled by lock_switching() / unlock_switching()
        self.switching_locked = False  # Lock for voice session
    def lock_switching(self) -> None:
        """
        Lock profile picture changes during voice session.
        Sets `switching_locked` to True; it stays set until
        `unlock_switching()` is called.
        """
        self.switching_locked = True
        logger.info("Profile picture switching locked")
    def unlock_switching(self) -> None:
        """
        Unlock profile picture changes after voice session.
        Resets `switching_locked` to False.
        """
        self.switching_locked = False
        logger.info("Profile picture switching unlocked")
    def _ensure_directories(self):
        """Ensure profile picture directory exists"""
@@ -247,6 +258,16 @@ class ProfilePictureManager:
        Returns:
            Dict with status and metadata
        """
        # Check if switching is locked (voice session active)
        if self.switching_locked:
            logger.info("Profile picture change blocked (voice session active)")
            return {
                "success": False,
                "source": None,
                "error": "Profile picture switching locked during voice session",
                "metadata": {}
            }
        result = {
            "success": False,
            "source": None,
--- a/bot/utils/voice_manager.py
+++ b/bot/utils/voice_manager.py
@@ -0,0 +1,358 @@
 # voice_manager.py
 """
 Voice session manager for Miku Discord bot.
 Handles Discord voice channel connections, resource locking, and feature blocking during voice sessions.
 During a voice session:
 - GPU switches to AMD for text inference only
 - Vision model is blocked (keeps GTX 1660 for TTS)
 - Image generation is blocked
 - Bipolar mode interactions are disabled
 - Profile picture switching is locked
 - Autonomous engine is paused
 - Scheduled events are paused
 - Text channels are paused (messages queued)
 """
 import asyncio
 import json
 import os
 from typing import Optional
 import discord
 import globals
 from utils.logger import get_logger
 logger = get_logger('voice_manager')
 class VoiceSessionManager:
    """
    Singleton manager for voice chat sessions.
    Ensures only one voice session active at a time and manages all resource locks.

    Starting a session applies a fixed series of locks (GPU state file, vision
    model, image generation, text channels, bipolar mode, profile picture,
    autonomous engine, scheduler, figurine notifier). Ending a session, or
    cleaning up after a failed start, runs every release step independently so
    that one failing step cannot leave the remaining locks in place.
    """
    _instance = None

    def __new__(cls):
        # Always hand back the one shared instance, creating it on first use
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every VoiceSessionManager() call; set up only once
        if self._initialized:
            return
        self.active_session: Optional['VoiceSession'] = None
        self.session_lock = asyncio.Lock()
        # Snapshot of memory/gpu_state.json taken right before a session
        # overwrote it, so the previous GPU selection can be put back later.
        # _previous_gpu_state is None when the file did not exist beforehand.
        self._gpu_state_switched = False
        self._previous_gpu_state: Optional[str] = None
        self._initialized = True
        logger.info("VoiceSessionManager initialized")

    async def start_session(self, guild_id: int, voice_channel: discord.VoiceChannel, text_channel: discord.TextChannel):
        """
        Start a voice session with full resource locking.
        Args:
            guild_id: Discord guild ID
            voice_channel: Voice channel to join
            text_channel: Text channel for voice prompts
        Raises:
            Exception: If session already active or resources can't be locked
        """
        async with self.session_lock:
            # Checked before the try block so that rejecting a second session
            # never triggers the cleanup of the session that is already running
            if self.active_session:
                raise Exception("Voice session already active")
            logger.info(f"Starting voice session in {voice_channel.name} (guild {guild_id})")
            try:
                # 1. Switch to AMD GPU for text inference
                await self._switch_to_amd_gpu()
                # 2. Block vision model loading
                await self._block_vision_model()
                # 3. Disable image generation (ComfyUI)
                await self._disable_image_generation()
                # 4. Pause text channel inference (queue messages)
                await self._pause_text_channels()
                # 5. Disable bipolar mode interactions (Miku/Evil Miku arguments)
                await self._disable_bipolar_mode()
                # 6. Disable profile picture switching
                await self._disable_profile_picture_switching()
                # 7. Pause autonomous engine
                await self._pause_autonomous_engine()
                # 8. Pause scheduled events
                await self._pause_scheduled_events()
                # 9. Pause figurine notifier
                await self._pause_figurine_notifier()
                # 10. Create and connect voice session
                self.active_session = VoiceSession(guild_id, voice_channel, text_channel)
                # Note: Actual voice connection will be implemented in Phase 2
                logger.info("✓ Voice session started successfully")
            except Exception as e:
                logger.error(f"Failed to start voice session: {e}", exc_info=True)
                # Undo whatever locks were applied before the failure
                await self._cleanup_failed_start()
                raise

    async def end_session(self):
        """
        End voice session and release all resources.
        Every release step is attempted even if an earlier one fails, and the
        active session is always cleared.
        Raises:
            Exception: The first error raised by a release step, re-raised
                after all remaining steps have been attempted
        """
        async with self.session_lock:
            if not self.active_session:
                logger.warning("No active voice session to end")
                return
            logger.info("Ending voice session...")
            # 1. Disconnect from voice (Phase 2 implementation)
            # await self.active_session.disconnect()
            try:
                # 2. Release every lock taken by start_session
                failures = await self._release_all_resources()
            finally:
                # 3. Clear active session, even if releasing went wrong
                self.active_session = None
            if failures:
                failed_steps = ", ".join(name for name, _ in failures)
                logger.error(f"Error during session cleanup: {failed_steps}")
                # Surface the first failure to the caller, as before
                raise failures[0][1]
            logger.info("✓ Voice session ended successfully, all resources released")

    async def _release_all_resources(self):
        """
        Run every release step, continuing past individual failures.
        Returns:
            List of (step_name, exception) tuples for the steps that raised;
            empty when everything was released cleanly
        """
        release_steps = (
            self._resume_text_channels,
            self._unblock_vision_model,
            self._enable_image_generation,
            self._enable_bipolar_mode,
            self._enable_profile_picture_switching,
            self._resume_autonomous_engine,
            self._resume_scheduled_events,
            self._resume_figurine_notifier,
            self._restore_gpu_state,
        )
        failures = []
        for step in release_steps:
            try:
                await step()
            except Exception as e:
                # Keep going: the remaining locks still have to be released
                logger.error(f"Release step {step.__name__} failed: {e}", exc_info=True)
                failures.append((step.__name__, e))
        return failures

    # ==================== Resource Locking Methods ====================
    async def _switch_to_amd_gpu(self):
        """
        Switch text inference to AMD GPU (RX 6800).
        Writes the voice-session marker to memory/gpu_state.json after taking
        a snapshot of the previous file content for _restore_gpu_state().
        Raises:
            Exception: If the state file cannot be written
        """
        try:
            gpu_state_file = os.path.join("memory", "gpu_state.json")
            os.makedirs("memory", exist_ok=True)
            # Remember what was selected before, so it can be restored later
            restorable = True
            previous_state = None
            try:
                with open(gpu_state_file, "r", encoding="utf-8") as f:
                    previous_state = f.read()
            except FileNotFoundError:
                previous_state = None  # nothing selected yet; restore removes the file
            except (OSError, UnicodeDecodeError) as e:
                # An unreadable old state must not block the session; it just
                # cannot be restored afterwards
                restorable = False
                logger.warning(f"Could not snapshot previous GPU state: {e}")
            with open(gpu_state_file, "w") as f:
                json.dump({"current_gpu": "amd", "reason": "voice_session"}, f)
            self._previous_gpu_state = previous_state
            self._gpu_state_switched = restorable
            logger.info("✓ Switched to AMD GPU for text inference")
        except Exception as e:
            logger.error(f"Failed to switch GPU: {e}")
            raise

    async def _restore_gpu_state(self):
        """
        Put memory/gpu_state.json back to what it was before the session.
        Does nothing when no snapshot was taken, or when the file no longer
        carries the voice-session marker (it was changed during the session,
        and that newer choice is kept).
        """
        if not self._gpu_state_switched:
            return
        previous_state = self._previous_gpu_state
        # Reset the bookkeeping first so a failed restore is not retried with
        # a stale snapshot on the next session
        self._gpu_state_switched = False
        self._previous_gpu_state = None
        gpu_state_file = os.path.join("memory", "gpu_state.json")
        try:
            with open(gpu_state_file, "r", encoding="utf-8") as f:
                current_state = json.load(f)
        except (OSError, ValueError):
            current_state = None
        if not isinstance(current_state, dict) or current_state.get("reason") != "voice_session":
            logger.info("GPU state changed during voice session, leaving it untouched")
            return
        if previous_state is None:
            # The file did not exist before the session, so remove the marker
            os.remove(gpu_state_file)
        else:
            with open(gpu_state_file, "w", encoding="utf-8") as f:
                f.write(previous_state)
        logger.info("✓ GPU state restored")

    async def _block_vision_model(self):
        """Prevent vision model from loading during voice session"""
        globals.VISION_MODEL_BLOCKED = True
        logger.info("✓ Vision model blocked")

    async def _unblock_vision_model(self):
        """Allow vision model to load after voice session"""
        globals.VISION_MODEL_BLOCKED = False
        logger.info("✓ Vision model unblocked")

    async def _disable_image_generation(self):
        """Block ComfyUI image generation during voice session"""
        globals.IMAGE_GENERATION_BLOCKED = True
        # Reply text used while image generation is blocked
        globals.IMAGE_GENERATION_BLOCK_MESSAGE = (
            "🎤 I can't draw right now, I'm talking in voice chat! "
            "Ask me again after I leave the voice channel."
        )
        logger.info("✓ Image generation disabled")

    async def _enable_image_generation(self):
        """Re-enable image generation after voice session"""
        globals.IMAGE_GENERATION_BLOCKED = False
        globals.IMAGE_GENERATION_BLOCK_MESSAGE = None
        logger.info("✓ Image generation re-enabled")

    async def _pause_text_channels(self):
        """Queue text messages instead of processing during voice session"""
        globals.VOICE_SESSION_ACTIVE = True
        globals.TEXT_MESSAGE_QUEUE = []
        logger.info("✓ Text channels paused (messages will be queued)")

    async def _resume_text_channels(self):
        """Process queued messages after voice session"""
        globals.VOICE_SESSION_ACTIVE = False
        queued_count = len(globals.TEXT_MESSAGE_QUEUE)
        if queued_count > 0:
            logger.info(f"Resuming text channels, {queued_count} messages queued")
            # TODO: Process queue in Phase 2 (need message handler integration)
            # For now, just clear the queue
            globals.TEXT_MESSAGE_QUEUE = []
            logger.warning(f"Discarded {queued_count} queued messages (queue processing not yet implemented)")
        else:
            logger.info("✓ Text channels resumed (no queued messages)")

    async def _disable_bipolar_mode(self):
        """Prevent Miku/Evil Miku arguments during voice session"""
        try:
            # Imported lazily; a missing module or function is skipped
            from utils.bipolar_mode import pause_bipolar_interactions
            pause_bipolar_interactions()
            logger.info("✓ Bipolar mode interactions disabled")
        except ImportError:
            logger.warning("bipolar_mode module not found, skipping")
        except AttributeError:
            logger.warning("pause_bipolar_interactions not implemented yet, skipping")

    async def _enable_bipolar_mode(self):
        """Re-enable Miku/Evil Miku arguments after voice session"""
        try:
            from utils.bipolar_mode import resume_bipolar_interactions
            resume_bipolar_interactions()
            logger.info("✓ Bipolar mode interactions re-enabled")
        except ImportError:
            logger.warning("bipolar_mode module not found, skipping")
        except AttributeError:
            logger.warning("resume_bipolar_interactions not implemented yet, skipping")

    async def _disable_profile_picture_switching(self):
        """Lock profile picture during voice session"""
        try:
            from utils.profile_picture_manager import profile_picture_manager
            if hasattr(profile_picture_manager, 'lock_switching'):
                profile_picture_manager.lock_switching()
                logger.info("✓ Profile picture switching disabled")
            else:
                logger.warning("profile_picture_manager.lock_switching not implemented yet, skipping")
        except ImportError:
            logger.warning("profile_picture_manager module not found, skipping")

    async def _enable_profile_picture_switching(self):
        """Unlock profile picture after voice session"""
        try:
            from utils.profile_picture_manager import profile_picture_manager
            if hasattr(profile_picture_manager, 'unlock_switching'):
                profile_picture_manager.unlock_switching()
                logger.info("✓ Profile picture switching re-enabled")
            else:
                logger.warning("profile_picture_manager.unlock_switching not implemented yet, skipping")
        except ImportError:
            logger.warning("profile_picture_manager module not found, skipping")

    async def _pause_autonomous_engine(self):
        """Pause autonomous message generation during voice session"""
        try:
            from utils.autonomous import pause_autonomous_system
            pause_autonomous_system()
            logger.info("✓ Autonomous engine paused")
        except ImportError:
            logger.warning("autonomous module not found, skipping")
        except AttributeError:
            logger.warning("pause_autonomous_system not implemented yet, skipping")

    async def _resume_autonomous_engine(self):
        """Resume autonomous message generation after voice session"""
        try:
            from utils.autonomous import resume_autonomous_system
            resume_autonomous_system()
            logger.info("✓ Autonomous engine resumed")
        except ImportError:
            logger.warning("autonomous module not found, skipping")
        except AttributeError:
            logger.warning("resume_autonomous_system not implemented yet, skipping")

    async def _pause_scheduled_events(self):
        """Pause all scheduled jobs during voice session"""
        try:
            globals.scheduler.pause()
            logger.info("✓ Scheduled events paused")
        except Exception as e:
            # Best effort: a scheduler problem must not prevent the session
            logger.error(f"Failed to pause scheduler: {e}")

    async def _resume_scheduled_events(self):
        """Resume scheduled jobs after voice session"""
        try:
            globals.scheduler.resume()
            logger.info("✓ Scheduled events resumed")
        except Exception as e:
            logger.error(f"Failed to resume scheduler: {e}")

    async def _pause_figurine_notifier(self):
        """Pause figurine notifications during voice session"""
        try:
            # Assuming figurine notifier is a scheduled job
            globals.scheduler.pause_job('figurine_notifier')
            logger.info("✓ Figurine notifier paused")
        except Exception as e:
            # Job might not exist, that's okay
            logger.debug(f"Could not pause figurine notifier (may not exist): {e}")

    async def _resume_figurine_notifier(self):
        """Resume figurine notifications after voice session"""
        try:
            globals.scheduler.resume_job('figurine_notifier')
            logger.info("✓ Figurine notifier resumed")
        except Exception as e:
            # Job might not exist, that's okay
            logger.debug(f"Could not resume figurine notifier (may not exist): {e}")

    async def _cleanup_failed_start(self):
        """
        Cleanup resources if session start fails.
        Never raises: individual failures are logged and the original start
        error stays the one reported to the caller.
        """
        logger.warning("Cleaning up after failed session start...")
        failures = await self._release_all_resources()
        if failures:
            failed_steps = ", ".join(name for name, _ in failures)
            logger.error(f"Error during cleanup: {failed_steps}")
 class VoiceSession:
    """
    Represents an active voice chat session.
    Phase 1: Basic structure only, voice connection in Phase 2.
    Holds the guild ID, the voice channel and the text channel the session was
    created with; no connection is made here.
    """
    def __init__(self, guild_id: int, voice_channel: discord.VoiceChannel, text_channel: discord.TextChannel) -> None:
        """
        Record the session's identifiers without connecting to voice.
        Args:
            guild_id: Discord guild ID the session belongs to
            voice_channel: Voice channel the session targets
            text_channel: Text channel associated with the session
        """
        self.guild_id = guild_id
        self.voice_channel = voice_channel
        self.text_channel = text_channel
        # Not assigned anywhere in Phase 1; stays None
        self.voice_client: Optional[discord.VoiceClient] = None
        # Initialised to False and not changed by any Phase 1 code shown here
        self.active = False
        logger.info(f"VoiceSession created for {voice_channel.name} in guild {guild_id}")
    # Phase 2: Implement voice connection, audio streaming, TTS integration
 # Global singleton instance
 # Created at import time; VoiceSessionManager.__new__ returns this same object
 # for any later VoiceSessionManager() call.
 voice_manager = VoiceSessionManager()
--- a/readmes/VOICE_CHAT_IMPLEMENTATION_PLAN.md
+++ b/readmes/VOICE_CHAT_IMPLEMENTATION_PLAN.md