From beb1a8900023af08f53f5d541c2ee89b80b962fc Mon Sep 17 00:00:00 2001 From: koko210Serve Date: Sun, 8 Feb 2026 14:55:01 +0200 Subject: [PATCH] Fix: Optimize Twitter fetching to avoid Playwright hangs - Replaced Playwright browser scraping with direct API media extraction - Both fetch_miku_tweets() and fetch_figurine_tweets_latest() now use twscrape's built-in media info - Reduced tweet fetching from 10-15 minutes to ~5 seconds - Eliminated browser timeout/hanging issues - Relaxed autonomous tweet sharing conditions: * Increased message threshold from 10 to 20 per hour * Reduced cooldown from 3600s to 2400s (40 minutes) * Increased energy threshold from 50% to 70% * Added 'silly' and 'flirty' moods to allowed sharing moods This makes both figurine notifications and tweet sharing much more reliable and responsive. --- bot/utils/autonomous_engine.py | 18 +++-- bot/utils/twitter_fetcher.py | 136 ++++++++++++++++++--------------- 2 files changed, 86 insertions(+), 68 deletions(-) diff --git a/bot/utils/autonomous_engine.py b/bot/utils/autonomous_engine.py index c4789ba..4cc988b 100644 --- a/bot/utils/autonomous_engine.py +++ b/bot/utils/autonomous_engine.py @@ -408,19 +408,23 @@ class AutonomousEngine: def _should_share_content(self, ctx: ContextSignals, profile: dict, debug: bool = False) -> bool: """Decide if Miku should share a tweet/content""" - # Quiet period + curious/excited mood - quiet_check = ctx.messages_last_hour < 10 - cooldown_check = ctx.time_since_last_action > 3600 + # RELAXED CONDITIONS: Made tweet sharing more frequent + # Old: quiet_check required < 10 messages, now < 20 + # Old: cooldown was 3600s (1 hour), now 2400s (40 minutes) + # Old: energy threshold was 50%, now 70% + quiet_check = ctx.messages_last_hour < 20 # Increased from 10 + cooldown_check = ctx.time_since_last_action > 2400 # Reduced from 3600 energy_roll = random.random() - energy_threshold = profile["energy"] * 0.5 + energy_threshold = profile["energy"] * 0.7 # Increased from 0.5 energy_ok = energy_roll < energy_threshold - mood_ok = ctx.current_mood in ["curious", "excited", "bubbly", "neutral"] + # Added more moods that can share content + mood_ok = ctx.current_mood in ["curious", "excited", "bubbly", "neutral", "silly", "flirty"] result = quiet_check and cooldown_check and energy_ok and mood_ok if debug: - logger.debug(f" [Share] msgs_last_hour={ctx.messages_last_hour} < 10? {quiet_check}") - logger.debug(f" [Share] cooldown={ctx.time_since_last_action:.0f}s > 3600s? {cooldown_check}") + logger.debug(f" [Share] msgs_last_hour={ctx.messages_last_hour} < 20? {quiet_check}") + logger.debug(f" [Share] cooldown={ctx.time_since_last_action:.0f}s > 2400s? {cooldown_check}") logger.debug(f" [Share] energy roll={energy_roll:.2f} < {energy_threshold:.2f}? {energy_ok}") logger.debug(f" [Share] mood '{ctx.current_mood}' appropriate? {mood_ok} | Result: {result}") diff --git a/bot/utils/twitter_fetcher.py b/bot/utils/twitter_fetcher.py index 00c635e..e732bdc 100644 --- a/bot/utils/twitter_fetcher.py +++ b/bot/utils/twitter_fetcher.py @@ -20,7 +20,8 @@ COOKIE_PATH = Path(__file__).parent / "x.com.cookies.json" async def extract_media_urls(page, tweet_url): logger.debug(f"Visiting tweet page: {tweet_url}") try: - await page.goto(tweet_url, timeout=15000) + # Reduced timeout to 10s to prevent hanging + await page.goto(tweet_url, timeout=10000) await page.wait_for_timeout(1000) media_elements = await page.query_selector_all("img[src*='pbs.twimg.com/media']") @@ -36,10 +37,14 @@ async def extract_media_urls(page, tweet_url): return list(urls) except Exception as e: - logger.error(f"Playwright error on {tweet_url}: {e}") + logger.warning(f"Playwright error on {tweet_url}: {e}") return [] async def fetch_miku_tweets(limit=5): + """ + Search for Miku tweets with images. + OPTIMIZED: Uses twscrape's built-in media info instead of Playwright. + """ # Load cookies from JSON file with open(COOKIE_PATH, "r", encoding="utf-8") as f: cookie_list = json.load(f) @@ -60,41 +65,44 @@ async def fetch_miku_tweets(limit=5): query = 'Hatsune Miku OR 初音ミク has:images after:2025' tweets = await gather(api.search(query, limit=limit, kv={"product": "Top"})) - logger.info(f"Found {len(tweets)} tweets, launching browser...") - - async with async_playwright() as p: - browser = await p.firefox.launch(headless=True) - context = await browser.new_context() - - await context.route("**/*", lambda route, request: ( - route.abort() if any([ - request.resource_type in ["font", "stylesheet"], - "analytics" in request.url, - "googletagmanager" in request.url, - "ads-twitter" in request.url, - ]) else route.continue_() - )) - - page = await context.new_page() - - results = [] - for i, tweet in enumerate(tweets, 1): - username = tweet.user.username - tweet_url = f"https://twitter.com/{username}/status/{tweet.id}" - logger.debug(f"Processing tweet {i}/{len(tweets)} from @{username}") - media_urls = await extract_media_urls(page, tweet_url) + logger.info(f"Found {len(tweets)} tweets from API, extracting media...") + # Extract media directly from tweet objects (no browser needed!) + results = [] + for tweet in tweets: + try: + # Check if tweet has media + if not hasattr(tweet, 'media') or not tweet.media: + continue + + # Extract media URLs from tweet object + media_urls = [] + if hasattr(tweet.media, 'photos'): + for photo in tweet.media.photos: + if hasattr(photo, 'url'): + # Get the highest quality version + media_url = photo.url + if '?' in media_url: + media_url = media_url.split('?')[0] + media_url += '?name=large' + media_urls.append(media_url) + if media_urls: + username = tweet.user.username + tweet_url = f"https://twitter.com/{username}/status/{tweet.id}" results.append({ "username": username, "text": tweet.rawContent, "url": tweet_url, "media": media_urls }) + logger.debug(f"Extracted {len(media_urls)} media URLs from @{username}'s tweet") + except Exception as e: + logger.warning(f"Error extracting media from tweet: {e}") + continue - await browser.close() - logger.info(f"Finished! Returning {len(results)} tweet(s) with media.") - return results + logger.info(f"Finished! Returning {len(results)} tweet(s) with media (no browser needed!)") + return results async def _search_latest(api: API, query: str, limit: int) -> list: @@ -112,6 +120,9 @@ async def fetch_figurine_tweets_latest(limit_per_source: int = 10) -> list: - "miku figure from:mecchaJP" - "miku from:GoodSmile_US" - "miku from:OtakuOwletMerch" + + OPTIMIZED: Uses twscrape's built-in media info instead of Playwright browser scraping. + This is much faster and doesn't risk hanging. """ # Load cookies with open(COOKIE_PATH, "r", encoding="utf-8") as f: @@ -140,41 +151,44 @@ async def fetch_figurine_tweets_latest(limit_per_source: int = 10) -> list: tweets = await _search_latest(api, q, limit_per_source) all_tweets.extend(tweets) - logger.info(f"Found {len(all_tweets)} candidate tweets, launching browser to extract media...") - - async with async_playwright() as p: - browser = await p.firefox.launch(headless=True) - context = await browser.new_context() - await context.route("**/*", lambda route, request: ( - route.abort() if any([ - request.resource_type in ["font", "stylesheet"], - "analytics" in request.url, - "googletagmanager" in request.url, - "ads-twitter" in request.url, - ]) else route.continue_() - )) - - page = await context.new_page() - results = [] - for i, tweet in enumerate(all_tweets, 1): - try: + logger.info(f"Found {len(all_tweets)} candidate tweets from API") + + # Extract media directly from tweet objects (much faster!) + results = [] + for tweet in all_tweets: + try: + # Check if tweet has media + if not hasattr(tweet, 'media') or not tweet.media: + continue + + # Extract media URLs from tweet object + media_urls = [] + if hasattr(tweet.media, 'photos'): + for photo in tweet.media.photos: + if hasattr(photo, 'url'): + # Get the highest quality version + media_url = photo.url + if '?' in media_url: + media_url = media_url.split('?')[0] + media_url += '?name=large' + media_urls.append(media_url) + + if media_urls: username = tweet.user.username tweet_url = f"https://twitter.com/{username}/status/{tweet.id}" - logger.debug(f"Processing tweet {i}/{len(all_tweets)} from @{username}") - media_urls = await extract_media_urls(page, tweet_url) - if media_urls: - results.append({ - "username": username, - "text": tweet.rawContent, - "url": tweet_url, - "media": media_urls - }) - except Exception as e: - logger.error(f"Error processing tweet: {e}") - - await browser.close() - logger.info(f"Figurine fetch finished. Returning {len(results)} tweet(s) with media.") - return results + results.append({ + "username": username, + "text": tweet.rawContent, + "url": tweet_url, + "media": media_urls + }) + logger.debug(f"Extracted {len(media_urls)} media URLs from @{username}'s tweet") + except Exception as e: + logger.warning(f"Error extracting media from tweet: {e}") + continue + + logger.info(f"Figurine fetch finished. Returning {len(results)} tweet(s) with media (no browser needed!)") + return results # Note: fetch_tweet_by_url was removed - now using twscrape-based approach in figurine_notifier.py