Fix: Optimize Twitter fetching to avoid Playwright hangs

- Replaced Playwright browser scraping with direct API media extraction
- Both fetch_miku_tweets() and fetch_figurine_tweets_latest() now use twscrape's built-in media info
- Reduced tweet fetching from 10-15 minutes to ~5 seconds
- Eliminated browser timeout/hanging issues
- Relaxed autonomous tweet sharing conditions:
  * Increased message threshold from 10 to 20 per hour
  * Reduced cooldown from 3600s to 2400s (40 minutes)
  * Increased energy threshold from 50% to 70%
  * Added 'silly' and 'flirty' moods to allowed sharing moods

This makes both figurine notifications and tweet sharing much more reliable and responsive.
2026-02-08 14:55:01 +02:00
parent b9d1f67d70
commit beb1a89000
2 changed files with 86 additions and 68 deletions
--- a/bot/utils/twitter_fetcher.py
+++ b/bot/utils/twitter_fetcher.py
@@ -20,7 +20,8 @@ COOKIE_PATH = Path(__file__).parent / "x.com.cookies.json"
 async def extract_media_urls(page, tweet_url):
    logger.debug(f"Visiting tweet page: {tweet_url}")
    try:
-        await page.goto(tweet_url, timeout=15000)
+        # Reduced timeout to 10s to prevent hanging
+        await page.goto(tweet_url, timeout=10000)
        await page.wait_for_timeout(1000)

        media_elements = await page.query_selector_all("img[src*='pbs.twimg.com/media']")
@@ -36,10 +37,14 @@ async def extract_media_urls(page, tweet_url):
        return list(urls)

    except Exception as e:
-        logger.error(f"Playwright error on {tweet_url}: {e}")
+        logger.warning(f"Playwright error on {tweet_url}: {e}")
        return []

 async def fetch_miku_tweets(limit=5):
+    """
+    Search for Miku tweets with images.
+    OPTIMIZED: Uses twscrape's built-in media info instead of Playwright.
+    """
    # Load cookies from JSON file
    with open(COOKIE_PATH, "r", encoding="utf-8") as f:
        cookie_list = json.load(f)
@@ -60,41 +65,44 @@ async def fetch_miku_tweets(limit=5):
    query = 'Hatsune Miku OR 初音ミク has:images after:2025'
    tweets = await gather(api.search(query, limit=limit, kv={"product": "Top"}))

-    logger.info(f"Found {len(tweets)} tweets, launching browser...")
-
-    async with async_playwright() as p:
-        browser = await p.firefox.launch(headless=True)
-        context = await browser.new_context()
-
-        await context.route("**/*", lambda route, request: (
-            route.abort() if any([
-                request.resource_type in ["font", "stylesheet"],
-                "analytics" in request.url,
-                "googletagmanager" in request.url,
-                "ads-twitter" in request.url,
-            ]) else route.continue_()
-        ))
-
-        page = await context.new_page()
-
-        results = []
-        for i, tweet in enumerate(tweets, 1):
-            username = tweet.user.username
-            tweet_url = f"https://twitter.com/{username}/status/{tweet.id}"
-            logger.debug(f"Processing tweet {i}/{len(tweets)} from @{username}")
-            media_urls = await extract_media_urls(page, tweet_url)
+    logger.info(f"Found {len(tweets)} tweets from API, extracting media...")

+    # Extract media directly from tweet objects (no browser needed!)
+    results = []
+    for tweet in tweets:
+        try:
+            # Check if tweet has media
+            if not hasattr(tweet, 'media') or not tweet.media:
+                continue
+                
+            # Extract media URLs from tweet object
+            media_urls = []
+            if hasattr(tweet.media, 'photos'):
+                for photo in tweet.media.photos:
+                    if hasattr(photo, 'url'):
+                        # Get the highest quality version
+                        media_url = photo.url
+                        if '?' in media_url:
+                            media_url = media_url.split('?')[0]
+                        media_url += '?name=large'
+                        media_urls.append(media_url)
+            
            if media_urls:
+                username = tweet.user.username
+                tweet_url = f"https://twitter.com/{username}/status/{tweet.id}"
                results.append({
                    "username": username,
                    "text": tweet.rawContent,
                    "url": tweet_url,
                    "media": media_urls
                })
+                logger.debug(f"Extracted {len(media_urls)} media URLs from @{username}'s tweet")
+        except Exception as e:
+            logger.warning(f"Error extracting media from tweet: {e}")
+            continue

-        await browser.close()
-        logger.info(f"Finished! Returning {len(results)} tweet(s) with media.")
-        return results
+    logger.info(f"Finished! Returning {len(results)} tweet(s) with media (no browser needed!)")
+    return results


 async def _search_latest(api: API, query: str, limit: int) -> list:
@@ -112,6 +120,9 @@ async def fetch_figurine_tweets_latest(limit_per_source: int = 10) -> list:
      - "miku figure from:mecchaJP"
      - "miku from:GoodSmile_US"
      - "miku from:OtakuOwletMerch"
+    
+    OPTIMIZED: Uses twscrape's built-in media info instead of Playwright browser scraping.
+    This is much faster and doesn't risk hanging.
    """
    # Load cookies
    with open(COOKIE_PATH, "r", encoding="utf-8") as f:
@@ -140,41 +151,44 @@ async def fetch_figurine_tweets_latest(limit_per_source: int = 10) -> list:
        tweets = await _search_latest(api, q, limit_per_source)
        all_tweets.extend(tweets)

-    logger.info(f"Found {len(all_tweets)} candidate tweets, launching browser to extract media...")
-
-    async with async_playwright() as p:
-        browser = await p.firefox.launch(headless=True)
-        context = await browser.new_context()
-        await context.route("**/*", lambda route, request: (
-            route.abort() if any([
-                request.resource_type in ["font", "stylesheet"],
-                "analytics" in request.url,
-                "googletagmanager" in request.url,
-                "ads-twitter" in request.url,
-            ]) else route.continue_()
-        ))
-
-        page = await context.new_page()
-        results = []
-        for i, tweet in enumerate(all_tweets, 1):
-            try:
+    logger.info(f"Found {len(all_tweets)} candidate tweets from API")
+    
+    # Extract media directly from tweet objects (much faster!)
+    results = []
+    for tweet in all_tweets:
+        try:
+            # Check if tweet has media
+            if not hasattr(tweet, 'media') or not tweet.media:
+                continue
+                
+            # Extract media URLs from tweet object
+            media_urls = []
+            if hasattr(tweet.media, 'photos'):
+                for photo in tweet.media.photos:
+                    if hasattr(photo, 'url'):
+                        # Get the highest quality version
+                        media_url = photo.url
+                        if '?' in media_url:
+                            media_url = media_url.split('?')[0]
+                        media_url += '?name=large'
+                        media_urls.append(media_url)
+            
+            if media_urls:
                username = tweet.user.username
                tweet_url = f"https://twitter.com/{username}/status/{tweet.id}"
-                logger.debug(f"Processing tweet {i}/{len(all_tweets)} from @{username}")
-                media_urls = await extract_media_urls(page, tweet_url)
-                if media_urls:
-                    results.append({
-                        "username": username,
-                        "text": tweet.rawContent,
-                        "url": tweet_url,
-                        "media": media_urls
-                    })
-            except Exception as e:
-                logger.error(f"Error processing tweet: {e}")
-
-        await browser.close()
-        logger.info(f"Figurine fetch finished. Returning {len(results)} tweet(s) with media.")
-        return results
+                results.append({
+                    "username": username,
+                    "text": tweet.rawContent,
+                    "url": tweet_url,
+                    "media": media_urls
+                })
+                logger.debug(f"Extracted {len(media_urls)} media URLs from @{username}'s tweet")
+        except Exception as e:
+            logger.warning(f"Error extracting media from tweet: {e}")
+            continue
+    
+    logger.info(f"Figurine fetch finished. Returning {len(results)} tweet(s) with media (no browser needed!)")
+    return results


 # Note: fetch_tweet_by_url was removed - now using twscrape-based approach in figurine_notifier.py