# utils/twitter_fetcher.py
"""Fetch Hatsune Miku tweets (with images) from X/Twitter via twscrape.

Authentication is cookie-based: a browser-exported ``x.com.cookies.json``
file sitting next to this module is turned into a Cookie header and fed to
a single twscrape account. Media URLs are taken straight from the tweet
objects twscrape returns, so no Playwright browser is needed for the
search paths (``extract_media_urls`` is kept for callers that already hold
a Playwright page).
"""

import json
from pathlib import Path

# Apply twscrape fix BEFORE importing twscrape
from utils.twscrape_fix import apply_twscrape_fix

apply_twscrape_fix()

from twscrape import API, gather  # noqa: E402  (must follow apply_twscrape_fix)

from utils.logger import get_logger

logger = get_logger('media')

# Browser-exported cookie dump for x.com, expected next to this file.
COOKIE_PATH = Path(__file__).parent / "x.com.cookies.json"


def _load_cookie_header() -> str:
    """Read COOKIE_PATH (a JSON list of cookie dicts) and build a Cookie header.

    Raises:
        FileNotFoundError / json.JSONDecodeError if the cookie export is
        missing or malformed — callers are expected to let that surface.
    """
    with open(COOKIE_PATH, "r", encoding="utf-8") as f:
        cookie_list = json.load(f)
    return "; ".join(f"{c['name']}={c['value']}" for c in cookie_list)


async def _make_api() -> API:
    """Create a twscrape API with the single cookie-authenticated account logged in."""
    api = API()
    await api.pool.add_account(
        username="HSankyuu39",
        password="x",  # placeholder (won't be used — auth comes from cookies)
        email="x",  # optional
        email_password="x",  # optional
        cookies=_load_cookie_header(),
    )
    await api.pool.login_all()
    return api


def _photo_urls(tweet) -> list:
    """Return large-size photo URLs from a twscrape tweet object.

    Defensive ``hasattr`` checks are kept because twscrape media objects
    vary by tweet type; tweets without photos yield an empty list.
    """
    media_urls = []
    if not hasattr(tweet, 'media') or not tweet.media:
        return media_urls
    if hasattr(tweet.media, 'photos'):
        for photo in tweet.media.photos:
            if hasattr(photo, 'url'):
                # Strip any existing query string, then request the
                # highest-quality rendition.
                media_url = photo.url
                if '?' in media_url:
                    media_url = media_url.split('?')[0]
                media_url += '?name=large'
                media_urls.append(media_url)
    return media_urls


def _tweets_to_results(tweets) -> list:
    """Convert twscrape tweets into result dicts, keeping only those with photos.

    Each result dict has keys: username, text, url, media (list of URLs).
    Per-tweet failures are logged and skipped so one bad tweet cannot sink
    the whole batch.
    """
    results = []
    for tweet in tweets:
        try:
            media_urls = _photo_urls(tweet)
            if not media_urls:
                continue
            username = tweet.user.username
            tweet_url = f"https://twitter.com/{username}/status/{tweet.id}"
            results.append({
                "username": username,
                "text": tweet.rawContent,
                "url": tweet_url,
                "media": media_urls
            })
            logger.debug(f"Extracted {len(media_urls)} media URLs from @{username}'s tweet")
        except Exception as e:
            logger.warning(f"Error extracting media from tweet: {e}")
            continue
    return results


async def extract_media_urls(page, tweet_url):
    """Scrape pbs.twimg.com media image URLs from a tweet page.

    Args:
        page: an already-open Playwright async page.
        tweet_url: full URL of the tweet to visit.

    Returns:
        De-duplicated list of image URLs rewritten to ``name=large``;
        empty list on any Playwright error (best-effort by design).
    """
    logger.debug(f"Visiting tweet page: {tweet_url}")
    try:
        # Reduced timeout to 10s to prevent hanging
        await page.goto(tweet_url, timeout=10000)
        await page.wait_for_timeout(1000)
        media_elements = await page.query_selector_all("img[src*='pbs.twimg.com/media']")
        urls = set()
        for element in media_elements:
            src = await element.get_attribute("src")
            if src:
                # NOTE(review): if "&name=" is absent this still appends
                # "&name=large" to the full src — confirm that is intended.
                cleaned = src.split("&name=")[0] + "&name=large"
                urls.add(cleaned)
        logger.debug(f"Found {len(urls)} media URLs on tweet: {tweet_url}")
        return list(urls)
    except Exception as e:
        logger.warning(f"Playwright error on {tweet_url}: {e}")
        return []


async def fetch_miku_tweets(limit=5):
    """Search for Miku tweets with images.

    OPTIMIZED: Uses twscrape's built-in media info instead of Playwright.

    Args:
        limit: maximum number of tweets to pull from the search.

    Returns:
        List of dicts with keys username, text, url, media.
    """
    api = await _make_api()

    logger.info(f"Searching for Miku tweets (limit={limit})...")
    # NOTE(review): 'has:images' / 'after:' are API-v2-style operators; the
    # web search twscrape drives normally uses 'filter:images' / 'since:' —
    # confirm this query actually matches tweets.
    query = 'Hatsune Miku OR 初音ミク has:images after:2025'
    tweets = await gather(api.search(query, limit=limit, kv={"product": "Top"}))
    logger.info(f"Found {len(tweets)} tweets from API, extracting media...")

    # Extract media directly from tweet objects (no browser needed!)
    results = _tweets_to_results(tweets)
    logger.info(f"Finished! Returning {len(results)} tweet(s) with media (no browser needed!)")
    return results


async def _search_latest(api: API, query: str, limit: int) -> list:
    """Run one search with product=Latest; return [] on failure (best-effort)."""
    # kv product "Latest" to search by latest
    try:
        return await gather(api.search(query, limit=limit, kv={"product": "Latest"}))
    except Exception as e:
        logger.error(f"Latest search failed for '{query}': {e}")
        return []


async def fetch_figurine_tweets_latest(limit_per_source: int = 10) -> list:
    """Search three sources by Latest, collect tweets with images, and return
    unified list of dicts.

    Sources:
    - "miku figure from:mecchaJP"
    - "miku from:GoodSmile_US"
    - "miku from:OtakuOwletMerch"

    OPTIMIZED: Uses twscrape's built-in media info instead of Playwright
    browser scraping. This is much faster and doesn't risk hanging.
    """
    api = await _make_api()

    queries = [
        "miku figure from:mecchaJP",
        "miku from:GoodSmile_US",
        "miku from:OtakuOwletMerch",
    ]

    logger.info("Searching figurine tweets by Latest across sources...")
    all_tweets = []
    for q in queries:
        tweets = await _search_latest(api, q, limit_per_source)
        all_tweets.extend(tweets)

    logger.info(f"Found {len(all_tweets)} candidate tweets from API")

    # Extract media directly from tweet objects (much faster!)
    results = _tweets_to_results(all_tweets)
    logger.info(f"Figurine fetch finished. Returning {len(results)} tweet(s) with media (no browser needed!)")
    return results


# Note: fetch_tweet_by_url was removed - now using twscrape-based approach in figurine_notifier.py
# This avoids Playwright browser dependencies while maintaining functionality