- Replaced Playwright browser scraping with direct API media extraction - Both fetch_miku_tweets() and fetch_figurine_tweets_latest() now use twscrape's built-in media info - Reduced tweet fetching from 10-15 minutes to ~5 seconds - Eliminated browser timeout/hanging issues - Relaxed autonomous tweet sharing conditions: * Increased message threshold from 10 to 20 per hour * Reduced cooldown from 3600s to 2400s (40 minutes) * Increased energy threshold from 50% to 70% * Added 'silly' and 'flirty' moods to allowed sharing moods This makes both figurine notifications and tweet sharing much more reliable and responsive.
196 lines
6.9 KiB
Python
196 lines
6.9 KiB
Python
# utils/twitter_fetcher.py
|
|
|
|
import asyncio
|
|
import json
|
|
from typing import Dict, Any
|
|
|
|
# Apply twscrape fix BEFORE importing twscrape
|
|
from utils.twscrape_fix import apply_twscrape_fix
|
|
apply_twscrape_fix()
|
|
|
|
from twscrape import API, gather, Account
|
|
from playwright.async_api import async_playwright
|
|
from pathlib import Path
|
|
from utils.logger import get_logger
|
|
|
|
logger = get_logger('media')
|
|
|
|
COOKIE_PATH = Path(__file__).parent / "x.com.cookies.json"
|
|
|
|
async def extract_media_urls(page, tweet_url):
|
|
logger.debug(f"Visiting tweet page: {tweet_url}")
|
|
try:
|
|
# Reduced timeout to 10s to prevent hanging
|
|
await page.goto(tweet_url, timeout=10000)
|
|
await page.wait_for_timeout(1000)
|
|
|
|
media_elements = await page.query_selector_all("img[src*='pbs.twimg.com/media']")
|
|
urls = set()
|
|
|
|
for element in media_elements:
|
|
src = await element.get_attribute("src")
|
|
if src:
|
|
cleaned = src.split("&name=")[0] + "&name=large"
|
|
urls.add(cleaned)
|
|
|
|
logger.debug(f"Found {len(urls)} media URLs on tweet: {tweet_url}")
|
|
return list(urls)
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Playwright error on {tweet_url}: {e}")
|
|
return []
|
|
|
|
async def fetch_miku_tweets(limit=5):
|
|
"""
|
|
Search for Miku tweets with images.
|
|
OPTIMIZED: Uses twscrape's built-in media info instead of Playwright.
|
|
"""
|
|
# Load cookies from JSON file
|
|
with open(COOKIE_PATH, "r", encoding="utf-8") as f:
|
|
cookie_list = json.load(f)
|
|
cookie_header = "; ".join(f"{c['name']}={c['value']}" for c in cookie_list)
|
|
|
|
# Add the account to twscrape
|
|
api = API()
|
|
await api.pool.add_account(
|
|
username="HSankyuu39",
|
|
password="x", # placeholder (won't be used)
|
|
email="x", # optional
|
|
email_password="x", # optional
|
|
cookies=cookie_header
|
|
)
|
|
await api.pool.login_all()
|
|
|
|
logger.info(f"Searching for Miku tweets (limit={limit})...")
|
|
query = 'Hatsune Miku OR 初音ミク has:images after:2025'
|
|
tweets = await gather(api.search(query, limit=limit, kv={"product": "Top"}))
|
|
|
|
logger.info(f"Found {len(tweets)} tweets from API, extracting media...")
|
|
|
|
# Extract media directly from tweet objects (no browser needed!)
|
|
results = []
|
|
for tweet in tweets:
|
|
try:
|
|
# Check if tweet has media
|
|
if not hasattr(tweet, 'media') or not tweet.media:
|
|
continue
|
|
|
|
# Extract media URLs from tweet object
|
|
media_urls = []
|
|
if hasattr(tweet.media, 'photos'):
|
|
for photo in tweet.media.photos:
|
|
if hasattr(photo, 'url'):
|
|
# Get the highest quality version
|
|
media_url = photo.url
|
|
if '?' in media_url:
|
|
media_url = media_url.split('?')[0]
|
|
media_url += '?name=large'
|
|
media_urls.append(media_url)
|
|
|
|
if media_urls:
|
|
username = tweet.user.username
|
|
tweet_url = f"https://twitter.com/{username}/status/{tweet.id}"
|
|
results.append({
|
|
"username": username,
|
|
"text": tweet.rawContent,
|
|
"url": tweet_url,
|
|
"media": media_urls
|
|
})
|
|
logger.debug(f"Extracted {len(media_urls)} media URLs from @{username}'s tweet")
|
|
except Exception as e:
|
|
logger.warning(f"Error extracting media from tweet: {e}")
|
|
continue
|
|
|
|
logger.info(f"Finished! Returning {len(results)} tweet(s) with media (no browser needed!)")
|
|
return results
|
|
|
|
|
|
async def _search_latest(api: API, query: str, limit: int) -> list:
|
|
# kv product "Latest" to search by latest
|
|
try:
|
|
return await gather(api.search(query, limit=limit, kv={"product": "Latest"}))
|
|
except Exception as e:
|
|
logger.error(f"Latest search failed for '{query}': {e}")
|
|
return []
|
|
|
|
|
|
async def fetch_figurine_tweets_latest(limit_per_source: int = 10) -> list:
|
|
"""Search three sources by Latest, collect tweets with images, and return unified list of dicts.
|
|
Sources:
|
|
- "miku figure from:mecchaJP"
|
|
- "miku from:GoodSmile_US"
|
|
- "miku from:OtakuOwletMerch"
|
|
|
|
OPTIMIZED: Uses twscrape's built-in media info instead of Playwright browser scraping.
|
|
This is much faster and doesn't risk hanging.
|
|
"""
|
|
# Load cookies
|
|
with open(COOKIE_PATH, "r", encoding="utf-8") as f:
|
|
cookie_list = json.load(f)
|
|
cookie_header = "; ".join(f"{c['name']}={c['value']}" for c in cookie_list)
|
|
|
|
api = API()
|
|
await api.pool.add_account(
|
|
username="HSankyuu39",
|
|
password="x",
|
|
email="x",
|
|
email_password="x",
|
|
cookies=cookie_header
|
|
)
|
|
await api.pool.login_all()
|
|
|
|
queries = [
|
|
"miku figure from:mecchaJP",
|
|
"miku from:GoodSmile_US",
|
|
"miku from:OtakuOwletMerch",
|
|
]
|
|
|
|
logger.info("Searching figurine tweets by Latest across sources...")
|
|
all_tweets = []
|
|
for q in queries:
|
|
tweets = await _search_latest(api, q, limit_per_source)
|
|
all_tweets.extend(tweets)
|
|
|
|
logger.info(f"Found {len(all_tweets)} candidate tweets from API")
|
|
|
|
# Extract media directly from tweet objects (much faster!)
|
|
results = []
|
|
for tweet in all_tweets:
|
|
try:
|
|
# Check if tweet has media
|
|
if not hasattr(tweet, 'media') or not tweet.media:
|
|
continue
|
|
|
|
# Extract media URLs from tweet object
|
|
media_urls = []
|
|
if hasattr(tweet.media, 'photos'):
|
|
for photo in tweet.media.photos:
|
|
if hasattr(photo, 'url'):
|
|
# Get the highest quality version
|
|
media_url = photo.url
|
|
if '?' in media_url:
|
|
media_url = media_url.split('?')[0]
|
|
media_url += '?name=large'
|
|
media_urls.append(media_url)
|
|
|
|
if media_urls:
|
|
username = tweet.user.username
|
|
tweet_url = f"https://twitter.com/{username}/status/{tweet.id}"
|
|
results.append({
|
|
"username": username,
|
|
"text": tweet.rawContent,
|
|
"url": tweet_url,
|
|
"media": media_urls
|
|
})
|
|
logger.debug(f"Extracted {len(media_urls)} media URLs from @{username}'s tweet")
|
|
except Exception as e:
|
|
logger.warning(f"Error extracting media from tweet: {e}")
|
|
continue
|
|
|
|
logger.info(f"Figurine fetch finished. Returning {len(results)} tweet(s) with media (no browser needed!)")
|
|
return results
|
|
|
|
|
|
# Note: fetch_tweet_by_url was removed - now using twscrape-based approach in figurine_notifier.py
|
|
# This avoids Playwright browser dependencies while maintaining functionality
|