- Created new logging infrastructure with per-component filtering - Added 6 log levels: DEBUG, INFO, API, WARNING, ERROR, CRITICAL - Implemented non-hierarchical level control (any combination can be enabled) - Migrated 917 print() statements across 31 files to structured logging - Created web UI (system.html) for runtime configuration with dark theme - Added global level controls to enable/disable levels across all components - Added timestamp format control (off/time/date/datetime options) - Implemented log rotation (10MB per file, 5 backups) - Added API endpoints for dynamic log configuration - Configured HTTP request logging with filtering via api.requests component - Intercepted APScheduler logs with proper formatting - Fixed persistence paths to use /app/memory for Docker volume compatibility - Fixed checkbox display bug in web UI (enabled_levels now properly shown) - Changed System Settings button to open in same tab instead of new window Components: bot, api, api.requests, autonomous, persona, vision, llm, conversation, mood, dm, scheduled, gpu, media, server, commands, sentiment, core, apscheduler All settings persist across container restarts via JSON config.
182 lines
6.2 KiB
Python
182 lines
6.2 KiB
Python
# utils/twitter_fetcher.py
|
|
|
|
import asyncio
|
|
import json
|
|
from typing import Dict, Any
|
|
|
|
# Apply twscrape fix BEFORE importing twscrape
|
|
from utils.twscrape_fix import apply_twscrape_fix
|
|
apply_twscrape_fix()
|
|
|
|
from twscrape import API, gather, Account
|
|
from playwright.async_api import async_playwright
|
|
from pathlib import Path
|
|
from utils.logger import get_logger
|
|
|
|
# Module-wide logger, obtained from the project's logging helper under the
# 'media' name; every function in this file logs through it.
logger = get_logger('media')
|
|
|
|
# Location of the cookie export the fetchers read before talking to twscrape:
# a JSON file named "x.com.cookies.json" sitting in the same directory as this module.
COOKIE_PATH = Path(__file__).parent / "x.com.cookies.json"
|
|
|
|
def _to_large_variant(src):
    """Return *src* with its ``name`` query parameter forced to ``large``.

    Any existing ``name`` parameter is dropped and ``name=large`` is appended,
    so the result is a well-formed URL whether or not the original carried a
    query string (a plain ``.../X.jpg`` becomes ``.../X.jpg?name=large``).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(src)
    params = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != "name"
    ]
    params.append(("name", "large"))
    return urlunsplit(parts._replace(query=urlencode(params)))


async def extract_media_urls(page, tweet_url):
    """Open a tweet page and collect the image URLs found on it.

    Args:
        page: A Playwright-style async page object; only ``goto``,
            ``wait_for_timeout`` and ``query_selector_all`` are used.
        tweet_url: Address of the tweet to visit.

    Returns:
        A list of unique image URLs, each rewritten to request the ``large``
        variant, in the order the images appear on the page. An empty list is
        returned when anything goes wrong (the error is logged, not raised).
    """
    logger.debug(f"Visiting tweet page: {tweet_url}")
    try:
        await page.goto(tweet_url, timeout=15000)
        # Fixed pause after navigation before querying the DOM for images.
        await page.wait_for_timeout(1000)

        # Substring match on src: only <img> tags served from the media path.
        media_elements = await page.query_selector_all("img[src*='pbs.twimg.com/media']")

        # Dict keys de-duplicate while keeping first-seen (page) order, so the
        # result is deterministic from run to run.
        urls = {}
        for element in media_elements:
            src = await element.get_attribute("src")
            if src:
                urls[_to_large_variant(src)] = None

        logger.debug(f"Found {len(urls)} media URLs on tweet: {tweet_url}")
        return list(urls)

    except Exception as e:
        # Best-effort scraper: a failing page must not abort the caller's batch.
        logger.error(f"Playwright error on {tweet_url}: {e}")
        return []
|
|
|
|
async def fetch_miku_tweets(limit=5):
    """Search for Hatsune Miku image tweets and scrape their media URLs.

    The cookie file at ``COOKIE_PATH`` (a JSON list of objects with ``name``
    and ``value`` keys) is turned into a cookie header and registered with a
    twscrape account pool. The search results are then visited one by one in
    a headless Firefox to pull image URLs off each tweet page.

    Args:
        limit: Maximum number of tweets requested from the search.

    Returns:
        A list of dicts with keys ``username``, ``text``, ``url`` and
        ``media``; tweets for which no media URL was found are omitted.

    Raises:
        Errors from reading/parsing the cookie file, from the twscrape
        login/search, or from launching the browser propagate to the caller.
        Failures while handling an individual tweet are logged and skipped.
    """
    # Load cookies from JSON file
    with open(COOKIE_PATH, "r", encoding="utf-8") as f:
        cookie_list = json.load(f)
    cookie_header = "; ".join(f"{c['name']}={c['value']}" for c in cookie_list)

    # Add the account to twscrape
    api = API()
    await api.pool.add_account(
        username="HSankyuu39",
        password="x",  # placeholder (won't be used)
        email="x",  # optional
        email_password="x",  # optional
        cookies=cookie_header,
    )
    await api.pool.login_all()

    logger.info(f"Searching for Miku tweets (limit={limit})...")
    query = 'Hatsune Miku OR 初音ミク has:images after:2025'
    tweets = await gather(api.search(query, limit=limit, kv={"product": "Top"}))

    logger.info(f"Found {len(tweets)} tweets, launching browser...")

    def _filter_request(route, request):
        # Skip fonts/stylesheets and tracking/ads requests; only the images
        # matter here. The awaitable from abort()/continue_() is returned for
        # Playwright to await, exactly as a lambda handler would.
        unwanted = (
            request.resource_type in ["font", "stylesheet"]
            or "analytics" in request.url
            or "googletagmanager" in request.url
            or "ads-twitter" in request.url
        )
        return route.abort() if unwanted else route.continue_()

    results = []
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        try:
            context = await browser.new_context()
            await context.route("**/*", _filter_request)
            page = await context.new_page()

            for i, tweet in enumerate(tweets, 1):
                # One malformed tweet must not discard the rest of the batch.
                try:
                    username = tweet.user.username
                    tweet_url = f"https://twitter.com/{username}/status/{tweet.id}"
                    logger.debug(f"Processing tweet {i}/{len(tweets)} from @{username}")
                    media_urls = await extract_media_urls(page, tweet_url)

                    if media_urls:
                        results.append({
                            "username": username,
                            "text": tweet.rawContent,
                            "url": tweet_url,
                            "media": media_urls,
                        })
                except Exception as e:
                    logger.error(f"Error processing tweet: {e}")
        finally:
            # Always release the browser, even if setup or scraping raised.
            await browser.close()

    logger.info(f"Finished! Returning {len(results)} tweet(s) with media.")
    return results
|
|
|
|
|
|
async def _search_latest(api: API, query: str, limit: int) -> list:
    """Run *query* against the "Latest" search product and return the hits.

    Best-effort helper: any exception raised by the search is logged and an
    empty list is returned instead, so one failing source does not stop the
    caller from querying the others.
    """
    # kv product "Latest" to search by latest
    search_options = {"product": "Latest"}
    try:
        found = await gather(api.search(query, limit=limit, kv=search_options))
    except Exception as e:
        logger.error(f"Latest search failed for '{query}': {e}")
        return []
    return found
|
|
|
|
|
|
async def fetch_figurine_tweets_latest(limit_per_source: int = 10) -> list:
    """Search three sources by Latest, collect tweets with images, and return unified list of dicts.

    Sources:
    - "miku figure from:mecchaJP"
    - "miku from:GoodSmile_US"
    - "miku from:OtakuOwletMerch"

    Args:
        limit_per_source: Maximum number of tweets requested for each query.

    Returns:
        A list of dicts with keys ``username``, ``text``, ``url`` and
        ``media``, in query order; tweets without any extracted media URL are
        left out.

    Raises:
        Errors from reading/parsing the cookie file at ``COOKIE_PATH``, from
        the twscrape login, or from launching the browser propagate to the
        caller. Search failures and per-tweet failures are logged and skipped.
    """
    # Load cookies
    with open(COOKIE_PATH, "r", encoding="utf-8") as f:
        cookie_list = json.load(f)
    cookie_header = "; ".join(f"{c['name']}={c['value']}" for c in cookie_list)

    api = API()
    await api.pool.add_account(
        username="HSankyuu39",
        password="x",
        email="x",
        email_password="x",
        cookies=cookie_header,
    )
    await api.pool.login_all()

    queries = [
        "miku figure from:mecchaJP",
        "miku from:GoodSmile_US",
        "miku from:OtakuOwletMerch",
    ]

    logger.info("Searching figurine tweets by Latest across sources...")
    all_tweets = []
    for q in queries:
        # _search_latest returns [] on failure, so a dead source is skipped.
        tweets = await _search_latest(api, q, limit_per_source)
        all_tweets.extend(tweets)

    logger.info(f"Found {len(all_tweets)} candidate tweets, launching browser to extract media...")

    def _filter_request(route, request):
        # Skip fonts/stylesheets and tracking/ads requests; only the images
        # matter here. The awaitable from abort()/continue_() is returned for
        # Playwright to await, exactly as a lambda handler would.
        unwanted = (
            request.resource_type in ["font", "stylesheet"]
            or "analytics" in request.url
            or "googletagmanager" in request.url
            or "ads-twitter" in request.url
        )
        return route.abort() if unwanted else route.continue_()

    results = []
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        try:
            context = await browser.new_context()
            await context.route("**/*", _filter_request)
            page = await context.new_page()

            for i, tweet in enumerate(all_tweets, 1):
                # One malformed tweet must not discard the rest of the batch.
                try:
                    username = tweet.user.username
                    tweet_url = f"https://twitter.com/{username}/status/{tweet.id}"
                    logger.debug(f"Processing tweet {i}/{len(all_tweets)} from @{username}")
                    media_urls = await extract_media_urls(page, tweet_url)
                    if media_urls:
                        results.append({
                            "username": username,
                            "text": tweet.rawContent,
                            "url": tweet_url,
                            "media": media_urls,
                        })
                except Exception as e:
                    logger.error(f"Error processing tweet: {e}")
        finally:
            # Always release the browser, even if context/page setup raised.
            await browser.close()

    logger.info(f"Figurine fetch finished. Returning {len(results)} tweet(s) with media.")
    return results
|
|
|
|
|
|
# Note: fetch_tweet_by_url was removed - now using twscrape-based approach in figurine_notifier.py
|
|
# This avoids Playwright browser dependencies while maintaining functionality
|