fix(twitter): update twscrape monkey patch for JS bundle format change
Twitter changed the JS bundle structure from the old single-map format
(e=>e+"."+{...}[e]+"a.js") to a new two-map format
(u.u=e=>""+(({name})[e]||e)+"."+({hash})[e]+"a.js"), breaking
x-client-transaction-id generation.
This caused IndexError: list index out of range, which twscrape
interpreted as an account timeout (15-min lockout), preventing Miku
from fetching/sharing tweets.
The fix adds:
- A robust multi-pattern parser that tries known formats in order
- The _js_obj_to_dict helper from PR #303 for handling unquoted numeric
keys and scientific notation in JS object literals
- Debug logging to capture the JS snippet when ALL patterns fail,
making future breakage easier to diagnose
References:
- https://github.com/vladkens/twscrape/issues/302
- https://github.com/vladkens/twscrape/pull/303
This commit is contained in:
@@ -1,12 +1,21 @@
|
||||
# utils/twscrape_fix.py
|
||||
"""
|
||||
Monkey patch for twscrape to fix "Failed to parse scripts" error.
|
||||
Twitter started returning malformed JSON with unquoted keys.
|
||||
See: https://github.com/vladkens/twscrape/issues/284
|
||||
Monkey patch for twscrape to fix parsing of Twitter's JS bundle.
|
||||
|
||||
Fixes two known issues:
|
||||
1. Issue #284: Malformed JSON with unquoted keys
|
||||
(old fix, kept for backward compatibility)
|
||||
2. Issue #302: Twitter changed JS bundle format, breaking x-client-transaction-id
|
||||
generation. The old format 'e=>e+"."+{...}[e]+"a.js"' changed to
|
||||
'u.u=e=>""+(({...})[e]||e)+"."+({...})[e]+"a.js"'
|
||||
Fix from: https://github.com/vladkens/twscrape/pull/303
|
||||
|
||||
Without this patch, twscrape raises IndexError and locks accounts for 15 minutes.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Iterator
|
||||
from utils.logger import get_logger
|
||||
|
||||
logger = get_logger('core')
|
||||
@@ -16,22 +25,109 @@ def script_url(k: str, v: str):
|
||||
return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"
|
||||
|
||||
|
||||
def patched_get_scripts_list(text: str):
|
||||
"""Fixed version that handles unquoted keys in Twitter's JSON response"""
|
||||
scripts = text.split('e=>e+"."+')[1].split('[e]+"a.js"')[0]
|
||||
def _js_obj_to_dict(s: str) -> dict:
|
||||
"""
|
||||
Parse a JavaScript object literal with unquoted numeric keys into a Python dict.
|
||||
Handles both plain integers (20113) and scientific notation (88e3 → 88000).
|
||||
|
||||
try:
|
||||
for k, v in json.loads(scripts).items():
|
||||
yield script_url(k, f"{v}a")
|
||||
except json.decoder.JSONDecodeError:
|
||||
# Fix unquoted keys like: node_modules_pnpm_ws_8_18_0_node_modules_ws_browser_js
|
||||
fixed_scripts = re.sub(
|
||||
r'([,\{])(\s*)([\w]+_[\w_]+)(\s*):',
|
||||
r'\1\2"\3"\4:',
|
||||
scripts
|
||||
)
|
||||
for k, v in json.loads(fixed_scripts).items():
|
||||
yield script_url(k, f"{v}a")
|
||||
From: https://github.com/vladkens/twscrape/pull/303
|
||||
"""
|
||||
# Scientific notation first so the plain-int pass does not consume only the mantissa
|
||||
s = re.sub(r'\b(\d+e\d+)(?=\s*:)', lambda m: '"' + str(int(float(m.group(1)))) + '"', s)
|
||||
# Plain integer keys
|
||||
s = re.sub(r'\b(\d+)(?=\s*:)', r'"\1"', s)
|
||||
return json.loads('{' + s + '}')
|
||||
|
||||
|
||||
def patched_get_scripts_list(text: str) -> Iterator[str]:
    """
    Fixed version that handles Twitter's changing JS bundle format.

    Tries each known bundle layout in order (newest first). The key invariant
    is that the bundle always contains a JS object literal mapping chunk IDs
    to hashes, inside a function that builds script URLs ending in "a.js".

    Yields:
        Fully-qualified script URLs (via ``script_url``).

    Raises:
        Exception: when no known format matches; a snippet of the bundle near
        "a.js" is logged first to make future breakage easier to diagnose.
    """
    # Each entry: (name_start, name_end, hash_start, hash_end).
    # name markers are None for the old single-map format.
    known_formats = (
        # Pattern from PR #303 (April 2026):
        # u.u=e=>""+(({name_map})[e]||e)+"."+({hash_map})[e]+"a.js"
        ('(({', '})[e]||e)', '|e)+"."+({', '})[e]+"a.js"'),
        # Alternative: same but without the ||e fallback
        ('""+(({', '})[e]', ')+"."+({', '})[e]+"a.js"'),
        # Old format (pre-April 2026): e=>e+"."+{...}[e]+"a.js"
        (None, None, 'e=>e+"."+', '[e]+"a.js"'),
    )

    for name_start, name_end, hash_start, hash_end in known_formats:
        try:
            hash_raw = text.split(hash_start)[1].split(hash_end)[0]
            hashes = _js_obj_to_dict(hash_raw)
            if name_start is None:
                # Single-map old format: chunk ID doubles as the name.
                names = None
            else:
                # Two-map new format: a separate ID -> name map.
                name_raw = text.split(name_start)[1].split(name_end)[0]
                names = _js_obj_to_dict(name_raw)
        except (IndexError, KeyError, json.JSONDecodeError):
            continue  # this layout did not match; try the next one

        for chunk_id, chunk_hash in hashes.items():
            chunk_name = names.get(chunk_id, chunk_id) if names else chunk_id
            yield script_url(chunk_name, f"{chunk_hash}a")
        logger.info(f"Successfully parsed scripts using pattern: {hash_start[:40]}...")
        return

    # If ALL patterns failed, log a snippet of the text for debugging.
    # First choice: a source line near "a.js" that looks like code.
    snippet = next(
        (
            line.strip()[:300]
            for line in text.split('\n')
            if 'a.js' in line and ('{' in line or '=>' in line)
        ),
        "",
    )
    if not snippet:
        # Fall back to raw context around the first "a.js" occurrence.
        found = re.search(r'.{0,200}a\.js.{0,200}', text, re.DOTALL)
        if found:
            snippet = found.group(0)[:400]

    logger.error(f"Failed to parse scripts. Text snippet near 'a.js': {snippet}")
    raise Exception(
        "Failed to parse scripts: unknown JS bundle format. "
        "Twitter may have changed their JS structure again. "
        "See: https://github.com/vladkens/twscrape/issues"
    )
|
||||
|
||||
|
||||
def apply_twscrape_fix():
    """Monkey-patch twscrape's xclid module with the robust script parser.

    Best-effort by design: any failure is logged instead of raised, so the
    application can still start even if twscrape's internals change again.
    """
    try:
        from twscrape import xclid

        xclid.get_scripts_list = patched_get_scripts_list
        logger.info("Applied twscrape monkey patch (JS bundle parsing fix for issues #284 + #302)")
    except Exception as exc:
        logger.error(f"Failed to apply twscrape monkey patch: {exc}")
|
||||
|
||||
Reference in New Issue
Block a user