Cyberwatch icon indicating copy to clipboard operation
Cyberwatch copied to clipboard

[BUG] Google News URL decode not working anymore

Open moehmeni opened this issue 1 year ago • 2 comments

For decoding Google News URLs into their real ones, I am getting error:

import base64
import re
from typing import Optional

# Ref: https://github.com/Casualtek/Cyberwatch/blob/8648e9ad646e708dd1d801d6e2ebb3c40539ffde/rss.py#L111
_ENCODED_URL_PREFIX = "https://news.google.com/rss/articles/"
_ENCODED_URL_RE = re.compile(
    rf"^{re.escape(_ENCODED_URL_PREFIX)}(?P<encoded_url>[^?]+)"
)
_DECODED_URL_RE = re.compile(rb'^\x08\x13".+?(?P<primary_url>http[^\xd2]+)\xd2\x01')


def decode_google_news_url(url: str) -> Optional[str]:
    match = _ENCODED_URL_RE.match(url)
    encoded_text = match.groupdict()["encoded_url"]  # type: ignore
    encoded_text += (
        "==="  # Fix incorrect padding. Ref: https://stackoverflow.com/a/49459036/
    )
    decoded_text = base64.urlsafe_b64decode(encoded_text)

    match = _DECODED_URL_RE.match(decoded_text)
    primary_url = match.groupdict()["primary_url"]  # type: ignore
    primary_url = primary_url.decode()
    return primary_url


# Test the function
url = "https://news.google.com/rss/articles/CBMi2AFBVV95cUxQOHZlbFBOSXZDQTVDNWhibW9nMlUzaWpfbVRZaTNKMXd4VFNtQ2YxQWt2UmtDbHdia2xvbHZDMU03eXVabzFscDdMcHV4aGFnNW1zdU9zakVyaEFmMm1FVDVBRVotdktTbkJBOUFrT3dwNTY5bVNzZWRJQk1RT3l5SnBBeWdXS1laeVpwejQzN3luZjgwVjN0bFB5NkZSM2oxRXJ6Q0ItbDNMUDZJRTdEZXhjbUV1Z3NYMHdXV1hKV3N3YndWOVZjVE9uZlBGNkk0SS1mbTZ3b0Q?oc=5"
result = decode_google_news_url(url)
print("Result:", result)
primary_url = match.groupdict()["primary_url"]  # type: ignore
AttributeError: 'NoneType' object has no attribute 'groupdict'

I think they changed recently because it was working just yesterday.

moehmeni avatar Jul 24 '24 16:07 moehmeni

Thanks. I noticed it as well. I'm working on a fix. Any suggestions will be welcome though ;)

Casualtek avatar Jul 24 '24 16:07 Casualtek

import json
from urllib.parse import quote, urlparse
import requests
from bs4 import BeautifulSoup

app = Flask(__name__)

# Helper functions
def get_decoding_params(gn_art_id):
    try:
        response = requests.get(f"https://news.google.com/articles/{gn_art_id}")
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        div = soup.select_one("c-wiz > div")
        return {
            "signature": div.get("data-n-a-sg"),
            "timestamp": div.get("data-n-a-ts"),
            "gn_art_id": gn_art_id,
        }
    except Exception as e:
        raise Exception(f"Error fetching decoding parameters: {str(e)}")

def decode_urls(articles):
    try:
        articles_reqs = [
            [
                "Fbv4je",
                f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{art["gn_art_id"]}",{art["timestamp"]},"{art["signature"]}"]',
            ]
            for art in articles
        ]
        payload = f"f.req={quote(json.dumps([articles_reqs]))}"
        headers = {"content-type": "application/x-www-form-urlencoded;charset=UTF-8"}
        response = requests.post(
            url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
            headers=headers,
            data=payload,
        )
        response.raise_for_status()
        return [json.loads(res[2])[1] for res in json.loads(response.text.split("\n\n")[1])[:-2]]
    except Exception as e:
        raise Exception(f"Error decoding URLs: {str(e)}")

# Flask routes
@app.route('/decode', methods=['POST'])
def decode_api():
    try:
        # Get encoded URLs from the POST request
        data = request.json
        encoded_urls = data.get("encoded_urls")
        if not encoded_urls or not isinstance(encoded_urls, list):
            print(type(encoded_urls))
            return jsonify({"error": "Invalid input. Provide a list of encoded URLs."}), 400
            

        # Process each URL to extract parameters
        articles_params = [
            {
                **url,  # Preserve the original object
                **get_decoding_params(urlparse(url["link"]).path.split("/")[-1])  # Add decoding params
            }
            for url in encoded_urls
        ]

        # Decode the URLs
        decoded_urls = decode_urls(articles_params)

        # Append decoded URLs to the original objects
        for i, article in enumerate(articles_params):
            article["decoded_url"] = decoded_urls[i]

        # Return the enriched JSON
        return jsonify({"decoded_urls": articles_params})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# Run the Flask app
if __name__ == '__main__':
    app.run(debug=True)

I wrote this API which works perfectly, except for not respecting or handling rate limits (yes there seems to be 10 or 20 per minute). If you haven't figured out a solution for this issue this might help

CS-PK avatar Nov 27 '24 12:11 CS-PK