[BUG] Google News URL decode not working anymore
When decoding Google News URLs into their real article URLs, I am getting this error:
import base64
import re
from typing import Optional

# Ref: https://github.com/Casualtek/Cyberwatch/blob/8648e9ad646e708dd1d801d6e2ebb3c40539ffde/rss.py#L111
_ENCODED_URL_PREFIX = "https://news.google.com/rss/articles/"
_ENCODED_URL_RE = re.compile(
    rf"^{re.escape(_ENCODED_URL_PREFIX)}(?P<encoded_url>[^?]+)"
)
_DECODED_URL_RE = re.compile(rb'^\x08\x13".+?(?P<primary_url>http[^\xd2]+)\xd2\x01')


def decode_google_news_url(url: str) -> Optional[str]:
    match = _ENCODED_URL_RE.match(url)
    encoded_text = match.groupdict()["encoded_url"]  # type: ignore
    # Fix incorrect padding. Ref: https://stackoverflow.com/a/49459036/
    encoded_text += "==="
    decoded_text = base64.urlsafe_b64decode(encoded_text)
    match = _DECODED_URL_RE.match(decoded_text)
    primary_url = match.groupdict()["primary_url"]  # type: ignore
    primary_url = primary_url.decode()
    return primary_url
# Test the function
url = "https://news.google.com/rss/articles/CBMi2AFBVV95cUxQOHZlbFBOSXZDQTVDNWhibW9nMlUzaWpfbVRZaTNKMXd4VFNtQ2YxQWt2UmtDbHdia2xvbHZDMU03eXVabzFscDdMcHV4aGFnNW1zdU9zakVyaEFmMm1FVDVBRVotdktTbkJBOUFrT3dwNTY5bVNzZWRJQk1RT3l5SnBBeWdXS1laeVpwejQzN3luZjgwVjN0bFB5NkZSM2oxRXJ6Q0ItbDNMUDZJRTdEZXhjbUV1Z3NYMHdXV1hKV3N3YndWOVZjVE9uZlBGNkk0SS1mbTZ3b0Q?oc=5"
result = decode_google_news_url(url)
print("Result:", result)
    primary_url = match.groupdict()["primary_url"]  # type: ignore
AttributeError: 'NoneType' object has no attribute 'groupdict'
I think they changed something recently, because it was working just yesterday.
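A quick way to see why the old approach fails, as a sketch using the sample URL above: the base64 payload of new-style article IDs no longer contains a plain http URL at all, so _DECODED_URL_RE has nothing to match.

# Minimal check (an editorial sketch, not part of the original report): decode the
# sample article ID from the test above and look for a plain URL in the payload.
import base64

encoded = "CBMi2AFBVV95cUxQOHZlbFBOSXZDQTVDNWhibW9nMlUzaWpfbVRZaTNKMXd4VFNtQ2YxQWt2UmtDbHdia2xvbHZDMU03eXVabzFscDdMcHV4aGFnNW1zdU9zakVyaEFmMm1FVDVBRVotdktTbkJBOUFrT3dwNTY5bVNzZWRJQk1RT3l5SnBBeWdXS1laeVpwejQzN3luZjgwVjN0bFB5NkZSM2oxRXJ6Q0ItbDNMUDZJRTdEZXhjbUV1Z3NYMHdXV1hKV3N3YndWOVZjVE9uZlBGNkk0SS1mbTZ3b0Q"
decoded = base64.urlsafe_b64decode(encoded + "===")
print(b"http" in decoded)  # False for new-style IDs, so _DECODED_URL_RE matches nothing
print(decoded[:24])        # protobuf framing followed by an opaque "AU_yqL..." token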
Thanks. I noticed it as well. I'm working on a fix. Any suggestions will be welcome though ;)
import json
from urllib.parse import quote, urlparse

import requests
from bs4 import BeautifulSoup
from flask import Flask, jsonify, request  # this import was missing in the original snippet

app = Flask(__name__)


# Helper functions
def get_decoding_params(gn_art_id):
    """Fetch the signature and timestamp that Google now requires to decode an article ID."""
    try:
        response = requests.get(f"https://news.google.com/articles/{gn_art_id}")
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        div = soup.select_one("c-wiz > div")
        return {
            "signature": div.get("data-n-a-sg"),
            "timestamp": div.get("data-n-a-ts"),
            "gn_art_id": gn_art_id,
        }
    except Exception as e:
        raise Exception(f"Error fetching decoding parameters: {str(e)}")
def decode_urls(articles):
    """Resolve a batch of article IDs via the batchexecute RPC endpoint."""
    try:
        articles_reqs = [
            [
                "Fbv4je",
                f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{art["gn_art_id"]}",{art["timestamp"]},"{art["signature"]}"]',
            ]
            for art in articles
        ]
        payload = f"f.req={quote(json.dumps([articles_reqs]))}"
        headers = {"content-type": "application/x-www-form-urlencoded;charset=UTF-8"}
        response = requests.post(
            url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
            headers=headers,
            data=payload,
        )
        response.raise_for_status()
        # The JSON body sits after the first blank line of the response envelope;
        # each result row embeds a JSON string whose second element is the decoded URL,
        # and the trailing two rows are bookkeeping.
        return [
            json.loads(res[2])[1]
            for res in json.loads(response.text.split("\n\n")[1])[:-2]
        ]
    except Exception as e:
        raise Exception(f"Error decoding URLs: {str(e)}")
# Flask routes
@app.route("/decode", methods=["POST"])
def decode_api():
    try:
        # Get encoded URLs from the POST request
        data = request.json
        encoded_urls = data.get("encoded_urls")
        if not encoded_urls or not isinstance(encoded_urls, list):
            return jsonify({"error": "Invalid input. Provide a list of encoded URLs."}), 400

        # Process each URL to extract parameters
        articles_params = [
            {
                **url,  # Preserve the original object
                **get_decoding_params(urlparse(url["link"]).path.split("/")[-1]),  # Add decoding params
            }
            for url in encoded_urls
        ]

        # Decode the URLs
        decoded_urls = decode_urls(articles_params)

        # Append decoded URLs to the original objects
        for i, article in enumerate(articles_params):
            article["decoded_url"] = decoded_urls[i]

        # Return the enriched JSON
        return jsonify({"decoded_urls": articles_params})
    except Exception as e:
        return jsonify({"error": str(e)}), 500


# Run the Flask app
if __name__ == "__main__":
    app.run(debug=True)
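For reference, a sketch of how the endpoint can be called; the payload shape (a list of objects, each carrying at least a "link" key) is inferred from the handler above, and the truncated article ID is a placeholder.

import requests

# Hypothetical client call; substitute a real Google News RSS URL for the placeholder.
resp = requests.post(
    "http://127.0.0.1:5000/decode",
    json={"encoded_urls": [
        {"link": "https://news.google.com/rss/articles/CBMi2AFBVV95cUxQ..."},
    ]},
)
print(resp.json())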
I wrote this API, and it works perfectly except for not respecting or handling rate limits (there does seem to be a limit of 10 or 20 requests per minute). If you haven't figured out a solution for this issue yet, this might help.
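One possible mitigation, as a minimal sketch assuming the observed limit of roughly 10 requests per minute (the interval is a guess, not a documented value): space out the calls to news.google.com before they hit the limit.

import time


class Throttle:
    """Block so that successive calls are at least `interval` seconds apart."""

    def __init__(self, interval: float = 6.0):  # ~10 calls per minute
        self.interval = interval
        self._last = 0.0

    def wait(self) -> None:
        elapsed = time.monotonic() - self._last
        if elapsed < self.interval:
            time.sleep(self.interval - elapsed)
        self._last = time.monotonic()


throttle = Throttle()


def get_decoding_params_throttled(gn_art_id):
    # Drop-in wrapper around get_decoding_params from the API above.
    throttle.wait()
    return get_decoding_params(gn_art_id)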