newspaper4k
newspaper4k copied to clipboard
[BUG] Google News link schema changed?
For decoding Google News URLs into their real ones, I am getting error
import base64
import re
# Some url encoding related constants
_ENCODED_URL_PREFIX = "https://news.google.com/rss/articles/"
_ENCODED_URL_PREFIX_WITH_CONSENT = (
"https://consent.google.com/m?continue=https://news.google.com/rss/articles/"
)
_ENCODED_URL_RE = re.compile(
rf"^{re.escape(_ENCODED_URL_PREFIX_WITH_CONSENT)}(?P<encoded_url>[^?]+)"
)
_ENCODED_URL_RE = re.compile(
rf"^{re.escape(_ENCODED_URL_PREFIX)}(?P<encoded_url>[^?]+)"
)
_DECODED_URL_RE = re.compile(rb'^\x08\x13".+?(?P<primary_url>http[^\xd2]+)\xd2\x01')
def prepare_gnews_url(url):
# There seems to be a case when we get a URL with consent.google.com
# see https://github.com/ranahaani/GNews/issues/62
# Also, the URL is directly decoded, no need to go through news.google.com
match = _ENCODED_URL_RE.match(url)
encoded_text = match.groupdict()["encoded_url"]
# Fix incorrect padding. Ref: https://stackoverflow.com/a/49459036/
encoded_text += "==="
decoded_text = base64.urlsafe_b64decode(encoded_text)
match = _DECODED_URL_RE.match(decoded_text)
primary_url = match.groupdict()["primary_url"]
primary_url = primary_url.decode()
return primary_url
# Test the function
url = "https://news.google.com/rss/articles/CBMi2AFBVV95cUxQOHZlbFBOSXZDQTVDNWhibW9nMlUzaWpfbVRZaTNKMXd4VFNtQ2YxQWt2UmtDbHdia2xvbHZDMU03eXVabzFscDdMcHV4aGFnNW1zdU9zakVyaEFmMm1FVDVBRVotdktTbkJBOUFrT3dwNTY5bVNzZWRJQk1RT3l5SnBBeWdXS1laeVpwejQzN3luZjgwVjN0bFB5NkZSM2oxRXJ6Q0ItbDNMUDZJRTdEZXhjbUV1Z3NYMHdXV1hKV3N3YndWOVZjVE9uZlBGNkk0SS1mbTZ3b0Q?oc=5"
result = prepare_gnews_url(url)
print("Result:", result)
AttributeError: 'NoneType' object has no attribute 'groupdict'
I think they changed recently while it was working before.