gocomics has changed again
Site has changed again, and now the XPath used to find the images isn't working. (I overrode the robots.txt check to keep this working after reporting #335)
Having the same issue. I was able to work around the robots.txt issue, but I'm not an expert Python programmer, so this one is beyond me. :-(
hmm.. still no update on this?
I'm not an expert on the plugin code. I tried to figure it out, but I'm still getting stuck.
Today's comic is not available at the date URL anymore, so we don't need latestSearch and don't need indirectStarter.
The Comic_comic__container element has 5 different comics, but if you pick the first one in the list it gets today's:
# XPath for the comic image: takes the first <img> src found under the
# Comic_comic__container div. The container holds several comics and,
# per the surrounding discussion, the first one is today's.
imageSearch = "(//div[contains(@class, 'Comic_comic__container')]//img/@src)[1]"
# XPath for the href of the "previous comic" navigation button.
prevSearch = '//a[contains(@class, "Controls_controls__button_previous")]/@href'
# indirectStarter is disabled: the latest comic is read from the base URL,
# so no latestSearch lookup is needed.
# starter = indirectStarter
help = 'Index format: yyyy/mm/dd'
Now I need to figure out how to rename the image (same issue that ComicsKingdom has).
The date is stored in ButtonCalendar_buttonCalendar class but now I need to figure out how to get the page 'data' to parse the date from the class.
comicskingdom.py name fix
from ..helpers import indirectStarter
from ..scraper import ParserScraper
import re
class ComicsKingdom(ParserScraper):
    """Scraper for comic strips hosted on comicskingdom.com.

    Downloaded images are named ``<shortname>_<yyyymmdd>.gif`` whenever the
    page URL carries a ``yyyy-mm-dd`` date; otherwise the inherited naming
    scheme is used.
    """

    partDiv = '//div[d:class("comic-reader-item")]'
    imageSearch = '//meta[@property="og:image"]/@content'
    prevSearch = partDiv + '[2]/@data-link'
    # NOTE(review): one user in the discussion had to disable this starter to
    # get downloads working - confirm against the live site.
    starter = indirectStarter
    help = 'Index format: yyyy-mm-dd'

    def __init__(self, name, path, lang=None):
        """Set up the scraper for one strip.

        :param name: short strip name; also used as the filename prefix.
        :param path: URL path of the strip below https://comicskingdom.com/.
        :param lang: optional language code overriding the inherited default.
        """
        super().__init__('ComicsKingdom/' + name)
        self.url = 'https://comicskingdom.com/' + path
        self.stripUrl = self.url + '/%s'
        self.latestSearch = f'//a[re:test(@href, "/{path}/[0-9-]+$")]'
        self.shortname = name
        if lang:
            self.lang = lang

    def namer(self, image_url, page_url):
        """Build the image filename from the date embedded in ``page_url``.

        :param image_url: URL of the image (only used by the fallback namer).
        :param page_url: URL of the comic page, e.g. ``.../2002-10-29``.
            May be empty or ``None``.
        :returns: ``<shortname>_<yyyymmdd>.gif`` when ``page_url`` contains a
            ``yyyy-mm-dd`` date, else the result of the inherited ``namer``.
        """
        # Function-scope import keeps this block self-contained; diagnostics
        # go through logging instead of being printed to stdout.
        import logging
        log = logging.getLogger(__name__)
        log.debug("Generating filename for %s", page_url)

        # Only run the regex on a real string: re.search() raises TypeError
        # when handed None, which would bypass the fallback below.
        date_match = None
        if page_url:
            date_match = re.search(r'(\d{4}-\d{2}-\d{2})(?:/|$)', page_url)
        if date_match:
            compact_date = date_match.group(1).replace('-', '')
            return f"{self.shortname}_{compact_date}.gif"

        # No date in the URL: defer to the inherited naming scheme.
        log.debug("Using default naming")
        return super().namer(image_url, page_url)

    def link_modifier(self, fromurl, tourl):
        """Rewrite the first ``//wp.`` in ``tourl`` to ``//``.

        :param fromurl: URL of the page the link was found on (unused).
        :param tourl: link target to rewrite.
        :returns: ``tourl`` with the first ``//wp.`` occurrence replaced.
        """
        return tourl.replace('//wp.', '//', 1)
gocomics.py ugly hack
#from ..helpers import indirectStarter
import re
from datetime import datetime
class GoComics(ParserScraper):
    """Scraper for comic strips hosted on www.gocomics.com.

    Downloaded images are named ``<shortname>_<yyyymmdd>.gif``. The date is
    taken from the page URL when it ends in ``/yyyy/mm/dd``; otherwise the
    page is fetched and the date is read from its calendar button.
    """

    url = 'https://www.gocomics.com/'
    # Selectors for the current site layout. The older selectors, together
    # with latestSearch / indirectStarter, are no longer used: the latest
    # comic is read directly from the strip's base URL.
    imageSearch = '//div[contains(@class, "ComicViewer_comicViewer")]//img[contains(@class, "Comic_comic__image")]/@src'
    prevSearch = '//a[contains(@class, "Controls_controls__button_previous")]/@href'
    help = 'Index format: yyyy/mm/dd'

    # A /yyyy/mm/dd date segment followed by a slash or the end of the URL.
    _URL_DATE_RE = re.compile(r'/(\d{4})/(\d{2})/(\d{2})(?:/|$)')
    # Text content of the calendar button, e.g. "Wednesday, April 16".
    _CALENDAR_RE = re.compile(
        r'<button[^>]*class="[^"]*ButtonCalendar_buttonCalendar[^"]*"[^>]*>([^<]+)'
    )

    def __init__(self, name, path, lang=None):
        """Set up the scraper for one strip.

        :param name: short strip name; also used as the filename prefix.
        :param path: URL path of the strip below https://www.gocomics.com/.
        :param lang: optional language code overriding the inherited default.
        """
        super().__init__('GoComics/' + name)
        self.session.add_throttle('www.gocomics.com', 1.0, 2.0)
        self.url = 'https://www.gocomics.com/' + path
        self.shortname = name
        if lang:
            self.lang = lang

    @staticmethod
    def _logger():
        """Return the logger used for this scraper's diagnostics."""
        # Function-scope import keeps this block self-contained.
        import logging
        return logging.getLogger(__name__)

    def _extract_date(self, data):
        """Extract the comic date from the calendar button in the page HTML.

        The button only shows weekday, month and day ("Wednesday, April 16"),
        so the year is inferred: the current year is tried first, then the
        previous one, and the first year in which that date falls on the
        stated weekday wins. If neither matches, the first year in which the
        date is valid at all is used.

        :param data: page content as ``str`` or ``bytes``; any other object
            is converted with ``str()``.
        :returns: the date as ``yyyymmdd``, or ``None`` when no calendar
            button is found or its text cannot be parsed.
        :raises ValueError: if ``data`` cannot be converted to text.
        """
        log = self._logger()
        if isinstance(data, bytes):
            data = data.decode('utf-8', errors='ignore')
        elif not isinstance(data, str):
            try:
                data = str(data)
            except Exception as err:
                raise ValueError("Failed to process page content") from err

        found = self._CALENDAR_RE.search(data)
        if not found:
            return None
        date_str = found.group(1).strip()
        log.debug("Found date string: %s", date_str)

        weekday_name = date_str.split(',', 1)[0].strip().lower()
        this_year = datetime.now().year
        fallback = None
        for year in (this_year, this_year - 1):
            try:
                # Parse with an explicit year: a yearless parse defaults to
                # 1900, so "February 29" is always rejected, and yearless
                # day-of-month parsing is deprecated in newer Pythons.
                candidate = datetime.strptime(
                    f"{date_str} {year}", "%A, %B %d %Y")
            except ValueError as err:
                log.debug("Date parsing failed for year %s: %s", year, err)
                continue
            # strptime does not cross-check the weekday against the date,
            # so compare explicitly to pick the right year around New Year.
            if candidate.strftime("%A").lower() == weekday_name:
                return candidate.strftime("%Y%m%d")
            if fallback is None:
                fallback = candidate
        return fallback.strftime("%Y%m%d") if fallback else None

    def namer(self, image_url, page_url):
        """Build the image filename from the comic's date.

        :param image_url: URL of the image (only used by the fallback namer).
        :param page_url: URL of the comic page. May be empty or ``None``.
        :returns: ``<shortname>_<yyyymmdd>.gif`` when a date can be found in
            ``page_url`` or on the fetched page, else the result of the
            inherited ``namer``.
        """
        log = self._logger()
        log.debug("Generating filename for %s", page_url)

        if page_url:
            # Take the date from the regex groups rather than splitting on
            # '/', so a trailing slash cannot shift the fields.
            url_date = self._URL_DATE_RE.search(page_url)
            if url_date:
                year, month, day = url_date.groups()
                return f"{self.shortname}_{year}{month}{day}.gif"

            # No date in the URL (e.g. the strip's base page): fetch the
            # page and read the date from its calendar button. Best effort
            # only - any failure falls through to the default naming.
            try:
                response = self.session.get(page_url)
                date_str = self._extract_date(response.text)
            except Exception as err:
                log.debug("Failed to fetch page content: %s", err)
            else:
                if date_str:
                    log.debug("Using extracted date for filename: %s",
                              date_str)
                    return f"{self.shortname}_{date_str}.gif"

        log.debug("Using default naming")
        return super().namer(image_url, page_url)
Thanks very much @deekboy! The only thing I had to change was to comment out the "starter = indirectStarter" line in your comicskingdom.py file changes. I can confirm that this does fix both ComicsKingdom and GoComics downloads.
For gocomics.py,
imageSearch = "(//div[contains(@class, 'ComicViewer_comicViewer')]//button//img/@src)[1]"
works more reliably. Appreciate the patch!
For gocomics.py, imageSearch = "(//div[contains(@class, 'ComicViewer_comicViewer')]//button//img/@src)[1]" works more reliably. Appreciate the patch!
FYI this will randomly pick the wrong comic on the page which is why I changed it. I have not had issues since I changed it.
Another quick update. I didn't realize extra parameters were being appended to the image URL, which makes the downloaded image blurry.
Add at the top from urllib.parse import urlparse
imageSearch stays the same: imageSearch = '//div[contains(@class, "ComicViewer_comicViewer")]//img[contains(@class, "Comic_comic__image")]/@src'
After the __init__ method, add:
def imageUrlModifier(self, url, data):
    """Strip the query string from an image URL.

    :param url: image URL; may be empty or ``None``.
    :param data: page data (unused).
    :returns: ``url`` up to, but excluding, its first ``?``; ``url``
        unchanged when it is empty, ``None`` or has no query string.
    """
    if not url:
        return url
    # Keep only what precedes the first '?'.
    base, _sep, _query = url.partition('?')
    return base
I tried editing dosagelib/plugins/comicskingdom.py and gocomics.py with what you have in the comments. It builds, but when I try to grab something like Wallace the Brave off of gocomics, I get an xpath error. GoComics/WallaceTheBrave> ERROR: XPath //div[contains(@class, "ComicViewer_comicViewer")]//img[contains(@class, "Comic_comic__image")]/@src not found at URL https://www.gocomics.com/wallace-the-brave.
I'm getting various errors as well starting this morning. Looks like something changed again on the GoComics site.
@deekboy?
Thanks @TobiX!
Thank you!