icrawler
icrawler copied to clipboard
Crawl only jpg files
Hi there!
When I use GoogleImageCrawler, I sometimes get PNG files and sometimes JPG files, depending on what Google finds.
Is there a way to configure the crawler to only download jpg files and no other file types?
You could override the parser's `parse` method for now, for example:
class GoogleParser(Parser):
    """Parser for Google image-search result pages that keeps only JPEG URLs.

    Instead of decoding the structured payload embedded in the page, this
    parser scans the relevant ``<script>`` element as plain text and collects
    every URL that ends in ``.jpg`` or ``.jpeg``.
    """

    # Match from ``http`` up to and including the first ``.jpg``/``.jpeg``.
    # Double quotes and whitespace are excluded so a match cannot leave the
    # quoted string it started in; an unrestricted ``.*?`` could begin at a
    # non-JPEG URL and run on through the separators into a later JPEG URL,
    # producing one bogus combined "URL".
    _JPEG_URL_RE = re.compile(r'http[^"\s]*?\.(?:jpg|jpeg)')

    def parse(self, response):
        """Extract JPEG image URLs from a search result page.

        Args:
            response: Object whose ``content`` attribute holds the page body
                as bytes (presumably a ``requests`` response -- confirm
                against the caller). It is decoded as UTF-8, ignoring
                undecodable bytes.

        Returns:
            A list of ``{'file_url': <url>}`` dicts taken from the first
            ``<script>`` element that mentions ``AF_initDataCallback`` and
            ``ds:1`` but not ``ds:0``. The list is empty when no such script
            exists or when it contains no JPEG URLs.
        """
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        for script in soup.find_all(name='script'):
            # Use the full rendered tag rather than ``.text`` so the check
            # works regardless of how the parser exposes script contents.
            txt = str(script)
            if 'AF_initDataCallback' not in txt:
                continue
            # Only the 'ds:1' callback is of interest; skip 'ds:0' blocks.
            if 'ds:0' in txt or 'ds:1' not in txt:
                continue
            uris = self._JPEG_URL_RE.findall(txt)
            return [{'file_url': uri} for uri in uris]
        # No matching script: return an empty list instead of an implicit
        # None so callers can always iterate over the result.
        return []
Okay thx, but I thought there would be an easier way.