分析

原页面共收录了7584张图片，每页20条，共380页（前379页各20条，最后一页只有4条）。

第一页的链接： https://usdawatercolors.nal.usda.gov/pom/search.xhtml?start=0 ；第二页的链接： https://usdawatercolors.nal.usda.gov/pom/search.xhtml?start=20 ；以此类推，第 n 页的 start 参数为 (n-1)×20，规律比较简单。

每条数据的HTML元素布局如下：

我们可以获取到：

artist
year
scientific name
common name
缩略图的url

点击图片进入到详情页面：

点击Download high resolution按钮，我们就可以获取到原图了。

但是这样的话就意味着每张图都要打开一个新的页面，后来发现缩略图的url和原图的url有关联：

缩略图url， ../download/POM00007435/thumbnail
原图url， https://usdawatercolors.nal.usda.gov/pom/download.xhtml?id=POM00007435

我们只要从缩略图的url中获取到POM00007435，就可以拼出对应的原图地址了。

爬虫

依赖

requests
beautifulsoup4

源码

1. 循环380次，对应380页
2. 每个页面获取20条记录对应的html标签
3. 对于每个html标签：
   - 获取artist，year等信息
   - 从缩略图url拼出对应的原图url
   - 下载原图，保存到本地

import requests
from bs4 import BeautifulSoup

# Directory prefix (relative path, with trailing slash) under which downloaded
# images are written; it is joined with the generated file name when saving.
IMG_FOLDER = 'fruit_images/'


def _link_text(doc, selector):
    """Return the text of the element matched by *selector* in *doc*, or 'none' if absent."""
    node = doc.select_one(selector) if doc is not None else None
    return 'none' if node is None else node.get_text()


def run(page_count=380):
    """Crawl the search result pages and download the original image of every record.

    For each result page the function parses every ``div.document`` entry,
    extracts artist, year, scientific name, common name and the thumbnail
    URL, wraps them in a ``FruitInfo`` and calls its ``download_and_save``.

    Args:
        page_count: Number of result pages to walk, starting from the first
            one. Defaults to 380, the page count hard-coded by the original
            script.

    Records are numbered sequentially from 1 across pages
    (``page_index * 20 + position_on_page + 1``). A page that does not
    answer with a successful HTTP status, or a record without a thumbnail
    image, is reported on stdout and skipped so the crawl keeps going.
    """
    page_size = 20  # the search URL is paged via start=0, 20, 40, ...
    url_template = ('https://usdawatercolors.nal.usda.gov/pom/search.xhtml'
                    '?start={}&searchText=&searchField=&sortField=')

    for page_idx in range(page_count):
        # A timeout keeps a stalled connection from hanging the whole crawl.
        resp = requests.get(url_template.format(page_idx * page_size), timeout=60)
        if not resp.ok:
            print('skip page {}: HTTP {}'.format(page_idx, resp.status_code))
            continue

        soup = BeautifulSoup(resp.text, 'html.parser')
        for div_idx, div in enumerate(soup.select('div.document')):
            doc = div.select_one('dl.defList')
            # Some records lack one or more fields; use 'none' instead of
            # terminating the crawl.
            artist = _link_text(doc, ':nth-child(2)>a')
            year = _link_text(doc, ':nth-child(4)>a')
            scientific_name = _link_text(doc, ':nth-child(6)>a')
            common_name = _link_text(doc, ':nth-child(8)>a')

            thumb = div.select_one('div.thumb-frame>a>img')
            if thumb is None or not thumb.get('src'):
                print('skip record {} on page {}: no thumbnail'.format(div_idx, page_idx))
                continue

            # 1-based running number over all pages (first record is 1).
            record_id = page_idx * page_size + div_idx + 1
            info = FruitInfo(record_id, artist, year, scientific_name, common_name, thumb['src'])
            print(info)
            info.download_and_save()


class FruitInfo:
    """Metadata of one search-result record plus the logic to fetch its original image.

    Attributes are stored exactly as passed to the constructor:
    ``id``, ``artist``, ``year``, ``scientific_name``, ``common_name`` and
    ``thumb_pic_url`` (the thumbnail ``src``, e.g.
    ``../download/POM00007435/thumbnail``).
    """

    def __init__(self, id, artist, year, scientific_name, common_name, thumb_pic_url):
        self.id = id
        self.artist = artist
        self.year = year
        self.scientific_name = scientific_name
        self.common_name = common_name
        self.thumb_pic_url = thumb_pic_url

    def _build_filename(self):
        """Return the target file name ``<id>-<common_name>-<year>-<artist>.png``.

        Spaces are replaced by underscores; path separators are replaced as
        well so that scraped text can never redirect the output path.
        """
        name = '{}-{}-{}-{}.png'.format(self.id, self.common_name, self.year, self.artist)
        for unsafe in (' ', '/', '\\'):
            name = name.replace(unsafe, '_')
        return name

    def download_and_save(self):
        """Download the original image and write it into ``IMG_FOLDER``.

        The folder is created when missing. If the server does not answer
        with a successful HTTP status, nothing is written and the record is
        reported as skipped, so an error page is never stored as an image.
        """
        import os  # local import: only this method needs it

        filename = self._build_filename()
        print('filename = ', filename)
        ori_img_url = self.__parse_ori_img_url()
        print('original img url = ', ori_img_url)
        # A timeout keeps a stalled connection from blocking forever.
        resp = requests.get(ori_img_url, timeout=60)
        if not resp.ok:
            print('download failed (HTTP {}), skipped...'.format(resp.status_code), filename)
            return
        os.makedirs(IMG_FOLDER, exist_ok=True)
        with open(os.path.join(IMG_FOLDER, filename), 'wb') as f:
            f.write(resp.content)
        print('saved...', filename)

    def __parse_ori_img_url(self) -> str:
        """Build the original-image URL from the image id embedded in the thumbnail URL.

        The id is the path segment following ``download``; this works for the
        relative form ``../download/<id>/thumbnail`` as well as for absolute
        URLs. If no ``download`` segment is present, the third segment is
        used, as the original fixed-position lookup did.
        """
        parts = self.thumb_pic_url.split('/')
        if 'download' in parts[:-1]:
            img_id = parts[parts.index('download') + 1]
        else:
            img_id = parts[2]
        print('img id = ', img_id)
        return 'https://usdawatercolors.nal.usda.gov/pom/download.xhtml?id={}'.format(img_id)

    def __str__(self):
        """Return a readable one-line summary of the record (the id is not included)."""
        return 'FruitInfo(artist={},year={},scientific_name={},common_name={},thumb_pic_url={})'.format(
            self.artist,
            self.year,
            self.scientific_name,
            self.common_name,
            self.thumb_pic_url)


# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    run()