
Day 1 study check-in: scrape the links to the study articles first, in case GitHub goes down some day

djy1994327 opened this issue 8 months ago · 2 comments


GitHub probably won't go down, right?

djy1994327 · Mar 25 '25 09:03
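For reference, here is a minimal sketch of what that Day 1 link scrape could look like, assuming the target is the repository's own README page and that the article links render as ordinary <a href> tags inside an <article> element. The repo URL, CSS selector, and output filename below are assumptions for illustration, not taken from the original post:

import requests
from bs4 import BeautifulSoup

# Repository page whose README contains the article links (assumed target)
repo_url = "https://github.com/wistbean/learn_python3_spider"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
}

response = requests.get(repo_url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")

# Collect every external link found in the rendered README and save it locally
with open("article_links.txt", "w", encoding="utf-8") as f:
    for a in soup.select("article a[href]"):
        href = a.get("href")
        if href.startswith("http"):
            f.write(href + "\n")

The same requests + BeautifulSoup pattern is what the Day 3 script below builds on.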

This is an automatic vacation reply from QQ Mail. Hello, I am currently on vacation and cannot reply to your email in person. I will get back to you as soon as possible once my vacation ends.

Tany99 · Mar 25 '25 09:03

Day 3: scraping landscape wallpaper images with multithreading

""""
1. 先获取每一页中的图片组的连接
2. 每一组图片点进去,再获取下载按钮的连接
3. 点击下载按钮, 进入一个新的页面, 可以获取到高清图片的链接
4. 通过这个高清图片的链接下载图片
"""
import concurrent.futures
import os

import requests
from bs4 import BeautifulSoup

# Home page of the site being scraped
zhuye_url = "https://www.fengjingbizhi.com"


# Request a URL and return the response; factored out so it can be reused
def request_all(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
    }
    response_html = requests.get(url=url, headers=headers)
    return response_html


# Decode the returned page and initialize a BeautifulSoup object
def request_fengjing(url):
    response_html = request_all(url)
    html_text = response_html.content.decode('utf-8')
    bsoup = BeautifulSoup(html_text, 'lxml')
    # print(bsoup)
    return bsoup


# Join a relative URL onto the site's home page URL
def pingjie_url(banjie_url):
    wanzhen_url = zhuye_url + banjie_url
    return wanzhen_url


# URL of each image group on an index page
def suolvetu_pic_url(bsoup):
    main_list = bsoup.find_all(class_="main_list")
    wanzhen_url_list = []
    for item in main_list:
        # print(item.find("a").get("href"))
        banjie_url = item.find("a").get("href")
        wanzhen_url = pingjie_url(banjie_url)
        wanzhen_url_list.append(wanzhen_url)
    return wanzhen_url_list


# For each image group, fetch its detail page and extract the high-resolution image URL
def item_url(wanzhen_url_list):
    get_gaoqing_pic_url_list = []
    for i in wanzhen_url_list:
        bsoup = request_fengjing(i)
        banjie = bsoup.find(class_="details_r").find("a").get("a-url")
        get_gaoqing_pic_wanzheng_url = pingjie_url(banjie)
        get_gaoqing_pic_url_list.append(get_gaoqing_pic_wanzheng_url)
    return get_gaoqing_pic_url_list


# Download and save the images
def download_gaoqing_pic(get_gaoqing_pic_url_list):
    pic_pwd = "fengjing"
    os.makedirs(pic_pwd, exist_ok=True)
    for i in get_gaoqing_pic_url_list:
        id_name = i.split("=")[2]
        filename = "%s.jpg" % id_name
        file_path = os.path.join(pic_pwd, filename)
        print(file_path)
        with open(file_path, 'wb') as f:
            print(f"正在下载:id为{id_name}张图片,{filename},{i}", )
            img = request_all(i).content
            f.write(img)


# Fan the index-page URLs out to a thread pool; each worker runs the full pipeline for one page
def download_thread_pool(url_list, works):
    with concurrent.futures.ThreadPoolExecutor(works) as executor:
        for url in url_list:
            executor.submit(main, url)


# Entry point: scrape one index page end to end
def main(url):
    bsoup = request_fengjing(url)
    wanzhen_url_list = suolvetu_pic_url(bsoup)
    get_gaoqing_pic_url_list = item_url(wanzhen_url_list)
    download_gaoqing_pic(get_gaoqing_pic_url_list)


if __name__ == '__main__':
    url_list = []
    works = 5
    # Build the URL of every index page from the site's paging pattern and collect them
    for i in range(1, 80):
        url = f"https://www.fengjingbizhi.com/fengjing/index{'' if i <= 1 else '_' + str(i)}.html"
        url_list.append(url)
    download_thread_pool(url_list, works)


djy1994327 · Mar 27 '25 07:03