daily-share icon indicating copy to clipboard operation
daily-share copied to clipboard

利用nodejs 写一个简单的小爬虫爬图片(2021-6-1)

Open yaogengzhu opened this issue 3 years ago • 3 comments

仅做学习参考

  • node
  • axios
  • cheerio
  • iconv-lite (解决中文gbk乱码问题)
const fs = require('fs')
const path = require('path')
const axios = require('axios')
const cheerio = require('cheerio')
const iconv = require('iconv-lite')

/**
 * Downloads one listing page and returns its HTML decoded from GBK.
 * Also makes sure the output folder `第<pageNumber>页` exists next to this script.
 *
 * @param {number} pageNumber - Number inserted into the `index_<n>.html` listing URL.
 * @returns {Promise<string>} The decoded page HTML.
 * @throws Rejects when the HTTP request or the directory creation fails.
 */
const getPageHtml = async (pageNumber) => {
    // Alternative category: `https://pic.netbian.com/4kqiche/index_${pageNumber}.html`
    const pageUrl = `https://pic.netbian.com/4kdongman/index_${pageNumber}.html`
    // Request raw bytes so the GBK body can be decoded by iconv instead of being
    // mis-decoded as UTF-8; any network error rejects the returned promise.
    const res = await axios.get(pageUrl, { responseType: 'arraybuffer' })
    // `recursive: true` keeps a second run from failing because the folder already exists.
    await fs.promises.mkdir(path.resolve(__dirname, `./第${pageNumber}页`), { recursive: true })
    return iconv.decode(Buffer.from(res.data), 'gbk')
}

/**
 * Parses one listing page and downloads every thumbnail found on it,
 * one after another with a pause before each download.
 *
 * @param {number} pageNumber - Listing page to process; also selects the output folder.
 * @returns {Promise<void>} Settles after every download of the page has been attempted.
 * @throws Rejects only when the listing page itself cannot be fetched.
 */
const getPageData = async (pageNumber) => {
    const result = await getPageHtml(pageNumber)
    const $ = cheerio.load(result, { decodeEntities: false })
    const images = []
    $('.clearfix > li > a > img').each((i, element) => {
        const imgUrl = $(element).attr('src')
        if (!imgUrl) return // nothing to download without a source
        const title = ($(element).attr('alt') || '').replace(/\s+/g, '')
        // Fall back to the file name so a missing alt text cannot produce a nameless file.
        images.push({ imgUrl, title: title || path.basename(imgUrl, path.extname(imgUrl)) })
    })
    // Sequential on purpose: the pause only throttles when each download waits its turn.
    for (const { imgUrl, title } of images) {
        await sleep()
        try {
            await downloadPageImg(imgUrl, title, pageNumber)
        } catch (err) {
            // One broken image must not abort the rest of the page.
            console.error(`第${pageNumber}页图片下载失败: ${imgUrl}`, err)
        }
    }
}

/**
 * Downloads one image into the folder `第<pageNumber>页` next to this script.
 *
 * @param {string} imgUrl - Site-relative image path; it is appended to the site origin.
 * @param {string} title - File name (without extension) for the saved image.
 * @param {number} pageNumber - Listing page number, selects the target folder.
 * @returns {Promise<boolean>} `true` once the file is fully written, `false` when the
 *   download failed (the failure is logged; the promise never rejects).
 */
async function downloadPageImg(imgUrl, title, pageNumber) {
    const url = 'https://pic.netbian.com' + imgUrl
    // The title is scraped text: keep path separators from escaping the target folder.
    const safeTitle = String(title).replace(/[\\/]/g, '_')
    const imgPath = path.join(
        path.resolve(__dirname, `./第${pageNumber}页`),
        `${safeTitle}${path.extname(url)}`
    )
    try {
        const res = await axios.get(url, { responseType: 'stream' })
        await new Promise((resolve, reject) => {
            // Open the file only after the response arrived, so a failed request leaves no empty file.
            const ws = fs.createWriteStream(imgPath)
            ws.on('finish', resolve)
            ws.on('error', reject)
            res.data.on('error', (err) => {
                ws.destroy()
                reject(err)
            })
            res.data.pipe(ws)
        })
        // Logged only after 'finish', i.e. when the data really is on disk.
        console.log(imgPath + '写入完成')
        return true
    } catch (err) {
        console.error(`${imgPath} 下载失败:`, err)
        return false
    }
}


/**
 * Crawls a range of listing pages one after another, pausing before each page.
 *
 * @param {number} [firstPage=2] - First page number to crawl.
 *   NOTE(review): the default starts at 2, presumably because page 1 uses a different URL — confirm.
 * @param {number} [lastPage=10] - Last page number to crawl (inclusive).
 * @returns {Promise<void>} Settles after every page has been attempted; never rejects.
 */
const getMorePageData = async (firstPage = 2, lastPage = 10) => {
    for (let i = firstPage; i <= lastPage; i++) {
        // Awaiting inside the loop is what spaces the requests out; scheduling every
        // timer up front would release all pages at the same moment.
        await sleep()
        try {
            await getPageData(i)
        } catch (err) {
            // Keep crawling the remaining pages when one page fails.
            console.error(`第${i}页抓取失败:`, err)
        }
    }
}

/**
 * Returns a promise that fulfils (with no value) after `timer` milliseconds.
 *
 * @param {number} [timer=1000] - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (timer = 1000) => new Promise((done) => setTimeout(done, timer))
// Script entry point: start the crawl as soon as the file is run.
getMorePageData()

yaogengzhu avatar Jun 01 '21 11:06 yaogengzhu

升级版,高清图片

const fs = require('fs')
const path = require('path')
const axios = require('axios')
const cheerio = require('cheerio')
const iconv = require('iconv-lite')

// Upgraded version (high-resolution images)
/**
 * Downloads one listing page and returns its HTML decoded from GBK.
 * Also makes sure the output folder `第<pageNumber - 1>页` exists next to this script.
 *
 * @param {number} pageNumber - Number inserted into the `index_<n>.html` listing URL.
 * @returns {Promise<string>} The decoded page HTML.
 * @throws Rejects when the HTTP request or the directory creation fails.
 */
const getPageHtml = async (pageNumber) => {
    // Could be extended to crawl further categories, e.g.
    // `https://pic.netbian.com/4kqiche/index_${pageNumber}.html`
    const pageUrl = `https://pic.netbian.com/4kdongman/index_${pageNumber}.html`
    // Request raw bytes so the GBK body can be decoded by iconv; network errors reject the promise.
    const res = await axios.get(pageUrl, { responseType: 'arraybuffer' })
    await fs.promises.mkdir(path.resolve(__dirname, `./第${pageNumber - 1}页`), { recursive: true })
    return iconv.decode(Buffer.from(res.data), 'gbk')
}

/**
 * Fetches an image detail page (the extra hop that exposes the high-resolution image)
 * and returns its HTML decoded from GBK.
 *
 * @param {string} linkUrl - Absolute URL of the detail page.
 * @returns {Promise<string>} The decoded page HTML.
 * @throws Rejects when the HTTP request fails.
 */
const getImageDetailHtml = async (linkUrl) => {
    // Raw bytes instead of a hand-collected stream: a connection error now rejects
    // the promise instead of leaving it pending forever.
    const res = await axios.get(linkUrl, { responseType: 'arraybuffer' })
    return iconv.decode(Buffer.from(res.data), 'gbk')
}

/**
 * Parses one listing page, follows every entry to its detail page and downloads
 * the image found there. Entries are handled one at a time with a pause before each.
 *
 * @param {number} pageNumber - Listing page to process; also selects the output folder.
 * @returns {Promise<void>} Settles after every entry of the page has been attempted.
 * @throws Rejects only when the listing page itself cannot be fetched.
 */
const getPageData = async (pageNumber) => {
    const result = await getPageHtml(pageNumber)
    const $ = cheerio.load(result, { decodeEntities: false })
    const links = []
    $('.clearfix > li > a ').each((i, element) => {
        const linkUrl = $(element).attr('href')
        if (linkUrl) links.push(linkUrl)
    })
    // A plain loop instead of an async callback inside `.each`: cheerio does not wait
    // for async callbacks, so their errors were unhandled and nothing was throttled.
    for (const linkUrl of links) {
        await sleep()
        try {
            // Resolve against the origin so a leading slash does not produce `//` in the URL.
            const detailUrl = new URL(linkUrl, 'https://pic.netbian.com/').href
            // Second fetch: the detail page carries the full-size image.
            const res = await getImageDetailHtml(detailUrl)
            const $1 = cheerio.load(res, { decodeEntities: false })
            const img = $1('.photo-pic > a >img')
            const imgUrl = img.attr('src')
            if (!imgUrl) {
                console.error(`未找到图片: ${detailUrl}`)
                continue
            }
            // Fall back to the file name when the page offers no title.
            const title = (img.attr('title') || '').replace(/\s+/g, '') ||
                path.basename(imgUrl, path.extname(imgUrl))
            await downloadPageImg(imgUrl, title, pageNumber)
        } catch (err) {
            // One broken entry must not abort the rest of the page.
            console.error(`第${pageNumber}页图片处理失败: ${linkUrl}`, err)
        }
    }
}

/**
 * Downloads one image into the folder `第<pageNumber - 1>页` next to this script.
 *
 * @param {string} imgUrl - Site-relative image path; it is appended to the site origin.
 * @param {string} title - File name (without extension) for the saved image.
 * @param {number} pageNumber - Listing page number, selects the target folder.
 * @returns {Promise<boolean>} `true` once the file is fully written, `false` when the
 *   download failed (the failure is logged; the promise never rejects).
 */
async function downloadPageImg(imgUrl, title, pageNumber) {
    const url = 'https://pic.netbian.com' + imgUrl
    // The title is scraped text: keep path separators from escaping the target folder.
    const safeTitle = String(title).replace(/[\\/]/g, '_')
    const imgPath = path.join(
        path.resolve(__dirname, `./第${pageNumber - 1}页`),
        `${safeTitle}${path.extname(url)}`
    )
    try {
        const res = await axios.get(url, { responseType: 'stream' })
        await new Promise((resolve, reject) => {
            // Open the file only after the response arrived, so a failed request leaves no empty file.
            const ws = fs.createWriteStream(imgPath)
            // Settle on 'finish', not right after pipe(): pipe() only starts the transfer.
            ws.on('finish', resolve)
            ws.on('error', reject)
            res.data.on('error', (err) => {
                ws.destroy()
                reject(err)
            })
            res.data.pipe(ws)
        })
        console.log(imgPath + '写入完成')
        return true
    } catch (err) {
        console.error(`${imgPath} 下载失败:`, err)
        return false
    }
}


/**
 * Crawls a range of listing pages one after another, pausing before each page.
 *
 * @param {number} [firstPage=2] - First page number to crawl.
 *   NOTE(review): the default starts at 2, presumably because page 1 uses a different URL — confirm.
 * @param {number} [lastPage=15] - Last page number to crawl (inclusive).
 * @returns {Promise<void>} Settles after every page has been attempted; never rejects.
 */
const getMorePageData = async (firstPage = 2, lastPage = 15) => {
    for (let i = firstPage; i <= lastPage; i++) {
        // Awaiting inside the loop is what spaces the requests out; scheduling every
        // timer up front would release all pages at the same moment.
        await sleep()
        try {
            await getPageData(i)
        } catch (err) {
            // Keep crawling the remaining pages when one page fails.
            console.error(`第${i}页抓取失败:`, err)
        }
    }
}

/**
 * Returns a promise that fulfils (with no value) after `time` milliseconds.
 *
 * @param {number} [time=500] - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep (time = 500) {
    return new Promise((wake) => {
        setTimeout(() => {
            wake()
        }, time)
    })
}
// Script entry point: start the crawl as soon as the file is run.
getMorePageData()

yaogengzhu avatar Jun 01 '21 12:06 yaogengzhu

读取淘宝详情页 —— 直接在线请求好像会被拦截，可以先把页面保存到本地，再从本地文件读取

const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");
const axios = require("axios");

// Output folders, all resolved relative to this script's directory:
// the product folder itself ...
const root = path.resolve(__dirname, "./宝贝");
// ... and one sub-folder each for the main images, the detail images and the colour swatches.
const mainRoot = path.resolve(__dirname, "./宝贝/主图");
const detailRoot = path.resolve(__dirname, "./宝贝/详情");
const colorRoot = path.resolve(__dirname, "./宝贝/颜色");

/**
 * Reads the locally saved product page `html/page.html` (relative to this script) as UTF-8.
 *
 * @returns {Promise<string>} The file contents; rejects with the read error on failure.
 */
function getPageInfo() {
  return new Promise((resolve, reject) => {
    const pageFile = path.resolve(__dirname, "./html/page.html");
    const onRead = (err, result) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(result);
    };
    fs.readFile(pageFile, "utf8", onRead);
  });
}

/**
 * Creates a directory, including missing parents; an existing directory is not an error.
 *
 * @param {string} pathUrl - Directory path to create.
 * @returns {Promise<void>} Fulfils once the directory exists; rejects with the mkdir error.
 */
function createDir(pathUrl) {
  return new Promise((resolve, reject) => {
    fs.mkdir(pathUrl, { recursive: true }, (err) => {
      if (err) {
        console.error(err);
        reject(err);
        // Stop here so a failure is not followed by the success message.
        return;
      }
      console.log(pathUrl, "文件夹新建成功");
      resolve();
    });
  });
}

// Turns a protocol-relative address ("//host/path") into a fetchable http URL;
// addresses that already carry a scheme are returned untouched.
function normalizeImgUrl(src) {
  return src.startsWith("//") ? `http:${src}` : src;
}

/**
 * Creates the output folders, parses the saved product page and collects the image URLs.
 * As a side effect the product title and the option texts are appended to the info file
 * through `writeTextInfo`.
 *
 * @returns {Promise<{detail: string[], colorList: {title: string, img: string}[], main: string[]}>}
 *   `main`: gallery images (upsized to 800x800), `detail`: description images,
 *   `colorList`: one entry per option that has a swatch image.
 * @throws Rejects when a folder cannot be created or the page cannot be read.
 */
async function getImgInfo() {
  await createDir(root);
  await createDir(mainRoot);
  await createDir(detailRoot);
  await createDir(colorRoot);
  const result = await getPageInfo();

  const detail = [];
  const colorList = [];
  const main = [];
  const $ = cheerio.load(result, { decodeEntities: false });
  // The product title is always the first line of the info file.
  const textList = [$("h1").text().trim()];

  $("#J_UlThumb > li > a >img").each((i, e) => {
    const src = $(e).attr("src");
    if (!src) return;
    // Swap the 60x60 thumbnail suffix for the 800x800 rendition.
    main.push(normalizeImgUrl(src.replace("jpg_60x60q90", "jpg_800x800q90")));
  });
  $(" #description .content >p >img").each((i, e) => {
    const src = $(e).attr("src");
    if (!src) return;
    detail.push(normalizeImgUrl(src));
  });
  $(".J_TSaleProp.tb-img > li > a").each((i, e) => {
    const text = $(e).children().text();
    textList.push(text);
    // The swatch is an inline `background:url(...)`; options without one are text-only.
    const match = /url\(\s*["']?([^"')\s]+)/.exec($(e).attr("style") || "");
    if (!match) return;
    colorList.push({
      // Keep only letters, digits and CJK characters so the title is a safe file name;
      // fall back to an index-based name rather than an empty one.
      title: text.replace(/[^a-zA-Z0-9\u4E00-\u9FA5]/g, "") || `color${i}`,
      img: normalizeImgUrl(match[1].replace("jpg_40x40q90", "jpg_800x800q90")),
    });
  });

  // One append instead of one per line: parallel appends do not guarantee line order.
  writeTextInfo(textList.join("\n"));
  return { detail, colorList, main };
}

/**
 * Appends `data` plus a newline to `info.text` inside the `root` output folder.
 *
 * @param {string} data - Text to append.
 * @returns {Promise<boolean>} `true` when the text was written, `false` when the write
 *   failed (the error is logged; the promise never rejects).
 */
function writeTextInfo(data) {
  const infoFile = path.join(root, 'info.text');
  return new Promise((resolve) => {
    fs.writeFile(infoFile, data + '\n', { flag: 'a', encoding: 'utf8' }, (err) => {
      if (err) {
        console.error(err);
        // Report the failure instead of falling through to the success message.
        resolve(false);
        return;
      }
      console.log(data, '写入成功');
      resolve(true);
    });
  });
}
/**
 * Entry point of the script: collects the image URLs from the saved page and starts
 * the downloads for detail, main and colour images.
 *
 * @returns {Promise<void>} Never rejects; a failure while collecting is logged instead,
 *   because the function is invoked without a rejection handler.
 */
async function getImgList() {
  try {
    const { detail, colorList, main } = await getImgInfo();
    forEachGetImg(detail, detailRoot);
    forEachGetImg(main, mainRoot);
    forEachGetOtherImg(colorList, colorRoot);
  } catch (err) {
    console.error("抓取失败:", err);
  }
}

/**
 * Starts a download for every URL in `list`, passing the element's position along
 * so each file gets its own number.
 *
 * @param {string[]} list - Image URLs.
 * @param {string} currentP - Target directory handed to `downloadPageImg`.
 */
function forEachGetImg(list, currentP) {
  const startDownload = (url, position) => {
    downloadPageImg(url, position, currentP);
  };
  list.forEach(startDownload);
}
/**
 * Starts a download for every colour entry, using the entry's title as the file name.
 *
 * @param {{img: string, title: string}[]} list - Colour entries.
 * @param {string} currentP - Target directory handed to `downloadPageColorImg`.
 */
function forEachGetOtherImg(list, currentP) {
  const startDownload = (entry) => {
    downloadPageColorImg(entry.img, entry.title, currentP);
  };
  list.forEach(startDownload);
}


/**
 * Downloads one image and stores it as `detail<pageNumber>.jpg` inside `currentP`.
 *
 * @param {string} imgUrl - Absolute image URL.
 * @param {number} pageNumber - Number used in the file name (callers pass the list index).
 * @param {string} currentP - Existing directory to write into.
 * @returns {Promise<boolean>} `true` once the file is fully written, `false` when the
 *   download failed (the failure is logged; the promise never rejects).
 */
async function downloadPageImg(imgUrl, pageNumber, currentP) {
  const imgPath = path.join(currentP, `detail${pageNumber}.jpg`);
  try {
    const res = await axios.get(imgUrl, { responseType: "stream" });
    await new Promise((resolve, reject) => {
      // Open the file only after the response arrived, so a failed request leaves no empty file.
      const ws = fs.createWriteStream(imgPath);
      ws.on("finish", resolve);
      ws.on("error", reject);
      res.data.on("error", (err) => {
        ws.destroy();
        reject(err);
      });
      res.data.pipe(ws);
    });
    // Logged only after 'finish', i.e. when the data really is on disk.
    console.log(imgPath + "写入完成");
    return true;
  } catch (err) {
    console.error(`${imgPath} 下载失败:`, err);
    return false;
  }
}

/**
 * Downloads one colour swatch and stores it as `<title>.jpg` inside `currentP`.
 *
 * @param {string} imgUrl - Absolute image URL.
 * @param {string} title - File name (without extension) for the saved image.
 * @param {string} currentP - Existing directory to write into.
 * @returns {Promise<boolean>} `true` once the file is fully written, `false` when the
 *   download failed (the failure is logged; the promise never rejects).
 */
async function downloadPageColorImg(imgUrl, title, currentP) {
  // The title is scraped text: keep path separators from escaping the target folder.
  const safeTitle = String(title).replace(/[\\/]/g, "_");
  const imgPath = path.join(currentP, `${safeTitle}.jpg`);
  try {
    const res = await axios.get(imgUrl, { responseType: "stream" });
    await new Promise((resolve, reject) => {
      // Open the file only after the response arrived, so a failed request leaves no empty file.
      const ws = fs.createWriteStream(imgPath);
      ws.on("finish", resolve);
      ws.on("error", reject);
      res.data.on("error", (err) => {
        ws.destroy();
        reject(err);
      });
      res.data.pipe(ws);
    });
    // Logged only after 'finish', i.e. when the data really is on disk.
    console.log(imgPath + "写入完成");
    return true;
  } catch (err) {
    console.error(`${imgPath} 下载失败:`, err);
    return false;
  }
}

// Script entry point: collect the image URLs and start the downloads when the file is run.
getImgList();

yaogengzhu avatar Jun 04 '21 08:06 yaogengzhu

补充下异步等待

// Sketch of the suggested pattern: put the asynchronous step inside an `async`
// function and `await` it, so the statements after it run only once it has settled.
// `异步` ("asynchronous") is presumably a placeholder for any promise-returning
// call — it is not defined anywhere in this thread.
async function dosomething() {
    await 异步
}

yaogengzhu avatar Sep 27 '21 10:09 yaogengzhu