daily-share
daily-share copied to clipboard
利用 Node.js 写一个简单的图片小爬虫（2021-6-1）
仅做学习参考
- node
- axios
- cheerio
- iconv-lite（解决中文 GBK 乱码问题）
const fs = require('fs')
const path = require('path')
const axios = require('axios')
const cheerio = require('cheerio')
const iconv = require('iconv-lite')
/**
 * Downloads one listing page of the gallery and returns its HTML as a string.
 *
 * The response is read as a raw byte stream and decoded as GBK with
 * iconv-lite. As a side effect the output folder `第<pageNumber>页` is created
 * next to this script (an already existing folder is not an error).
 *
 * @param {number} pageNumber - Index used in the `index_<n>.html` listing URL.
 * @returns {Promise<string>} The decoded HTML of the listing page.
 * @throws Rejects when the request, the folder creation, or the stream fails.
 */
const getPageHtml = async (pageNumber) => {
  // Other categories follow the same URL scheme, e.g.
  // https://pic.netbian.com/4kqiche/index_<n>.html
  const BASER_URL = `https://pic.netbian.com/4kdongman/index_${pageNumber}.html`;
  const pageDir = path.resolve(__dirname, `./第${pageNumber}页`);
  const res = await axios.get(BASER_URL, { responseType: 'stream' });

  // Attach the stream listeners right away so a network error can never go
  // unobserved while the folder is being created.
  const readBody = new Promise((resolve, reject) => {
    const chunks = [];
    res.data.on('data', (chunk) => {
      chunks.push(chunk);
    });
    res.data.on('error', reject);
    res.data.on('end', () => {
      resolve(iconv.decode(Buffer.concat(chunks), 'gbk'));
    });
  });

  // `recursive: true` makes the call succeed when the folder already exists,
  // so the script can be re-run without failing with EEXIST.
  const [html] = await Promise.all([
    readBody,
    fs.promises.mkdir(pageDir, { recursive: true }),
  ]);
  return html;
};
/**
 * Scrapes one listing page and downloads every thumbnail found on it.
 *
 * Downloads run one after another with a pause before each, so the remote
 * server is not hit by a burst of simultaneous requests.
 *
 * @param {number} pageNumber - Listing page to process.
 * @returns {Promise<void>} Settles after every image has been attempted.
 * @throws Rejects when the listing page itself cannot be fetched.
 */
const getPageData = async (pageNumber) => {
  const html = await getPageHtml(pageNumber);
  const $ = cheerio.load(html, { decodeEntities: false });

  // Collect first, download afterwards: cheerio's `.each` does not wait for
  // asynchronous callbacks, so pausing inside it would not throttle anything.
  const images = [];
  $('.clearfix > li > a > img').each((i, element) => {
    images.push({
      imgUrl: $(element).attr('src'),
      title: $(element).attr('alt'),
    });
  });

  for (const [index, { imgUrl, title }] of images.entries()) {
    if (!imgUrl) {
      continue;
    }
    // Fall back to a positional name when the image has no usable `alt` text.
    const fileTitle = (title || '').replace(/\s+/g, '') || `image-${index}`;
    await sleep();
    try {
      await downloadPageImg(imgUrl, fileTitle, pageNumber);
    } catch (err) {
      // One broken image must not abort the rest of the page.
      console.error(`Failed to download ${imgUrl}:`, err);
    }
  }
};
/**
 * Downloads a single image into the folder of the page it was found on.
 *
 * @param {string} imgUrl - Site-relative image path (appended to the site origin).
 * @param {string} title - File name without extension.
 * @param {number} pageNumber - Page whose folder `第<pageNumber>页` receives the file.
 * @returns {Promise<void>} Resolves once the file has been completely written.
 * @throws Rejects when the request, the response stream, or the file write fails.
 */
async function downloadPageImg(imgUrl, title, pageNumber) {
  const BASER_URL = `https://pic.netbian.com${imgUrl}`;
  const extendName = path.extname(BASER_URL);
  const imgPath = path.join(
    path.resolve(__dirname, `./第${pageNumber}页`),
    `${title}${extendName}`,
  );

  const res = await axios.get(BASER_URL, { responseType: 'stream' });

  // The write stream is opened only after the request succeeded, so a failed
  // request does not leave an empty file behind.
  await new Promise((resolve, reject) => {
    const ws = fs.createWriteStream(imgPath);
    ws.on('finish', resolve);
    ws.on('error', (err) => {
      res.data.destroy();
      reject(err);
    });
    res.data.on('error', (err) => {
      ws.destroy();
      reject(err);
    });
    res.data.pipe(ws);
  });

  // Logged after 'finish', i.e. when the data really is on disk.
  console.log(imgPath + '写入完成');
}
/**
 * Crawls a range of listing pages one after another.
 *
 * Each page is preceded by a pause and fully awaited before the next one
 * starts. A page that fails is logged and skipped so the crawl continues.
 *
 * @param {number} [firstPage=2] - First page index to crawl.
 *   NOTE(review): the default starts at 2, presumably because page 1 does not
 *   use the `index_<n>.html` URL scheme; confirm against the site.
 * @param {number} [lastPage=10] - Last page index to crawl (inclusive).
 * @returns {Promise<void>} Resolves after every page has been attempted.
 */
const getMorePageData = async (firstPage = 2, lastPage = 10) => {
  for (let i = firstPage; i <= lastPage; i++) {
    await sleep();
    try {
      await getPageData(i);
    } catch (err) {
      console.error(`Failed to crawl page ${i}:`, err);
    }
  }
};
/**
 * Returns a promise that fulfils (with no value) after a delay.
 *
 * @param {number} [timer=1000] - Delay in milliseconds.
 * @returns {Promise<void>} Fulfilled once the delay has elapsed.
 */
const sleep = (timer = 1000) =>
  new Promise((done) => {
    setTimeout(done, timer);
  });
// Entry point: start crawling as soon as the script is run.
getMorePageData()
升级版：进入图片详情页，抓取高清图片
const fs = require('fs')
const path = require('path')
const axios = require('axios')
const cheerio = require('cheerio')
const iconv = require('iconv-lite')
// 升级版
/**
 * Downloads one listing page of the gallery and returns its HTML as a string.
 *
 * The response is read as a raw byte stream and decoded as GBK with
 * iconv-lite. As a side effect the output folder `第<pageNumber - 1>页` is
 * created next to this script.
 *
 * @param {number} pageNumber - Index used in the `index_<n>.html` listing URL.
 * @returns {Promise<string>} The decoded HTML of the listing page.
 * @throws Rejects when the request, the folder creation, or the stream fails.
 */
const getPageHtml = async (pageNumber) => {
  // The crawler could be extended to other categories; they follow the same
  // URL scheme, e.g. https://pic.netbian.com/4kqiche/index_<n>.html
  const BASER_URL = `https://pic.netbian.com/4kdongman/index_${pageNumber}.html`;
  const pageDir = path.resolve(__dirname, `./第${pageNumber - 1}页`);
  const res = await axios.get(BASER_URL, { responseType: 'stream' });

  // Attach the stream listeners right away so a network error can never go
  // unobserved while the folder is being created.
  const readBody = new Promise((resolve, reject) => {
    const chunks = [];
    res.data.on('data', (chunk) => {
      chunks.push(chunk);
    });
    res.data.on('error', reject);
    res.data.on('end', () => {
      resolve(iconv.decode(Buffer.concat(chunks), 'gbk'));
    });
  });

  const [html] = await Promise.all([
    readBody,
    fs.promises.mkdir(pageDir, { recursive: true }),
  ]);
  return html;
};
/**
 * Fetches the detail page of one picture (the extra hop needed to reach the
 * high-resolution image) and returns its HTML decoded from GBK.
 *
 * @param {string} linkUrl - Absolute URL of the detail page.
 * @returns {Promise<string>} The decoded HTML of the detail page.
 * @throws Rejects when the request or the response stream fails.
 */
const getImageDetailHtml = async (linkUrl) => {
  const res = await axios.get(linkUrl, { responseType: 'stream' });
  return new Promise((resolve, reject) => {
    const chunks = [];
    res.data.on('data', (chunk) => {
      chunks.push(chunk);
    });
    // Without this listener a broken connection would leave the promise
    // pending forever.
    res.data.on('error', reject);
    res.data.on('end', () => {
      resolve(iconv.decode(Buffer.concat(chunks), 'gbk'));
    });
  });
};
/**
 * Scrapes one listing page, follows every entry to its detail page and
 * downloads the high-resolution image found there.
 *
 * Entries are processed one at a time with a pause before each download, so
 * the remote server is not hit by a burst of simultaneous requests.
 *
 * @param {number} pageNumber - Listing page to process.
 * @returns {Promise<void>} Settles after every entry has been attempted.
 * @throws Rejects when the listing page itself cannot be fetched.
 */
const getPageData = async (pageNumber) => {
  const listHtml = await getPageHtml(pageNumber);
  const $ = cheerio.load(listHtml, { decodeEntities: false });

  // Collect first, fetch afterwards: cheerio's `.each` does not wait for
  // asynchronous callbacks, so awaiting inside it gives no ordering at all.
  const detailLinks = [];
  $('.clearfix > li > a ').each((i, element) => {
    const href = $(element).attr('href');
    if (href) {
      detailLinks.push(href);
    }
  });

  for (const [index, href] of detailLinks.entries()) {
    try {
      // Resolving against the origin avoids the `//` that plain string
      // concatenation produces for hrefs that already start with a slash.
      const detailUrl = new URL(href, 'https://pic.netbian.com/').href;
      // Second fetch: the detail page of this entry.
      const detailHtml = await getImageDetailHtml(detailUrl);
      const $1 = cheerio.load(detailHtml, { decodeEntities: false });
      const picture = $1('.photo-pic > a >img');
      const imgUrl = picture.attr('src');
      if (!imgUrl) {
        console.error(`No image found on ${detailUrl}`);
        continue;
      }
      // Fall back to a positional name when the image carries no title.
      const title =
        (picture.attr('title') || '').replace(/\s+/g, '') || `image-${index}`;
      await sleep();
      await downloadPageImg(imgUrl, title, pageNumber);
    } catch (err) {
      // One broken entry must not abort the rest of the page.
      console.error(`Failed to process ${href}:`, err);
    }
  }
};
/**
 * Downloads a single image into the folder of the page it was found on.
 *
 * @param {string} imgUrl - Site-relative image path (appended to the site origin).
 * @param {string} title - File name without extension.
 * @param {number} pageNumber - Listing page index; the file is stored in
 *   `第<pageNumber - 1>页`.
 * @returns {Promise<void>} Resolves once the file has been completely written.
 * @throws Rejects when the request, the response stream, or the file write fails.
 */
async function downloadPageImg(imgUrl, title, pageNumber) {
  const BASER_URL = `https://pic.netbian.com${imgUrl}`;
  const extendName = path.extname(BASER_URL);
  const imgPath = path.join(
    path.resolve(__dirname, `./第${pageNumber - 1}页`),
    `${title}${extendName}`,
  );

  const res = await axios.get(BASER_URL, { responseType: 'stream' });

  // The write stream is opened only after the request succeeded, so a failed
  // request does not leave an empty file behind.
  await new Promise((resolve, reject) => {
    const ws = fs.createWriteStream(imgPath);
    // Settle on 'finish' rather than right after pipe(): only then has all
    // data been flushed to the file.
    ws.on('finish', resolve);
    ws.on('error', (err) => {
      res.data.destroy();
      reject(err);
    });
    res.data.on('error', (err) => {
      ws.destroy();
      reject(err);
    });
    res.data.pipe(ws);
  });

  console.log(imgPath + '写入完成');
}
/**
 * Crawls a range of listing pages one after another.
 *
 * Each page is preceded by a pause and fully awaited before the next one
 * starts. A page that fails is logged and skipped so the crawl continues.
 *
 * @param {number} [firstPage=2] - First page index to crawl.
 *   NOTE(review): the default starts at 2, presumably because page 1 does not
 *   use the `index_<n>.html` URL scheme; confirm against the site.
 * @param {number} [lastPage=15] - Last page index to crawl (inclusive).
 * @returns {Promise<void>} Resolves after every page has been attempted.
 */
const getMorePageData = async (firstPage = 2, lastPage = 15) => {
  for (let i = firstPage; i <= lastPage; i++) {
    await sleep();
    try {
      await getPageData(i);
    } catch (err) {
      console.error(`Failed to crawl page ${i}:`, err);
    }
  }
};
/**
 * Returns a promise that fulfils (with no value) after a delay.
 *
 * @param {number} [time=500] - Delay in milliseconds.
 * @returns {Promise<void>} Fulfilled once the delay has elapsed.
 */
function sleep(time = 500) {
  return new Promise((wake) => {
    setTimeout(() => {
      wake();
    }, time);
  });
}
// Entry point: start crawling as soon as the script is run.
getMorePageData()
读取淘宝详情 —— 直接请求好像会被拦截，可以先把页面保存到本地（./html/page.html）再解析
const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");
const axios = require("axios");
// Output folders, all resolved relative to this script's directory.
// Base folder; info.text is written directly into it.
const root = path.resolve(__dirname, "./宝贝");
// Receives the images of the `main` list.
const mainRoot = path.resolve(__dirname, "./宝贝/主图");
// Receives the images of the `detail` list.
const detailRoot = path.resolve(__dirname, "./宝贝/详情");
// Receives the images of the `colorList` entries.
const colorRoot = path.resolve(__dirname, "./宝贝/颜色");
/**
 * Reads the locally saved product page (`./html/page.html`, relative to this
 * script) as UTF-8 text.
 *
 * @returns {Promise<string>} The file content.
 * @throws Rejects with the file-system error when the file cannot be read.
 */
async function getPageInfo() {
  const pageFile = path.resolve(__dirname, "./html/page.html");
  return fs.promises.readFile(pageFile, "utf8");
}
/**
 * Creates a directory, including missing parents; an existing directory is
 * not an error.
 *
 * @param {string} pathUrl - Directory to create.
 * @returns {Promise<void>} Resolves after the directory exists.
 * @throws Rejects with the file-system error when creation fails.
 */
function createDir(pathUrl) {
  return new Promise((resolve, reject) => {
    fs.mkdir(pathUrl, { recursive: true }, (err) => {
      if (err) {
        console.error(err);
        reject(err);
        // Stop here so a failure is never followed by the success message.
        return;
      }
      console.log(pathUrl, "文件夹新建成功");
      resolve();
    });
  });
}
/**
 * Prepares the output folders, parses the locally saved product page and
 * collects the image URLs it contains.
 *
 * As a side effect the page title and every colour option label are appended
 * to info.text through `writeTextInfo`.
 *
 * @returns {Promise<{detail: string[], colorList: {title: string, img: string}[], main: string[]}>}
 *   `main` holds the gallery images, `detail` the description images and
 *   `colorList` one entry per colour option (file-name-safe title + image URL).
 * @throws Rejects when a folder cannot be created or the page cannot be read.
 */
async function getImgInfo() {
  await createDir(root);
  await createDir(mainRoot);
  await createDir(detailRoot);
  await createDir(colorRoot);
  const result = await getPageInfo();
  const $ = cheerio.load(result, { decodeEntities: false });

  // Only protocol-relative URLs get a scheme. A blind string replace would
  // turn an already absolute "https://img.alicdn.com/..." into
  // "https:http://img.alicdn.com/...".
  const withScheme = (url) => (url.startsWith("//") ? `http:${url}` : url);

  const main = [];
  $("#J_UlThumb > li > a >img").each((i, e) => {
    const src = $(e).attr("src");
    if (!src) {
      return;
    }
    // Swap the 60x60 thumbnail suffix for the 800x800 variant.
    main.push(withScheme(src.replace("jpg_60x60q90", "jpg_800x800q90")));
  });

  const detail = [];
  $(" #description .content >p >img").each((i, e) => {
    const src = $(e).attr("src");
    if (src) {
      detail.push(src);
    }
  });

  const colorList = [];
  const textList = [$("h1").text().trim()];
  $(".J_TSaleProp.tb-img > li > a").each((i, e) => {
    const style = $(e).attr("style");
    if (!style) {
      return;
    }
    // The image URL is embedded in the inline style as `background:url(...)`.
    const img = withScheme(
      style
        .split(" ")[0]
        .replace("background:", "")
        .replace("url(", "")
        .replace(")", "")
        .replace("jpg_40x40q90", "jpg_800x800q90")
    );
    const text = $(e).children().text();
    // The title becomes a file name, so strip everything except letters,
    // digits and CJK characters. (The previous anchored pattern did the
    // opposite: it blanked clean titles and left dirty ones untouched.)
    const title =
      text.replace(/[^a-zA-Z0-9\u4E00-\u9FA5]+/g, "") || `color-${i}`;
    colorList.push({ title, img });
    textList.push(text);
  });

  textList.forEach((item) => {
    writeTextInfo(item);
  });

  return { detail, colorList, main };
}
/**
 * Appends one line of text to `info.text` inside the base output folder.
 *
 * The write is fire-and-forget: the outcome is only reported on the console.
 *
 * @param {string} data - Text to append; a trailing newline is added.
 * @returns {void}
 */
function writeTextInfo(data) {
  fs.writeFile(
    `${root}/info.text`,
    `${data}\n`,
    { flag: "a", encoding: "utf8" },
    (err) => {
      if (err) {
        console.error(err);
        // Stop here so a failed write is never reported as successful.
        return;
      }
      console.log(data, "写入成功");
    }
  );
}
/**
 * Top-level workflow: gathers the image URLs from the saved page and starts
 * downloading each group into its own folder.
 *
 * Failures while gathering are logged rather than rethrown, because the
 * script invokes this function without a rejection handler.
 *
 * @returns {Promise<void>} Resolves once all downloads have been started (or
 *   the failure has been logged).
 */
async function getImgList() {
  try {
    const { detail, colorList, main } = await getImgInfo();
    forEachGetImg(detail, detailRoot);
    forEachGetImg(main, mainRoot);
    forEachGetOtherImg(colorList, colorRoot);
  } catch (err) {
    console.error("Failed to collect product images:", err);
  }
}
/**
 * Starts a download for every URL in the list, using the position of the URL
 * as the number in the file name.
 *
 * @param {string[]} list - Image URLs.
 * @param {string} currentP - Target directory.
 * @returns {void}
 */
function forEachGetImg(list, currentP) {
  for (const [position, url] of list.entries()) {
    downloadPageImg(url, position, currentP);
  }
}
/**
 * Starts a download for every colour entry, naming each file after the
 * entry's title.
 *
 * @param {{img: string, title: string}[]} list - Colour entries.
 * @param {string} currentP - Target directory.
 * @returns {void}
 */
function forEachGetOtherImg(list, currentP) {
  for (const entry of list) {
    downloadPageColorImg(entry.img, entry.title, currentP);
  }
}
/**
 * Downloads one image and stores it as `detail<pageNumber>.jpg` in the given
 * directory.
 *
 * @param {string} imgUrl - Absolute URL of the image.
 * @param {number} pageNumber - Number used in the file name (the callers in
 *   this file pass the index of the image within its list).
 * @param {string} currentP - Target directory.
 * @returns {Promise<void>} Resolves once the file has been completely written.
 * @throws Rejects when the request, the response stream, or the file write fails.
 */
async function downloadPageImg(imgUrl, pageNumber, currentP) {
  const imgPath = path.join(currentP, `detail${pageNumber}.jpg`);
  const res = await axios.get(imgUrl, { responseType: "stream" });

  // The write stream is opened only after the request succeeded, so a failed
  // request does not leave an empty file behind.
  await new Promise((resolve, reject) => {
    const ws = fs.createWriteStream(imgPath);
    ws.on("finish", resolve);
    ws.on("error", (err) => {
      res.data.destroy();
      reject(err);
    });
    res.data.on("error", (err) => {
      ws.destroy();
      reject(err);
    });
    res.data.pipe(ws);
  });

  // Logged after 'finish', i.e. when the data really is on disk.
  console.log(imgPath + "写入完成");
}
/**
 * Downloads the image of one colour option and stores it as `<title>.jpg` in
 * the given directory.
 *
 * @param {string} imgUrl - Absolute URL of the image.
 * @param {string} title - File name without extension.
 * @param {string} currentP - Target directory.
 * @returns {Promise<void>} Resolves once the file has been completely written.
 * @throws Rejects when the request, the response stream, or the file write fails.
 */
async function downloadPageColorImg(imgUrl, title, currentP) {
  const imgPath = path.join(currentP, `${title}.jpg`);
  const res = await axios.get(imgUrl, { responseType: "stream" });

  // The write stream is opened only after the request succeeded, so a failed
  // request does not leave an empty file behind.
  await new Promise((resolve, reject) => {
    const ws = fs.createWriteStream(imgPath);
    ws.on("finish", resolve);
    ws.on("error", (err) => {
      res.data.destroy();
      reject(err);
    });
    res.data.on("error", (err) => {
      ws.destroy();
      reject(err);
    });
    res.data.pipe(ws);
  });

  // Logged after 'finish', i.e. when the data really is on disk.
  console.log(imgPath + "写入完成");
}
// Entry point: run the whole workflow as soon as the script is executed.
getImgList();
补充一下异步等待（async/await）的基本写法：
// Illustrative skeleton of async/await usage. `异步` is not defined in this
// file; it appears to be a placeholder for any promise-returning expression.
async function dosomething() {
await 异步
}