blog
blog copied to clipboard
async/await 练习:存储所关注的每个专栏里的前 3 篇文章
// HTTP client (promise-based) and HTML parser used by the crawler below.
const request = require('request-promise-native');
const cheerio = require('cheerio');
const config = require('../config');
// Base URL used to build the profile-page address in getPageSize.
const zhihuRoot = config.zhihu.root;
// Number of followed columns requested per page in init.
const pageSize = config.page.pageSize;
const ColumnModel = require('../model/column');
const ContentModel = require('../model/content');
// Zhihu member whose followed columns are crawled.
const USER_NAME = 'anran-0423';
// Database setup: read the connection settings from config and connect through Mongoose.
const {host, database, port} = config.db;
const mongoose = require('mongoose');
// Have Mongoose use the runtime's global Promise implementation.
mongoose.Promise = global.Promise;
// NOTE(review): positional (host, database, port) arguments look like a legacy
// Mongoose connect signature — confirm against the installed Mongoose version.
mongoose.connect(host, database, port);
/**
 * Fetches one page of the columns followed by USER_NAME and upserts every
 * returned column through ColumnModel.
 *
 * @param {number} offset - Value sent as the `offset` query parameter.
 * @param {number} limit - Value sent as the `limit` query parameter.
 * @returns {Promise<Array>} Resolves with the results of the
 *   findOneAndUpdate calls, in the order the API returned the columns.
 */
const exploreColumns = async (offset, limit) => {
  const query = `offset=${offset}&limit=${limit}`;
  const {data: followedColumns} = await request({
    method: 'GET',
    uri: `https://www.zhihu.com/api/v4/members/${USER_NAME}/following-columns?${query}`,
    json: true,
  });

  // Store the column data. findOneAndUpdate is used here rather than update;
  // for the difference between the two, and for why `new: true` is passed, see
  // https://segmentfault.com/a/1190000009706886 and
  // "Mongoose: findOneAndUpdate doesn't return updated document":
  // https://stackoverflow.com/questions/32811510/mongoose-findoneandupdate-doesnt-return-updated-document
  const upsertColumn = async (column) => {
    const filter = {id: column.id};
    const settings = {upsert: true, new: true};
    return ColumnModel.findOneAndUpdate(filter, column, settings).exec();
  };

  return Promise.all(followedColumns.map((column) => upsertColumn(column)));
};
/**
 * Fetches the article list of a column and keeps the first three entries of
 * the response, tagging each kept article with the column's id.
 *
 * @param {{id: (string|number)}} column - Column whose articles are fetched;
 *   only `column.id` is read.
 * @returns {Promise<Array<Object>>} Resolves with at most three article
 *   objects, each carrying `columnId === column.id`.
 *   Rejects if the request fails or the response has no `data` array.
 */
const getArticledData = async (column) => {
  const options = {
    uri: `https://zhuanlan.zhihu.com/api2/columns/${column.id}/articles`,
    json: true,
  };
  // Await the request directly instead of wrapping it in `new Promise`:
  // a failed request or malformed payload now rejects the returned promise
  // rather than leaving it pending forever.
  const res = await request(options);
  // Keep only the first three articles of the response.
  return res.data.slice(0, 3).map((article) => {
    article.columnId = column.id;
    return article;
  });
};
/**
 * Reads how many columns USER_NAME follows. The count is taken from the JSON
 * embedded in the `#js-initialData` element of the member's
 * following-columns page.
 *
 * Despite the name, the resolved value is the number of followed columns
 * (`followingColumnsCount`), not a number of pages.
 *
 * @returns {Promise<number>} Resolves with `followingColumnsCount`.
 *   Rejects if the request fails, the embedded JSON is missing or invalid,
 *   or the user entry is absent — previously such failures were only logged
 *   and the returned promise never settled.
 */
const getPageSize = async () => {
  // Use USER_NAME for the URL as well, so it always matches the lookup key below.
  const html = await request(`${zhihuRoot}/people/${USER_NAME}/following/columns`);
  const $ = cheerio.load(html);
  const rawState = $('#js-initialData').html();
  if (!rawState) {
    throw new Error('Unable to find #js-initialData in the following-columns page');
  }
  const users = JSON.parse(rawState).initialState.entities.users;
  const user = users[USER_NAME];
  if (!user) {
    throw new Error(`No user entry for ${USER_NAME} in #js-initialData`);
  }
  return user.followingColumnsCount;
};
/**
 * Upserts every article of a column through ContentModel, keyed by the
 * article's `id`.
 *
 * Each article's `columnId` is set to `column._id` before it is written.
 *
 * @param {Array<Object>} articleArr - Articles to store; each is mutated to
 *   carry `columnId`.
 * @param {{_id: *}} column - Stored column document; only `_id` is read.
 * @returns {Promise<Array>} Resolves with the write results once every
 *   upsert has finished; rejects if any write fails.
 */
const saveArticles = (articleArr, column) => {
  const writes = articleArr.map(async (article) => {
    article.columnId = column._id;
    // Return the exec() promise so Promise.all waits for the write and
    // surfaces its failure instead of leaving it as a floating promise.
    return ContentModel
      .update({id: article.id}, article, {upsert: true})
      .exec();
  });
  return Promise.all(writes);
};
/**
 * Crawls every page of followed columns, fetches the first articles of each
 * column and stores them.
 *
 * Pages are processed concurrently, as are the columns within a page. The
 * success message is printed only after every article write has finished.
 * Any failure along the way is logged with console.log and not rethrown.
 *
 * @returns {Promise<void>} Resolves when the crawl has finished or failed.
 */
const init = async () => {
  try {
    const totalColumns = await getPageSize();
    const pageCount = Math.ceil(totalColumns / pageSize);
    if (!Number.isFinite(pageCount) || pageCount < 0) {
      throw new Error(
        `Cannot page through columns: count=${totalColumns}, pageSize=${pageSize}`
      );
    }

    const pageTasks = Array.from({length: pageCount}, async (unused, pageIndex) => {
      // Only the offset advances from page to page; the limit is always one
      // page worth of columns (it used to grow with the offset).
      const columns = await exploreColumns(pageIndex * pageSize, pageSize);
      const columnTasks = columns.map(async (column) => {
        const articleArr = await getArticledData(column);
        // Return the save promise so the page waits for the writes.
        return saveArticles(articleArr, column);
      });
      return Promise.all(columnTasks);
    });

    await Promise.all(pageTasks);
    console.log('抓取数据成功!');
  } catch (err) {
    console.log(err);
  }
};
// Start the crawl when the script is loaded. The handler is a safety net:
// if init() ever rejects, the error is logged instead of surfacing as an
// unhandled promise rejection.
init().catch((err) => {
  console.log(err);
});
主要遇到的问题:在 forEach 循环中使用 async/await 时,forEach 不会等待异步回调完成,因此改用 map 配合 Promise.all。参见 Using async/await with a forEach loop:https://stackoverflow.com/questions/37576685/using-async-await-with-a-foreach-loop