爬虫puppeteer-贝太美食网页列表详情页 demo(二)

下面的代码比较混乱,只实现了基本功能,依赖包也有些混用。它根据上一篇文章中获取到的各个详情页 id,依次打开对应的详情页并抓取其中的数据。

其中使用 cheerio 对页面的 HTML 进行解析,并从中提取所需的数据。

最后,同样把每个详情页的数据以 JSON 文件的形式写入本地文件夹中。

// 'use strict';

const puppeteer = require('puppeteer');

const requestSys = require('request');

const cheerio = require('cheerio');

const fs = require('fs');

const detailIdList = require('./data/detail/detailIdList.json');

// Base URL of the recipe detail pages; a page address is built as urlArr[0] + id + '.html'.
const urlArr = ['http://www.beitaichufang.com/recipe/'];

/**
 * Pause for a given amount of time.
 *
 * @param {number} time - Delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<undefined>} Promise that resolves (with no value) once the timer fires.
 */
const sleep = function (time) {
  return new Promise(function (done) {
    setTimeout(done, time);
  });
};

// gotoDetailPage()

/**
 * Open one recipe detail page in a fresh browser, parse the rendered HTML
 * with cheerio and pass the result to setDetailItem.
 *
 * Failures are logged with console.log instead of being rethrown, so the
 * returned promise resolves even when the page could not be scraped.
 *
 * @param {*} id - Detail page id; concatenated into the URL as urlArr[0] + id + '.html'.
 * @returns {Promise<void>} Settles after the browser (if one was launched) has been closed.
 */
function gotoDetailPage(id) {
  return (async () => {
    // Declared outside the try so the finally block can always reach it.
    let browser;

    try {
      // Pause before launching, as the original flow did.
      await sleep(3000);

      browser = await puppeteer.launch({
        ignoreHTTPSErrors: true,
        headless: false,
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
        devtools: true,
        timeout: 0
      });

      const page = await browser.newPage();

      await page.goto(urlArr[0] + id + '.html', { waitUntil: 'networkidle2' });

      const html = await page.content();
      const $ = cheerio.load(html);

      setDetailItem($, id);

      // Keep the window open briefly before it is closed in the finally block.
      await sleep(3000);
    } catch (e) {
      console.log(e);
    } finally {
      // Close the browser on every path; previously a failed navigation or
      // parse left the browser process running.
      if (browser) {
        try {
          await browser.close();
        } catch (closeError) {
          console.log(closeError);
        }
      }
    }
  })();
}

/**
 * Debug pass-through for scraped values: prints the value followed by a
 * marker and hands it back unchanged.
 *
 * @param {*} str - Value read from the page (may be undefined when the selector matched nothing).
 * @returns {*} The same value that was passed in.
 */
function regString(str) {
  const debugMarker = '==================str';
  console.log(str, debugMarker);
  return str;
}

/**
 * Extract the recipe fields from a loaded detail page and write them to
 * ./data/<id>.json as pretty-printed JSON.
 *
 * @param {Function} $ - cheerio instance loaded with the detail page HTML.
 * @param {*} id - Detail page id; used as the output file name.
 * @returns {void}
 */
function setDetailItem($, id) {
  console.log('进入到页面解析。。。');

  const item = {};

  item.converUrl = regString($('#menu-img').attr('src')); // cover image
  item.name = regString($('#menu-name').text()); // title
  item.desc = regString($('#chef-intro').text()); // description

  // Collect every cooking step listed under #menu-step.
  const itemList = [];

  $('#menu-step .item').each(function () {
    // cheerio binds `this` to the current element, hence a regular function.
    const cap = $(this);

    itemList.push({
      step: regString(cap.find('.item-title').text()),
      intro: regString(cap.find('.item-intro').text()),
      imgUrl: cap.find('img').attr('src')
    });
  });

  console.info(itemList);

  item.stepList = itemList;
  item.makeTime = regString($('#menu-time').text()); // preparation time
  item.level = regString($('#menu-degree').text()); // difficulty
  item.texture = regString($('#menu-taste').text()); // taste / texture

  // Write the record to disk.
  const outputPath = `./data/${id}.json`;
  const writerStream = fs.createWriteStream(outputPath);

  // Without an 'error' listener a failed open/write (e.g. the ./data directory
  // is missing) is raised as an uncaught exception and stops the whole crawl.
  writerStream.on('error', (err) => {
    console.error(`failed to write ${outputPath}`, err);
  });

  writerStream.write(JSON.stringify(item, undefined, 2), 'UTF8');
  writerStream.end();

  console.log('结束');
}

/**
 * Fetch the detail page data for every id in detailIdList.
 *
 * A timer fires every 15 seconds; each tick starts gotoDetailPage for the
 * next id, and the tick after the last id stops the timer. The first page
 * is therefore requested 15 seconds after this function is called.
 *
 * @returns {void}
 */
function getDetailData() {
  let cursor = 0;

  const tick = () => {
    if (cursor >= detailIdList.length) {
      // Every id has been dispatched; stop ticking.
      clearInterval(timer);
    } else {
      gotoDetailPage(detailIdList[cursor]);
    }

    cursor += 1;
  };

  const timer = setInterval(tick, 15000);
}

// Script entry point: start the timed crawl over the ids in detailIdList.
getDetailData()

猜你喜欢

转载自blog.csdn.net/u014085502/article/details/89673276