Node.js 爬虫学习

'use strict';
const rp = require('request-promise');
// HTTP request library: a preconfigured client shared by every request in this script.
const request = rp.defaults({
    // Use one jar created up front for all requests made through this client
    // (presumably so cookies persist between requests — confirm against the request docs).
    jar: rp.jar(),
    // Presumably asks for and decodes gzip-compressed responses — confirm against the request docs.
    gzip: true,
    headers: {
        // Send a desktop Chrome user-agent string with every request.
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
    },
});
// HTML parsing library
const cheerio = require('cheerio');
// File read/write library
const fs = require('fs');

/**
 * Crawl every article URL listed in `zhuanyezixunUrl.txt`, clean up each
 * article body and write the collected results to `zhuanyezixun.json`.
 *
 * For each page the title (`h1`), the publish-time text and the cleaned HTML
 * of `#article-content` are collected. Pages without an `#article-content`
 * element are skipped with a warning instead of aborting the whole run.
 *
 * @returns {Promise<void>} Resolves once the JSON file has been written.
 * @throws Rejects if the URL list cannot be read, a request fails, or the
 *         output file cannot be written.
 */
async function main() {
    const dataList = [];
    // The URL file was produced by writing an array directly, so the entries
    // are comma-separated. Trim each entry and drop empty ones so a trailing
    // newline or comma does not turn into a bogus request.
    const urlList = fs.readFileSync('zhuanyezixunUrl.txt', 'utf-8')
        .split(',')
        .map((entry) => entry.trim())
        .filter((entry) => entry !== '');

    for (const url of urlList) {
        // The request must be awaited; otherwise the statements below would
        // run before the page has been downloaded.
        const html = await request(url);
        const $ = cheerio.load(html, {decodeEntities: false});

        // Remove every img tag from the article.
        $('#article-content img').remove();
        // Replace each a tag with its own inner content (unwrap the link).
        $('#article-content a').replaceWith(function () {
            return $(this).html();
        });
        // Strip style and class attributes from every tag in the article.
        $('#article-content [style]').removeAttr('style');
        $('#article-content [class]').removeAttr('class');
        // Drop everything after the "推荐阅读：" paragraph, then the paragraph itself.
        $('p:contains(推荐阅读：)').nextAll().remove();
        $('p:contains(推荐阅读：)').remove();

        // .html() yields null when the page has no #article-content element;
        // skip such pages rather than crashing and losing all collected data.
        const content = $('#article-content').html();
        if (content === null) {
            console.warn(`skipped (no #article-content): ${url}`);
            continue;
        }

        // A fresh object per iteration, so entries in dataList never share state.
        dataList.push({
            title: $('h1').text(),
            time: $('.conter_main_one_nav').children('p').text(),
            // Take the cleaned article body and replace the site keyword.
            content: content.trim().replace(/出国留学网/g, '智课网'),
        });
    }

    const json = JSON.stringify(dataList, null, 2);
    fs.writeFileSync('zhuanyezixun.json', json, 'utf-8');
    console.log('ok');
}

// Script entry point: run the crawl and report any failure.
main().catch((err) => {
    // Fall back to the raw value when the rejection is not an Error with a stack.
    console.error((err && err.stack) || err);
    // Signal failure to the calling shell; without this the process exits with status 0.
    process.exitCode = 1;
});

// async function main() {
//     console.log('正在爬取网页。。。。。。');
//     const urlList = [];
//     for (let i = 0; i < 10; i++) {
//         const url = `https://www.example.com/tiaojian/${(i === 0) ? '' : `${i + 1}.html`}`;
//         console.log(`正在爬取第${i + 1}页+${url}`);
//         const html = await request(url);
//         const $ = cheerio.load(html);
//         $('.news-title').each(function(){
//             urlList.push($('a', this).attr('href'))
//             console.log($('a', this).attr('href'));
//         });
//     }
//     fs.writeFileSync('tiaojianUrl.txt', urlList, 'utf-8');
//     console.log('写入完毕');
// }
// main().catch(err => {
//     console.error(err.stack);
// });

Python 中 pyquery 的对比用法

import re

# Fetch a single article and clean its body with pyquery, following the same
# steps as the cheerio version of this crawler.
# NOTE(review): assumes `requests` and `pq` (pyquery's PyQuery) are imported
# elsewhere — confirm.
url = 'https://www.example.com/a/3764073.html'
html = requests.get(url)
# Decode the response body as UTF-8.
html.encoding = 'utf-8'
doc = pq(html.text)
# Remove every img tag from the article.
doc('#article-content img').remove()
# Strip style and class attributes from every tag in the article.
doc('#article-content [style]').remove_attr('style')
doc('#article-content [class]').remove_attr('class')
# Drop everything after the "推荐阅读：" paragraph, then the paragraph itself.
doc('p:contains(推荐阅读：)').next_all().remove()
doc('p:contains(推荐阅读：)').remove()
doc('#article-content [href]').remove_attr('href')
doc('#article-content [target]').remove_attr('target')
# Unwrap links on the serialized HTML: drop every opening/closing <a> tag
# regardless of which attributes it still carries, keeping the inner content.
# The `\s` after `a` keeps tags such as <abbr> or <article> untouched.
# `or ''` guards against html() giving None when #article-content is missing.
doc = re.sub(r'</?a(?:\s[^>]*)?>', '', doc('#article-content').html() or '').strip()
print(doc)

猜你喜欢