nodejs定时爬虫,持续抓取

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/Super_Sloppy/article/details/80839662

第一步:创建一个nodejs项目

mkdir <projectName>

cd <projectName>

npm init -y

npm install -D eslint

npx eslint --init     # 注:会依次询问一些配置:Use a popular style guide -> Airbnb -> 不使用 React -> 配置文件格式选 YAML,其余保持默认

然后安装项目中所要用到的模块,即下文 index.js 中 require 的第三方模块:npm install request request-promise bluebird cheerio node-schedule xxd-log

创建 .gitignore 文件,在其中列出不需要提交到 git 的文件或目录(例如 node_modules),git 提交时会自动忽略它们

将自动生成的配置文件 .eslintrc.yml 的内容替换成如下:

# ESLint configuration for the crawler project.
# ecmaVersion 8 corresponds to ES2017, which is what enables async/await parsing.
parserOptions:
  ecmaVersion: 8
# Predefined global variables for ES6, Node.js and the mocha test runner.
env:
  es6: true
  node: true
  mocha: true
# Extra global identifier that code may reference without a no-undef report.
globals:
  Service: true
extends: 'eslint:recommended'
rules:
  # ---- Layout basics: 2-space indent, LF line endings, single quotes, semicolons ----
  indent:
    - warn
    - 2
    - SwitchCase: 1
      VariableDeclarator:
        var: 2
        let: 2
        const: 3
  linebreak-style:
    - error
    - unix
  quotes:
    - warn
    - single
  semi:
    - error
    - always
  comma-dangle:
    - warn
    - always-multiline
  # ---- Possible errors and best practices ----
  no-dupe-keys: error
  no-dupe-args: error
  use-isnan: error
  valid-typeof: error
  curly: error
  default-case: error
  # NOTE(review): "allow-null" is the legacy spelling of this option; the ESLint
  # docs describe the replacement as "always" with { null: ignore }.
  eqeqeq:
    - error
    - allow-null
  guard-for-in: warn
  no-else-return: warn
  no-fallthrough: error
  no-floating-decimal: warn
  no-multi-str: error
  no-octal: error
  no-octal-escape: error
  no-redeclare: error
  no-with: error
  no-void: error
  radix: error
  strict: error
  no-delete-var: error
  # ---- Stylistic rules ----
  array-bracket-spacing:
    - error
    - never
  block-spacing: error
  brace-style:
    - error
    - 1tbs
    - allowSingleLine: true
  comma-spacing: error
  comma-style:
    - error
    - last
  computed-property-spacing: error
  camelcase: warn
  key-spacing:
    - error
    - beforeColon: false
      afterColon: true
  keyword-spacing: error
  max-params:
    - warn
    - 6
  new-cap:
    - error
    - newIsCap: true
      capIsNew: false
      properties: true
  no-array-constructor: error
  # NOTE(review): no-spaced-func is deprecated in the ESLint docs in favour of
  # func-call-spacing; confirm against the ESLint version in use.
  no-spaced-func: error
  no-whitespace-before-property: error
  no-trailing-spaces:
    - error
    - skipBlankLines: true
  operator-linebreak: off
  space-before-blocks:
    - error
    - always
  space-before-function-paren:
    - error
    - anonymous: never
      named: never
      asyncArrow: always
  space-in-parens:
    - error
    - never
  space-infix-ops: error
  space-unary-ops: error
  spaced-comment:
    - warn
    - always
  # ---- ES6+ rules ----
  arrow-spacing: error
  semi-spacing: error
  constructor-super: error
  generator-star-spacing: warn
  yield-star-spacing: warn
  no-const-assign: error
  no-dupe-class-members: error
  no-this-before-super: error
  no-var: error
  no-unused-vars:
    - warn
    - vars: local
      args: none
  no-use-before-define:
    - error
    - functions: false
      classes: false
      variables: false
  prefer-arrow-callback: warn
  prefer-const: off
  prefer-rest-params: warn
  prefer-spread: warn
  prefer-template: warn
  template-curly-spacing:
    - warn
    - never
  object-curly-spacing:
    - warn
    - always
  no-multi-spaces:
    - warn
    - ignoreEOLComments: true
  valid-jsdoc: off
  no-global-assign: error
  no-unsafe-negation: error
  require-yield: off
  # Warn on comments that start with one of these marker words.
  no-warning-comments:
    - warn
    - location: start
      terms:
        - todo
        - fixme
        - xxx
        - hack
        - review

index.js爬虫代码

'use strict';
const rp = require('request-promise');
const log = require('xxd-log');
const bluebird = require('bluebird');

// 网页解析库
const cheerio = require('cheerio');
// 加密库
const crypto = require('crypto');

// 读写文件库
const fs = require('fs');
// 载入通行证json文件
const ticket = require('./ticket.json');
// 日历库,定时执行任务
const schedule = require('node-schedule');

// Options for the crawling client: its own cookie jar, gzip enabled, and a
// desktop Chrome user-agent header.
const crawlerOptions = {
  headers: {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
  },
  gzip: true,
  jar: rp.jar(),
};
// HTTP client used to download listing pages and articles.
const request = rp.defaults(crawlerOptions);

// Options for the import endpoint; base URL and ticket come from ticket.json.
// Adjust these to whatever the target server requires.
const importOptions = {
  method: 'post',
  baseUrl: ticket.baseUrl,
  uri: '/news/import',
  json: true,
  headers: {
    'x-xxd-ticket': ticket.ticket,
  },
};
// HTTP client used to POST crawled articles to the import endpoint.
const post = rp.defaults(importOptions);

/**
 * Compute the SHA-1 digest of a string.
 *
 * @param {string} str - text to hash.
 * @returns {string} the digest as a lowercase hex string.
 */
function sha1(str) {
  const hasher = crypto.createHash('sha1');
  hasher.update(str);
  return hasher.digest('hex');
}

/**
 * Return the HTML for a URL, using tmpRecord/<sha1(url)>.txt as a file cache.
 *
 * A cached file is returned as-is. Otherwise the page is downloaded, written
 * to the cache and returned. If the download or the cache write throws, the
 * first 200 characters of the error message are logged and null is returned.
 *
 * @param {string} url - page to fetch.
 * @returns {Promise<?string>} the HTML, or null on failure.
 */
async function getTmpRecord(url) {
  const cachePath = `${__dirname}/tmpRecord/${sha1(url)}.txt`;
  if (fs.existsSync(cachePath)) {
    return fs.readFileSync(cachePath, 'utf-8');
  }

  let result = null;
  try {
    const body = await request(url);
    fs.writeFileSync(cachePath, body, 'utf-8');
    // Only hand the page back once it has been cached successfully.
    result = body;
  } catch (err) {
    log.error(err.message.slice(0, 200));
  }
  return result;
}

/**
 * Pause for a while; use as `await sleep(ms)` inside an async function.
 *
 * @param {number} [milliseconds=1000] - delay before the promise resolves.
 * @returns {Promise<undefined>} resolves (never rejects) after the delay.
 */
function sleep(milliseconds = 1000) {
  return new Promise((resolve) => setTimeout(resolve, milliseconds));
}


/**
 * Run one full crawl-and-import cycle.
 *
 * Steps:
 *   1. Walk the first 10 listing pages of every configured liuxue86.com
 *      section and collect each linked article (title, time, cleaned content).
 *   2. Delete files in tmpRecord/ and successRecord/ whose names do not
 *      correspond to an article collected in this run.
 *   3. POST every collected article that has no marker file in
 *      successRecord/, one at a time, creating the marker on success.
 *
 * A failing listing page, article or import is logged and skipped.
 *
 * @returns {Promise<void>} settles after the last import attempt has finished.
 */
async function main() {
  const dataList = [];
  const pagesPerSection = 10;
  const sections = [
    {
      sectionName: '申请条件',
      section: 'tiaojian',
    },
    {
      sectionName: '国家优势',
      section: 'yuanxiaozhuanye/guojia',
    },
    {
      sectionName: '教育体系',
      section: 'yuanxiaozhuanye/jiaoyutixi',
    },
    {
      sectionName: '专业咨询',
      section: 'yuanxiaozhuanye/zhuanyezixun',
    },
    {
      sectionName: '热门专业',
      section: 'yuanxiaozhuanye/remenzhuanye',
    },
  ];

  // Iterate the list itself so adding or removing a section needs no other change.
  for (const { sectionName, section } of sections) {
    for (let page = 0; page < pagesPerSection; page += 1) {
      try {
        // The first listing page has no file name; later ones are 2.html, 3.html, ...
        const url = `https://www.liuxue86.com/${section}/${(page === 0) ? '' : `${page + 1}.html`}`;
        log.trace('正在抓取-->', url);
        const html = await request(url);
        const $ = cheerio.load(html, { decodeEntities: false });
        // bluebird.map runs the per-article tasks with at most 4 in flight.
        await bluebird.map($('.news-title').get(), async (element) => {
          const contentUrl = $('a', element).attr('href');
          if (!contentUrl) {
            // No link to fetch or hash; skip this entry instead of aborting the page.
            return;
          }
          const contentHtml = await getTmpRecord(contentUrl);
          if (contentHtml == null) {
            // The return value is unused, so returning acts like `continue`.
            return;
          }
          const dollar = cheerio.load(`${contentHtml}`, { decodeEntities: false });
          const data = {
            section: sectionName,
            url: contentUrl,
            title: dollar('h1').text(),
          };
          log.trace('正在抓取--->', data.title);
          data.time = dollar('.conter_main_one_nav').children('p').text();
          // Clean the article body: drop images, unwrap links, strip styling,
          // and cut everything from the "recommended reading" paragraph on.
          dollar('#article-content img').remove();
          dollar('#article-content a').replaceWith(function() { return dollar(this).html(); });
          dollar('#article-content [style]').removeAttr('style');
          dollar('#article-content [class]').removeAttr('class');
          dollar('p:contains(推荐阅读:)').nextAll().remove();
          dollar('p:contains(推荐阅读:)').remove();
          const content = dollar('#article-content').html();
          if (content == null) {
            // html() yields null when the container is missing; skip only this article.
            log.error(`未找到正文,跳过--> ${contentUrl}`);
            return;
          }
          data.content = content.trim().replace(/出国留学网/g, '智课网');
          dataList.push(data);
        }, { concurrency: 4 });
      } catch (err) {
        log.error(err.stack);
      }
    }
  }
  log.trace('全部抓取完毕');

  // Cache file names (sha1 of the URL + .txt) of everything seen in this run;
  // a Set keeps the membership checks below cheap.
  const dataListUrlSet = new Set(dataList.map((data) => `${sha1(data.url)}.txt`));
  // Remove cache and marker files that no longer match a crawled article.
  // NOTE(review): if the crawl collected nothing (e.g. the site was
  // unreachable), this removes every file in both directories — confirm that
  // is acceptable.
  for (const dirName of ['tmpRecord', 'successRecord']) {
    fs.readdirSync(`${__dirname}/${dirName}`)
      .filter((fileName) => !dataListUrlSet.has(fileName))
      .forEach((fileName) => {
        fs.unlinkSync(`${__dirname}/${dirName}/${fileName}`);
      });
  }

  // Only articles without a success marker still need to be imported.
  const sendList = dataList.filter((x) => !fs.existsSync(`${__dirname}/successRecord/${sha1(x.url)}.txt`));

  // Import sequentially and await each request, so main() settles only once
  // every article has been attempted.
  for (const item of sendList) {
    try {
      // Throttle: wait half a second before each POST.
      await sleep(500);
      const res = await post({
        formData: {
          title: item.title,
          content: item.content,
          source: `liuxue86-院校专业-${item.section}`,
        },
      });
      // A non-zero code is an error unless the server says the article already exists.
      if (res.code !== 0 && res.msg !== '该资讯已存在') {
        throw new Error(res.msg);
      }
      // Create an empty marker file named after the URL hash. The data must be
      // a string: passing null throws on Node >= 14.
      fs.writeFileSync(`${__dirname}/successRecord/${sha1(item.url)}.txt`, '');
      log.trace(item.title, '添加成功');
    } catch (err) {
      log.error(err.message);
    }
  }
}
// On the first run the two cache directories do not exist yet; create them.
['tmpRecord', 'successRecord'].forEach((dirName) => {
  const dirPath = `${__dirname}/${dirName}`;
  if (!fs.existsSync(dirPath)) {
    fs.mkdirSync(dirPath);
  }
});

// node-schedule cron fields, in order: second, minute, hour, day of month,
// month, day of week. '0 0 4 * * *' therefore runs the job daily at 04:00:00.
// See the node-schedule documentation for the full syntax.
const runCrawler = () => {
  main().catch((err) => {
    log.fatal(err.stack);
  });
};
schedule.scheduleJob('0 0 4 * * *', runCrawler);

// main().catch((err) => {
//   log.fatal(err.stack);
// });

猜你喜欢

转载自blog.csdn.net/Super_Sloppy/article/details/80839662
今日推荐