nodejs 04 - example

参考文章: nodejs 爬虫实战

 
01. 配置开发环境
01) 选择合适的目录,新建文件夹webcrawler(本文示例路径为 /Users/weizhen/Sites/nodejs/webcrawler)。打开命令行终端,进入该目录,执行命令npm init生成并初始化package.json文件
 1 Weizhens-Mac-mini:~ weizhen$ cd /Users/weizhen/Sites/nodejs/webcrawler 
 2 Weizhens-Mac-mini:webcrawler weizhen$ npm init
 3 This utility will walk you through creating a package.json file.
 4 It only covers the most common items, and tries to guess sensible defaults.
 5  
 6 See `npm help json` for definitive documentation on these fields
 7 and exactly what they do.
 8  
 9 Use `npm install <pkg>` afterwards to install a package and
10 save it as a dependency in the package.json file.
11  
12 Press ^C at any time to quit.
13 package name: (webcrawler)
14 version: (1.0.0)
15 description: webcrawler
16 entry point: (index.js)
17 test command: none
18 git repository: none
19 keywords: none
20 author: weizhen
21 license: (ISC)
22 About to write to /Users/weizhen/Sites/nodejs/webcrawler/package.json:
23  
24 {
25   "name": "webcrawler",
26   "version": "1.0.0",
27   "description": "webcrawler",
28   "main": "index.js",
29   "scripts": {
30     "test": "none"
31   },
32   "repository": {
33     "type": "git",
34     "url": "none"
35   },
36   "keywords": [
37     "none"
38   ],
39   "author": "weizhen",
40   "license": "ISC"
41 }
42  
43 Is this OK? (yes)

  

02) 安装项目的依赖项
# --save (shorthand -S) records the dependency in package.json.
# npm 5 and later save by default; lowercase -s is shorthand for --silent
# and does not control saving.
# express: used to set up a simple server
npm install express --save
# superagent: used to request the page
npm install superagent --save
# cheerio: jQuery-like handling of page elements
npm install cheerio --save
# xlsx: used to generate the Excel file
npm install xlsx --save

  

02. 编写代码
01) util.js 文件
 1 const accessPage = (url, callback) => {
 2     const superagent = require('superagent');
 3     superagent.get(url).retry(3).end((err, res) => {
 4         if (err) {
 5             console.log(`访问页面失败${err}`);
 6         } else {
 7             callback && callback(res);
 8         }
 9     });
10 };
11  
/**
 * Convert an array of plain objects into an array-of-arrays table whose
 * first row is the header (field names) and whose remaining rows hold the
 * corresponding values of each object.
 *
 * @param {Array<Object>} arr - Records to convert.
 * @returns {Array<Array>} Header row followed by one row per record; an
 *   empty array when `arr` is empty or is not an array.
 */
const formatData = (arr) => {
    // Treat null/undefined/non-array input like "no data" instead of throwing.
    if (!Array.isArray(arr) || arr.length === 0) { return []; }

    // Collect the union of keys in first-seen order, so a field that only
    // appears on a later record still gets a column.
    const fieldSet = new Set();
    for (const item of arr) {
        for (const field of Object.keys(item)) {
            fieldSet.add(field);
        }
    }
    const fields = [...fieldSet];

    // A record lacking a field yields `undefined` in that cell.
    const rows = arr.map((item) => fields.map((field) => item[field]));
    return [fields, ...rows];
};
19  
/**
 * Write an array-of-arrays table to an Excel file via the xlsx package.
 *
 * @param {Array<Array>} sheet_data - Rows passed to `XLSX.utils.aoa_to_sheet`.
 * @param {string} fileName - Output path passed to `XLSX.writeFile`.
 */
const saveToExcel = (sheet_data, fileName) => {
    const XLSX = require('xlsx');
    // Start from an empty workbook, then attach the converted data as its only sheet.
    const workbook = XLSX.utils.book_new();
    const worksheet = XLSX.utils.aoa_to_sheet(sheet_data);
    XLSX.utils.book_append_sheet(workbook, worksheet, 'sheet1');
    // Persist the workbook to disk.
    XLSX.writeFile(workbook, fileName);
};
27  
// Public helpers of util.js: page fetching, table formatting, Excel export.
module.exports = { accessPage, formatData, saveToExcel };

  

02) index.js 文件
 1 const express = require('express');
 2 const app = express();
 3 const util = require('./util.js');
 4  
 5 let server = app.listen(3000, () => {
 6   let { adress, port } = server.address();
 7   console.log(`App is running at http://${adress}:${port}`);
 8 });
 9  
// GET /: fetch the Baidu news page, extract the links, save them to
// out.xlsx when any were found, and send the extracted list as the response.
// The handler is not `async`: nothing is awaited, and a plain function lets
// Express catch a synchronous throw from the handler itself.
app.get('/', (req, res, next) => {
  util.accessPage('http://news.baidu.com/', (resdom) => {
    // This callback runs after the route handler has returned, so Express
    // cannot catch exceptions thrown here. Forward them to the error
    // middleware instead of letting them become uncaught exceptions.
    try {
      const hostNews = getPageInfo(resdom);
      if (hostNews && hostNews.length > 0) {
        const sheet_data = util.formatData(hostNews);
        util.saveToExcel(sheet_data, 'out.xlsx');
      }
      res.send(hostNews);
    } catch (err) {
      next(err);
    }
  });
});
20  
/**
 * Extract the news links from a fetched page.
 *
 * @param {Object} resdom - Response object whose `text` property holds the
 *   page HTML.
 * @returns {Array<{title: string, href: string}>} One entry per anchor
 *   matched by `div#pane-news ul li a`, in document order.
 */
const getPageInfo = (resdom) => {
  const cheerio = require('cheerio');
  // cheerio.load() takes the HTML document and returns a jQuery-like
  // $(selector) function for querying page elements.
  const $ = cheerio.load(resdom.text);
  const anchors = $('div#pane-news ul li a').toArray();
  return anchors.map((ele) => {
    const link = $(ele);
    return {
      title: link.text(),
      href: link.attr('href'),
    };
  });
};

  

03. 执行代码
01) 打开命令行终端,进入当前目录,执行命令 node index.js
02) 浏览器打开 http://localhost:3000/ 可以查看效果

猜你喜欢

转载自www.cnblogs.com/vision2015/p/11434289.html