Nodejs爬虫(定时爬取)

Nodejs爬虫(定时爬取)

前言

Node.js是一个Javascript运行环境(runtime)。实际上它是对Google V8引擎进行了封装。V8引擎执行Javascript的速度非常快,性能非常好。Node.js对一些特殊用例进行了优化,提供了替代的API,使得V8在非浏览器环境下运行得更好。

Node.js是一个基于Chrome JavaScript运行时建立的平台, 用于方便地搭建响应速度快、易于扩展的网络应用。Node.js使用事件驱动, 非阻塞I/O 模型而得以轻量和高效,非常适合在分布式设备上运行数据密集型的实时应用。

使用NodeJs写网页爬虫的优势

大家都知道,我们要写一个网页爬虫,爬取网页上的信息,实际上就是将目标网站的页面html下载下来,然后通过各种方式(如正则表达式)获取我们想要的信息并保存起来。从这点看来,使用Nodejs来写网页爬虫便有着相当大的优势。

Nodejs采用了Javascript的语法规则,使前端开发人员能够很容易上手

Nodejs写爬虫可以避免写一大堆正则表达式去匹配元素,我们可以用jquery的语法直接获取dom对象,方便快捷,可读性强。

Nodejs解决了Javascript无法直接操作系统文件的短板,让我们可以轻松操作系统中的文件。

NodeJs写网页爬虫需要准备的环境

1.首先,如果你的电脑没有安装nodejs,那么,你需要到nodejs的官网中下载一个nodejs安装包并安装(安装过程跟普通程序无异,这里就不再赘述)。

Nodejs的官方网址为:

https://nodejs.org/en/

2.安装好NodeJs之后,我们就可以在我们的项目空间中创建我们的项目目录,并通过npm命令对项目进行初始化,并安装以下插件(具体安装过程不再赘述,大家可百度一下npm安装插件的方法)。

"bufferhelper":"^0.2.1",
"cheerio":"^0.20.0",
"http":"^0.0.0",
"https":"^1.0.0",
"iconv-lite":"^0.4.13",
"node-schedule":"^1.1.1",
"path":"^0.12.7",
"request":"^2.74.0",
"url":"^0.11.0"

3.然后,我们可以全局安装一下express模块,命令如下:

 

npm install -g express-generator

cnpm install -g express-generator

 

 

4.安装好express模块之后呢,我们就可以通过express创建一个新的爬虫项目啦,具体命令如下:

 

express spider

 

命令执行完后我们就可以看到这样的一个项目啦:

 

项目构建好之后,我们还要为项目安装依赖,命令如下:

npm install

 

 

做完上面的步骤,我们的环境就算是搭建好了,接下来,我们就来看一下我们的爬虫系统涉及到了哪些模块。

 

1) 文件系统(./module/File.js)

 

/**
 * Common file-operation helpers.
 * Created by 汤文辉 on 2016-08-02.
 */


var fs = require('fs'); // file system access
var mkdirp = require("mkdirp"); // recursive directory creation (kept for compatibility)

/**
 * File accessor bound to a directory, file name and encoding.
 *
 * @param {Object} options
 * @param {string} [options.path=""] directory the file lives in (with trailing slash)
 * @param {string} [options.filename=""] name of the file to operate on
 * @param {string} [options.encoding="UTF-8"] default content encoding
 * @constructor
 */
var File = function (options) {

    this.path = options.path || "";
    this.filename = options.filename || "";
    this.encoding = options.encoding || "UTF-8";

};

/**
 * Write (or append) content to the configured file, creating the
 * target directory first when it does not exist yet.
 *
 * @param {string|Buffer} content file content to write
 * @param {boolean} bAppend true to append, false to overwrite
 * @param {string} [encoding] content encoding; defaults to this.encoding
 */
File.prototype.save = function (content, bAppend, encoding) {

    var self = this;

    // Buffer.from replaces the broken `newBuffer(...)` call (and the
    // deprecated `new Buffer(...)` constructor it was meant to be).
    var buffer = Buffer.from(content, encoding || self.encoding);

    var doFs = function () {

        fs.open(self.path + self.filename, bAppend ? 'a' : 'w', "0666", function (err, fd) {
            if (err) {
                throw err;
            }
            var cb2 = function (err) {
                if (err) {
                    throw err;
                }

                fs.close(fd, function (err) {
                    if (err) {
                        throw err;
                    }
                    console.log('文件成功关闭...');
                });
            };
            // position `null` writes at the current file position, which keeps
            // append mode ('a') working; the original fixed position of 0 is
            // ignored on some platforms and overwrites on others.
            fs.write(fd, buffer, 0, buffer.length, null, cb2);
        });

    };

    // NOTE(review): fs.exists is deprecated; kept to preserve the original flow.
    fs.exists(self.path, function (exists) {
        if (!exists) {
            self.mkdir(self.path, "0666", function () {
                doFs();
            });
        } else {
            doFs();
        }
    });

};

/**
 * Recursively create a directory path, one segment at a time.
 *
 * @param {string} path directory path (may use / or \ separators)
 * @param {string|number} mode permission mode, e.g. "0666"
 * @param {Function} fn callback invoked once the full path exists
 * @param {string} [prefix] already-created parent prefix (used by the recursion)
 */
File.prototype.mkdir = function (path, mode, fn, prefix) {

    var normalized = path.replace(/\\+/g, '/');
    var aPath = normalized.split('/');
    prefix = prefix || '';
    var sPath = prefix + aPath.shift();
    var self = this;
    var cb = function () {
        fs.mkdir(sPath, mode, function (err) {
            // Success, or the directory already exists:
            // errno 47 (older platforms) / -4075 (libuv EEXIST on Windows).
            if ((!err) || (([47, -4075]).indexOf(err["errno"]) > -1)) {
                if (aPath.length > 0) {
                    self.mkdir(aPath.join('/'), mode, fn, sPath.replace(/\/$/, '') + '/');
                } else {
                    fn();
                }
            } else {
                console.log(err);
                console.log('创建目录:' + sPath + '失败');
            }
        });
    };
    fs.exists(sPath, function (exists) {
        if (!exists) {
            cb();
        } else if (aPath.length > 0) {
            self.mkdir(aPath.join('/'), mode, fn, sPath.replace(/\/$/, '') + '/');
        } else {
            fn();
        }
    });

};

module.exports= File;

 

 

2) URL系统(./module/URL.js)

 

/**
 * URL helper utilities.
 * Created by 汤文辉 on 2016-08-02.
 */

var urlUtil = require("url");
var pathUtil = require("path");

/**
 * Namespace/constructor for URL helpers; static helpers hang off the
 * function itself, instance helpers off its prototype.
 *
 * @constructor
 */
var URL = function () {

};

/**
 * Extract the path portion of a URL, without domain or query string.
 * A trailing "/" is mapped to "index.html" and leading slashes are
 * stripped, so the result can be used directly as a relative file path.
 *
 * @param {string} url
 * @returns {string} relative path, or "" when none can be derived
 */
URL.getUrlPath = function (url) {

    if (!url) {
        return '';
    }
    var oUrl = urlUtil.parse(url);
    if (oUrl["pathname"] && (/\/$/).test(oUrl["pathname"])) {
        oUrl["pathname"] += "index.html";
    }
    if (oUrl["pathname"]) {
        return oUrl["pathname"].replace(/^\/+/, '');
    }
    return '';

};

/**
 * Decide whether a URL fragment extracted from a page is worth following.
 * Rejects empty values, javascript:/mailto: pseudo-links, bare anchors,
 * the site root "/", and base64 data URIs.
 *
 * @param {string} urlPart
 * @returns {boolean}
 */
URL.isValidPart = function (urlPart) {
    if (!urlPart) {
        return false;
    }
    var rejected =
        urlPart.indexOf("javascript") > -1 ||
        urlPart.indexOf("mailto") > -1 ||
        urlPart.charAt(0) === '#' ||
        urlPart === '/' ||
        urlPart.substring(0, 4) === "data"; // base64-encoded image
    return !rejected;
};

/**
 * Resolve a URL found on a visited page against the page's own URL,
 * returning a complete absolute URL (or false when either input is
 * unusable).
 *
 * @param {string} url the page that was visited
 * @param {string} url2 the (possibly relative) URL found on that page
 * @returns {string|boolean} absolute URL, or false on invalid input
 */
URL.prototype.fix = function (url, url2) {
    if (!url || !url2) {
        return false;
    }
    var oUrl = urlUtil.parse(url);
    if (!oUrl["protocol"] || !oUrl["host"] || !oUrl["pathname"]) { // invalid page URL
        return false;
    }
    if (url2.substring(0, 2) === "//") { // protocol-relative URL
        url2 = oUrl["protocol"] + url2;
    }
    var oUrl2 = urlUtil.parse(url2);
    if (oUrl2["host"]) { // already absolute: just strip the fragment
        if (oUrl2["hash"]) {
            delete oUrl2["hash"];
        }
        return urlUtil.format(oUrl2);
    }
    var pathname = oUrl["pathname"];
    if (pathname.indexOf('/') > -1) {
        pathname = pathname.substring(0, pathname.lastIndexOf('/'));
    }
    if (url2.charAt(0) === '/') { // root-relative: discard the page's path
        pathname = '';
    }
    url2 = pathUtil.normalize(url2); // resolve ./ and ../
    url2 = url2.replace(/\\/g, '/');
    while (url2.indexOf("../") > -1) { // leading ../ segments climb the page path
        pathname = pathUtil.dirname(pathname);
        url2 = url2.substring(3);
    }
    if (url2.indexOf('#') > -1) { // strip fragment
        url2 = url2.substring(0, url2.lastIndexOf('#'));
    } else if (url2.indexOf('?') > -1) { // strip query string
        url2 = url2.substring(0, url2.lastIndexOf('?'));
    }
    var oTmp = {
        "protocol": oUrl["protocol"],
        "host": oUrl["host"],
        "pathname": pathname + '/' + url2
    };
    return urlUtil.format(oTmp);
};



module.exports= URL;

 

 

3) Robot系统(即爬虫系统主体)

 

 

/**
 * Web crawler core.
 * Created by 汤文辉 on 2016-08-02.
 */

var File = require("./File.js");
var URL = require("./URL.js");
var http = require("http");
var https = require("https");
var cheerio = require('cheerio');
var iconv = require('iconv-lite');
var BufferHelper = require("bufferhelper");
var request = require('request');

// Shared crawl state, visible to every Robot instance in this module.
var oResult = {
    aNewURLQueue: [],//queue of URLs not crawled yet
    aOldURLQueue: [],//queue of URLs already crawled
    aTargetURLList: [],//list of target (download) URLs found
    oTargetInfoList: {},//metadata for each target URL, keyed by URL
    oRetryCount:{},//per-URL retry counter for timed-out requests
    iCount:0,//total number of URLs visited
    iSuccessNum:0//number of URLs fetched successfully
};



/**
 * Crawler main body.
 *
 * @param {Object} options crawler configuration (see property comments below)
 * @constructor
 */
var Robot = function (options) {

    var self = this;
    this.domain = options.domain || ""; // domain of the site to crawl
    this.firstUrl = options.firstUrl || ""; // entry URL of the site to crawl
    this.id = this.constructor.create(); // unique identifier
    this.encoding = options.encoding || "UTF-8"; // page encoding
    this.outputPath = options.outputPath || ""; // directory for crawl results
    this.outputFileName = options.outputFileName || "result.txt"; // result file name
    this.timeout = options.timeout || 5000; // request timeout (ms)
    this.retryNum = options.retryNum || 5; // retries for timed-out requests
    // `options.robots || true` would always be true; honour an explicit false.
    this.robots = options.robots !== undefined ? options.robots : true; // respect robots.txt?

    this.debug = options.debug || false; // debug logging

    this.file = new File({
        path: this.outputPath,
        filename: this.outputFileName
    });

    oResult.aNewURLQueue.push(this.firstUrl); // seed the queue with the entry URL

    // Invoked when the queue is drained; the default dumps every recorded
    // target URL (with its name) to the output file.
    this.handlerComplete = options.handlerComplete || function () {
            console.log("抓取结束...");

            var str = "", i = 0, len = oResult.aTargetURLList.length, url;

            for (i = 0; i < len; i++) {

                url = oResult.aTargetURLList[i];
                str += "(" + oResult.oTargetInfoList[url].name + ") : " + url + "\n";

            }
            this.file.save(str, true);

            this.file.save("\n抓取完成...\n", true);
        };

    this.disAllowArr = []; // path prefixes disallowed by robots.txt

    var robotsURL = this.firstUrl + "robots.txt";

    request(robotsURL, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            // `this` is not the Robot inside this callback — use `self`.
            self.disAllowArr = self.parseRobots(body);
        }

    });

};



// Seed for per-instance unique identifiers.
Robot.id = 1;

/**
 * Return the next unique identifier and advance the counter.
 *
 * @returns {number}
 */
Robot.create = function () {
    var current = this.id;
    this.id = current + 1;
    return current;
};

/**
 * Parse a robots.txt body into the list of disallowed path prefixes.
 *
 * @param {string} str raw robots.txt content
 * @returns {Array} values of every "Disallow:" line, trimmed
 */
Robot.prototype.parseRobots = function (str) {

    // Split on \r\n or bare \n so Unix-style robots.txt files also work.
    var lines = str.split(/\r?\n/);

    var i = 0, len = lines.length, arr = [];

    for (i = 0; i < len; i++) {

        if (lines[i].indexOf("Disallow:") != -1) {

            arr.push(lines[i].split(":")[1].trim());

        }

    }

    return arr;

};

/**
 * Check whether a URL is allowed by the parsed robots.txt rules.
 *
 * @param {string} url
 * @returns {boolean} false when the URL contains any disallowed prefix
 */
Robot.prototype.isAllow = function (url) {

    var i = 0, len = this.disAllowArr.length;
    for (i = 0; i < len; i++) {

        if (url.toLowerCase().indexOf(this.disAllowArr[i].toLowerCase()) != -1) {
            return false;
        }

    }

    return true;

};

/**
 * Pull the next URL off the queue and crawl it; when the queue is
 * empty, fire the completion handler.
 *
 * @param {Function} [callback] per-page success callback, forwarded to send()
 */
Robot.prototype.go = function (callback) {

    var url = "";

    if (oResult.aNewURLQueue.length > 0) {

        url = oResult.aNewURLQueue.pop();

        // Crawl when robots.txt checking is disabled, or the URL is allowed.
        // (The original `this.robots && this.isAllow(url)` skipped every URL
        // whenever robots checking was turned off.)
        if (!this.robots || this.isAllow(url)) {

            this.send(url, callback);

            oResult.iCount++;

            oResult.aOldURLQueue.push(url);

        } else {

            console.log("禁止爬取页面:" + url);

        }

    } else {

        this.handlerComplete.call(this, oResult, this.file);

    }

};

/**
 * Fetch one URL and hand the decoded body to handlerSuccess; on
 * failure, continue via handlerFailure; on timeout, re-queue the URL
 * up to retryNum times.
 *
 * @param {string} url URL to request
 * @param {Function} callback per-page success callback
 */
Robot.prototype.send = function (url, callback) {

    var self = this;

    var timeoutEvent; // manual timeout timer (http.request here has no built-in timeout)

    var req;
    if (url.indexOf("https") > -1) {
        req = https.request(url);
    } else {
        req = http.request(url);
    }

    timeoutEvent = setTimeout(function () {
        req.emit("timeout");
    }, this.timeout);

    req.on('response', function (res) {
        var aType = self.getResourceType(res.headers["content-type"]);
        var bufferHelper = new BufferHelper();
        if (aType[2] === "binary") { // binary resources (images) keep their raw bytes
            res.setEncoding("binary");
        }
        res.on('data', function (chunk) {
            bufferHelper.concat(chunk);
        });
        res.on('end', function () { // all data received
            clearTimeout(timeoutEvent);

            self.debug && console.log("\n抓取URL:" + url + "成功\n");

            // Decode using the target site's encoding (e.g. GBK).
            var data = iconv.decode(bufferHelper.toBuffer(), self.encoding);

            self.handlerSuccess(data, aType, url, callback);

            data = null; // release the reference
        });
        res.on('error', function () {
            clearTimeout(timeoutEvent);
            self.handlerFailure(url);
            self.debug && console.log("服务器端响应失败URL:" + url + "\n");
        });
    }).on('error', function (err) {
        clearTimeout(timeoutEvent);
        self.handlerFailure(url);
        self.debug && console.log("\n抓取URL:" + url + "失败\n");
    }).on('finish', function () { // fired after end() has been called
        self.debug && console.log("\n开始抓取URL:" + url + "\n");
    });
    req.on("timeout", function () {
        // Re-queue a timed-out URL up to retryNum times, then give up on it.
        if (oResult.oRetryCount[url] == undefined) {
            oResult.oRetryCount[url] = 0;
            // The first timeout must also re-queue, otherwise the URL is lost
            // before any retry can happen.
            console.log("请求超时,调度到队列最后...");
            oResult.aNewURLQueue.unshift(url);
        } else if (oResult.oRetryCount[url] < self.retryNum) {
            oResult.oRetryCount[url]++;
            console.log("请求超时,调度到队列最后...");
            oResult.aNewURLQueue.unshift(url);
        }
        if (req.res) {
            req.res.emit("abort");
        }

        req.abort();
    });

    req.end(); // fire the request

};

/**
 * Update crawler configuration; must be called before go() to take effect.
 *
 * @param {Object} options same shape as the constructor options; omitted
 *                         fields keep their current values
 */
Robot.prototype.setOpt = function (options) {

    this.domain = options.domain || this.domain || ""; // domain of the site to crawl
    this.firstUrl = options.firstUrl || this.firstUrl || ""; // entry URL
    this.id = this.constructor.create(); // new unique identifier
    this.encoding = options.encoding || this.encoding || "UTF-8"; // page encoding
    this.outputPath = options.outputPath || this.outputPath || ""; // output directory
    this.outputFileName = options.outputFileName || this.outputFileName || "result.txt"; // result file name
    this.timeout = options.timeout || this.timeout || 5000; // request timeout (ms)
    this.retryNum = options.retryNum || this.retryNum || 5; // retries for timed-out requests
    // `|| true` would force robots checking on; honour an explicit false.
    this.robots = options.robots !== undefined ? options.robots
        : (this.robots !== undefined ? this.robots : true);

    this.debug = options.debug !== undefined ? options.debug : (this.debug || false); // debug logging

    this.file = new File({
        path: this.outputPath,
        filename: this.outputFileName
    });

    oResult.aNewURLQueue.push(this.firstUrl); // re-seed the queue with the entry URL

    this.handlerComplete = options.handlerComplete || this.handlerComplete || function () {
            console.log("抓取结束...");

            var str = "", i = 0, len = oResult.aTargetURLList.length, url;

            for (i = 0; i < len; i++) {

                url = oResult.aTargetURLList[i];
                str += "(" + oResult.oTargetInfoList[url].name + ") : " + url + "\n";

            }
            this.file.save(str, true);

            this.file.save("\n抓取完成...\n", true);
        };

};

/**
 * Success callback for a fetched page: load it into cheerio and hand
 * the parsed DOM to the user callback, then continue with the queue.
 *
 * @param {string} data fetched (decoded) page content
 * @param {Array} aType resource type triple from getResourceType()
 * @param {string} url URL that was fetched
 * @param {Function} [callback] user callback ($, aType, url, queues, targets)
 */
Robot.prototype.handlerSuccess = function (data, aType, url, callback) {

    if (callback) {

        var $ = cheerio.load(data);
        callback.call(this, $, aType, url, oResult.aNewURLQueue, oResult.aTargetURLList, oResult.oTargetInfoList);

        oResult.iSuccessNum++;
        this.go(callback);
    } else {
        this.go();
    }

};

/**
 * Move on to the next crawl task after a failed request.
 *
 * @param url the URL that failed (currently not re-queued; see the
 *            commented-out line below)
 */
Robot.prototype.handlerFailure= function(url){

    //oResult.aNewURLQueue.indexOf(url)==-1&&oResult.aNewURLQueue.unshift(url);
    this.go();

};




/**
 * Classify a response by its Content-Type header.
 *
 * @param {string} type Content-Type header value, e.g. "text/html; charset=utf-8"
 * @returns {Array|string} [major, minor, encoding] such as ["image","png","utf8"];
 *                         returns '' when the header is missing (kept for
 *                         backward compatibility — indexing '' yields
 *                         undefined, like a too-short array would)
 */
Robot.prototype.getResourceType = function (type) {
    if (!type) {
        return '';
    }
    var aType = type.split('/');
    aType.forEach(function (s, i, a) {
        a[i] = s.toLowerCase();
    });
    if (aType[1] && (aType[1].indexOf(';') > -1)) {
        var aTmp = aType[1].split(';');
        aType[1] = aTmp[0];
        for (var i = 1; i < aTmp.length; i++) {
            if (aTmp[i] && (aTmp[i].indexOf("charset") > -1)) {
                var aTmp2 = aTmp[i].split('='); // was an undeclared (strict-mode fatal) global
                // trim() strips both ends; the original non-global regex
                // only stripped one end of the value.
                aType[2] = aTmp2[1] ? aTmp2[1].trim().replace('-', '').toLowerCase() : '';
            }
        }
    }
    if ((["image"]).indexOf(aType[0]) > -1) { // images are fetched as raw bytes
        aType[2] = "binary";
    }
    return aType;
};


module.exports= Robot;

 

 

上面的功能都实现后,我们就可以开始来使用我们的爬虫系统了,首先,在app.js中调用我们的Robot模块

 

/**
 * Created by 汤文辉 on 2016-08-03.
 */
var express = require("express");
var Robot = require("./module/robot.js");
var schedule = require("node-schedule");


/**
 * Format a timestamp as "YYYY_MM_DD_HH_MM_SS" (zero-padded).
 *
 * @param {Date} [date] timestamp to format; defaults to "now"
 *                      (backward-compatible generalization: getTime()
 *                      still returns the current time)
 * @returns {string}
 */
function getTime(date){
    var oDate = date instanceof Date ? date : new Date();

    var pad = function (v) {
        return v < 10 ? "0" + v : v;
    };

    var y = oDate.getFullYear();
    var m = pad(oDate.getMonth() + 1); // getMonth() is 0-based
    var d = pad(oDate.getDate());
    var h = pad(oDate.getHours());
    var mi = pad(oDate.getMinutes());
    var s = pad(oDate.getSeconds());

    return y + "_" + m + "_" + d + "_" + h + "_" + mi + "_" + s;

}

// Crawler configuration for the movie site dytt8.net.
var options = {
    domain:"dytt8.net",// target site domain
    firstUrl:"http://www.dytt8.net/",// crawl entry URL
    outputPath:"./output/testRobot/",// directory for result files
    outputFileName:"test.txt",// result file name
    encoding:"GBK",// the site's pages are GBK-encoded
    timeout:6000,// request timeout in ms
    robots:true,// respect robots.txt
    debug:true,// verbose logging
    handlerComplete:function(oResult,file){

        console.log("抓取结束...");


        file.save("\n抓取完成...\n总共访问网页数为"+oResult.iCount+"条,其中成功访问网页数"+oResult.iSuccessNum+"条",true);

    }
};
var robot =new Robot(options);
// Detail pages, e.g. /html/gndy/dyzz/20160802/51234.html
var reg1 =/\/html\/[a-z0-9]+\/[a-z0-9]+\/[\d]+\/[\d]+\.html/gmi;
// Category index pages, e.g. /html/gndy/index.html
var reg2 =/\/html\/[a-z0-9]+\/index\.html/gmi;
//var reg3 = /(ftp|http):\/\/.+\.(rmvb|mp4|avi|flv|mkv|3gp|wmv|wav|mpg|mov)/gmi;


/**
 * Kick off one crawl pass. The callback receives the parsed page plus the
 * shared queues and decides which links to follow and which download
 * links (thunder:// / ftp://) to record.
 */
function start(){

    robot.go(function ($, aType, url, aNewURLQueue, aTargetURLList, oTargetInfoList) {

        var self = this;
        var pUrl = url;

        // Record one download target exactly once, keyed by its link.
        var recordTarget = function (targetUrl, name) {
            // The original truthy `indexOf(targetUrl)` check pushed duplicates
            // (and only skipped a URL when it happened to be element 0).
            if (aTargetURLList.indexOf(targetUrl) === -1) {
                aTargetURLList.push(targetUrl);
                oTargetInfoList[targetUrl] = {
                    name: name
                };
            }
            self.file.save(targetUrl + "\n", true);
        };

        // Queue a detail-page link when it matches reg1 and is not queued yet.
        var queueDetailLink = function (href) {
            if (href.indexOf("http://") == -1) {
                href = options.firstUrl + href.substring(1);
            }
            // reg1 has the /g flag: exec() is stateful, so reset lastIndex to
            // keep a previous call from silently skipping valid links.
            reg1.lastIndex = 0;
            if (reg1.exec(href)) {
                aNewURLQueue.indexOf(href) == -1 && aNewURLQueue.push(href);
            }
        };

        if (url === options.firstUrl) {

            $("a").each(function () {

                var href = $(this).attr('href');
                if (!href) { // anchors without href would crash indexOf()
                    return;
                }
                queueDetailLink(href);

            });

        } else {

            $('a').each(function () {

                var href = $(this).attr('href');
                if (!href) {
                    return;
                }
                reg2.lastIndex = 0; // reg2 is /g as well — reset before exec
                var res2 = reg2.exec(href);

                console.log("页面["+pUrl+"]二级页面:【"+ href + "】");

                if (href.indexOf("thunder://") != -1) {

                    var url = $(this).text().trim();
                    console.log("\n目标链接【"+$("h1").text().trim()+"】:"+url+"\n");
                    recordTarget(url, $("h1").text().trim());

                } else if (href.indexOf("ftp://") != -1) {

                    var url = $(this).attr("href");
                    console.log("\n目标链接【"+$("h1").text().trim()+"】:"+url+"\n");
                    recordTarget(url, $("h1").text().trim());

                } else if (res2) {

                    queueDetailLink(href);

                }

            });

        }

    });
}


// Recurrence rule: every day of the week (Sunday plus Monday-Saturday).
var rule =new schedule.RecurrenceRule();

rule.dayOfWeek= [0,new schedule.Range(1,6)];

// Fire at 19:45.
rule.hour= 19;

rule.minute= 45;

console.log("定时爬取任务,下次爬取时间为"+rule.hour+"时"+rule.minute+"分");

// Each run switches to a fresh, timestamped output file before crawling.
var j = schedule.scheduleJob(rule,function(){

    robot.setOpt({
        outputFileName:getTime()+"-"+"电影天堂.txt"
    });
    console.log("开始定时爬取任务...");
    start();

});

 

然后,我们在命令行中输入

 

 

 

node app.js

 

运行即可,爬虫将会在星期一~星期天的晚上19点45分定时爬取电影天堂电影下载链接,并输出到output目录中

 

 

猜你喜欢

转载自blog.csdn.net/u010651383/article/details/52107434