#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-09-12 14:54:59
# Project: www_caijing_com_cn

"""
============================================= Crawler details ===================================================
Site name: Caijing (财经网)
Site domain: www_caijing_com_cn
URL to crawl: http://roll.caijing.com.cn/ajax_lists.php?modelid=0&time=0.45231726534172245
"""
from pyspider.libs.base_handler import *
import os
import sys
import time
import json
import random
import pathlib
import hashlib
from elasticsearch import Elasticsearch
# Basic Python imports ##################

s = open('/home/git/PyspiderProfile/Retile.txt').read().split("@")
# The profile supplies the database connection settings and the Kafka settings
# Pull in the shared configuration package ############################
sys.path.append('/home/git/PyspiderProfile')
from GT import itemExiste_news
from GT import log
from GT import Quote
from GT import insertMysql
from GT import Mysql
from GT import Sentiment
from GT import Kafka
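# Note on the profile layout (an assumption, inferred from the indices used below,
# not from the original post): Retile.txt is a single "@"-separated line, roughly
#   mysql_host@mysql_user@mysql_password@mysql_db@...@...@kafka_broker
# s[0]..s[3] feed the Mysql/insertMysql/itemExiste_news helpers and s[6] is the
# Kafka broker address passed to Kafka() in send_msg().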

AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
SOURCE = "财经网"  # Site name
TABLE_NAME = "www_caijing_com_cn"  # The dedup table dedicated to this site
URL_LIST = {"http://roll.caijing.com.cn/ajax_lists.php?modelid=0&time=0.45231726534172245"}
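# An assumption on my part: the time= query parameter looks like a cache-busting
# random float; the fixed value from the original post is reused here as-is,
# though random.random() could be substituted per request.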

class Handler(BaseHandler):
    crawl_config = {
        "headers": {
            "Cache-Control": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": AGENT,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "DNT": "1",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
        }
    }

    @every(minutes=3)  # Re-run on_start every 3 minutes to pick up new roll entries
    def on_start(self):
        Mysql(s[0], s[1], s[2], s[3], TABLE_NAME).createpage()  # Create the dedup table, used to avoid re-crawling
        for URL in URL_LIST:
            self.crawl(URL, callback=self.index_page)

    @config(age=60 * 60)  # A fetched task stays valid for an hour and is not re-crawled within that window
    def index_page(self, response):
        myjson = response.text
        for each in json.loads(myjson):
            self.crawl(each["url"], callback=self.detail_page)
    # List-API handler: parse the JSON roll list and queue each entry's link as a concrete news page
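    # The roll API is expected to return a JSON array of objects that each carry
    # a "url" field, roughly [{"url": "http://...", ...}, ...] -- a shape inferred
    # from the each["url"] access above, not from any API documentation.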

    def detail_page(self, response):
        if not response.ok:
            return
        url = response.url  # News URL
        bRepeated = itemExiste_news(s[0], s[1], s[2], s[3], TABLE_NAME, url).itemExisted()  # Deduplicate on the news URL
        if bRepeated:
            log(TABLE_NAME, "repeat" + url + "--> this article has already been processed").log()
            return
        else:
            # Try the author selectors in order until one yields text
            author = ""
            try:
                print("ok")
                author = response.doc("#source_baidu").text()
            except:
                author = ""
            if author == "":
                try:
                    author = response.doc("div.ws-infor > a:nth-child(2)").text()
                except:
                    author = ""
            if author == "":
                try:
                    author = response.doc("#endWebsite > a").text()
                except:
                    author = ""
            # Strip comment counts and bracket padding from the author string
            if " 评论" in author:
                author = author.split(" 评论")[0]
            if "[\xa0\xa0" in author:
                author = author.replace("[\xa0\xa0", "")
            if "\xa0\xa0]" in author:
                author = author.replace("\xa0\xa0]", "")
            print(author)
            source = SOURCE
            print("okok")
            title = Quote(response.doc('title').text()).DelQuote()

            # Normalize the publish date to "YYYY-MM-DD HH:MM:SS"
            publish_date = response.doc("#pubtime_baidu").text()
            print(publish_date)
            if "\xa0" in publish_date:
                publish_date = publish_date.replace("\xa0", " ")
            if publish_date == "":
                publish_date = response.doc("div.wzzzly > span:nth-child(2)").text()
                print(publish_date)
            if "/" in publish_date:
                publish_date = publish_date.replace("/", "-")
            if "年" in publish_date:
                publish_date = publish_date.replace("年", "-")
            if "月" in publish_date:
                publish_date = publish_date.replace("月", "-")
            if "日" in publish_date:
                publish_date = publish_date.replace("日", "")
            if ":" not in publish_date:
                publish_date = publish_date + " 00:00:00"
            if publish_date.count(":") == 1:
                publish_date = publish_date + ":00"
            if len(publish_date) > 19:
                publish_date = publish_date[:19]
            if len(publish_date) == 0:
                log(TABLE_NAME, "publish_date is null " + url).log()
                return  # Skip articles with no usable publish date
            if not Quote(publish_date).is_valid_date():
                log(TABLE_NAME, "Error publish_date =" + url).log()
                return  # Skip articles whose date failed to normalize
            print(publish_date)
            crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
            # Collect the article body, skipping style/script/table fragments
            content = ""
            for each in response.doc('#the_content > p').items():
                if len(each('style').text()) > 0:
                    continue
                if len(each('script').text()) > 0:
                    continue
                if len(each('table').text()) > 0:
                    continue
                if len(each.text()) > 0:
                    content += ' ' + each.text() + '<br>'
            if content == "":
                # Fall back to the alternative body container
                for each in response.doc('#Content > p').items():
                    if len(each('style').text()) > 0:
                        continue
                    if len(each('script').text()) > 0:
                        continue
                    if len(each('table').text()) > 0:
                        continue
                    if len(each.text()) > 0:
                        content += ' ' + each.text() + '<br>'
            content = Quote(content).ConverQuote()
            print(content)
            if len(content) > 0:
                print("ok")
                insertMysql(s[0], s[1], s[2], s[3], TABLE_NAME, url, "").insertSQLpage()  # Record the URL in the dedup table
                self.send_msg(source, title, publish_date, url, crawl_time, content, author)
    # Body handler: extract the article text from the page, assemble the record, and send it on
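    # Worked examples of the date-normalization chain above (traced by hand,
    # not produced by running the crawler):
    #   "2018/09/12 14:54"      -> "2018-09-12 14:54:00"  (slashes replaced, seconds appended)
    #   "2018年09月12日"         -> "2018-09-12 00:00:00"  (CJK date, midnight appended)
    #   "2018-09-12 14:54:59.0" -> "2018-09-12 14:54:59"  (truncated to 19 characters)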

    def send_msg(self, source, title, publish_date, url, crawl_time, content, author):
        # Dispatch path 1
        """
        Parameters:
            source: site name, defined at the top of the crawler
            title: article title
            publish_date: article publish time
            url: news URL
            crawl_time: crawl time, i.e. the current time
            content: crawled article body
        """
        # Build an MD5 digest of source + title + publish_date as the document id
        hl = hashlib.md5()
        strid = source + title + publish_date
        hl.update(strid.encode(encoding='utf-8'))
        # Use this when writing to ES directly:
        # _id = hl.hexdigest()
        # Use this when writing to ES through Kafka:
        id = hl.hexdigest()
        # Assemble the record to be sent to ES
        data = {
            "url": url,
            "title": title,
            "publish_date": publish_date,
            "source": source,
            "content": content,
            "author": author,
            "crawl_time": crawl_time,
            "id": id  # Only needed when writing to ES through Kafka
        }
        print(data)
        # Change 1: switched from writing to ES directly to pushing records through the Kafka queue
        # Push the record to Kafka; the topic ("news") is fixed and must be created by you
        Kafka(s[6], "news", data).SendToKafka()
        # es = Elasticsearch([{"host": "192.168.31.161", "port": 9200, "timeout": 15000}])
        # es.index(index="news", doc_type="news", body=data, id=_id)
        # Exact parameters depend on your deployment
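
For reference, a minimal standalone sketch of the dedup-id scheme used by send_msg above: the document id is the MD5 hex digest of source + title + publish_date, so re-crawling the same article always yields the same id and updates the existing ES document instead of duplicating it. The helper name make_doc_id is mine, not the template's; the template computes the digest inline.

import hashlib

def make_doc_id(source, title, publish_date):
    # Same recipe as send_msg: MD5 over the concatenated identity fields
    strid = source + title + publish_date
    return hashlib.md5(strid.encode('utf-8')).hexdigest()

# The same article always maps to the same id:
assert make_doc_id("财经网", "示例标题", "2018-09-12 14:54:59") == \
       make_doc_id("财经网", "示例标题", "2018-09-12 14:54:59")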
Pyspider crawler template (for my own use), for sites whose list pages need JSON parsing.
Reposted from www.cnblogs.com/songdongdong6/p/10075546.html