#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-09-12 14:54:59
# Project: www_caijing_com_cn

"""
============================================= Crawler details ===================================================
Site name: Caijing (财经网)
Site domain: www_caijing_com_cn
URL to crawl: http://roll.caijing.com.cn/ajax_lists.php?modelid=0&time=0.45231726534172245
"""
from pyspider.libs.base_handler import *
import os
import sys
import time
import json
import random
import pathlib
import hashlib
from elasticsearch import Elasticsearch
# Basic Python imports ##################

s = open('/home/git/PyspiderProfile/Retile.txt').read().split("@")
# The profile supplies the database connection settings and the Kafka settings
# Pull in the shared configuration package ############################
sys.path.append('/home/git/PyspiderProfile')
from GT import itemExiste_news
from GT import log
from GT import Quote
from GT import insertMysql
from GT import Mysql
from GT import Sentiment
from GT import Kafka
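# Note on the profile layout (an assumption, inferred from the indices used below,
# not from the original post): Retile.txt is a single "@"-separated line, roughly
#   mysql_host@mysql_user@mysql_password@mysql_db@...@...@kafka_broker
# s[0]..s[3] feed the Mysql/insertMysql/itemExiste_news helpers and s[6] is the
# Kafka broker address passed to Kafka() in send_msg().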

AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
SOURCE = "财经网"  # Site name
TABLE_NAME = "www_caijing_com_cn"  # The dedup table dedicated to this site
URL_LIST = {"http://roll.caijing.com.cn/ajax_lists.php?modelid=0&time=0.45231726534172245"}
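# An assumption on my part: the time= query parameter looks like a cache-busting
# random float; the fixed value from the original post is reused here as-is,
# though random.random() could be substituted per request.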

class Handler(BaseHandler):
    crawl_config = {
        "headers": {
            "Cache-Control": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": AGENT,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "DNT": "1",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
        }
    }

    @every(minutes=3)  # Re-run on_start every 3 minutes to pick up new roll entries
    def on_start(self):
        Mysql(s[0], s[1], s[2], s[3], TABLE_NAME).createpage()  # Create the dedup table, used to avoid re-crawling
        for URL in URL_LIST:
            self.crawl(URL, callback=self.index_page)

    @config(age=60 * 60)  # A fetched task stays valid for an hour and is not re-crawled within that window
    def index_page(self, response):
        myjson = response.text
        for each in json.loads(myjson):
            self.crawl(each["url"], callback=self.detail_page)
    # List-API handler: parse the JSON roll list and queue each entry's link as a concrete news page
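    # The roll API is expected to return a JSON array of objects that each carry
    # a "url" field, roughly [{"url": "http://...", ...}, ...] -- a shape inferred
    # from the each["url"] access above, not from any API documentation.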

    def detail_page(self, response):
        if not response.ok:
            return
        url = response.url  # News URL
        bRepeated = itemExiste_news(s[0], s[1], s[2], s[3], TABLE_NAME, url).itemExisted()  # Deduplicate on the news URL
        if bRepeated:
            log(TABLE_NAME, "repeat" + url + "--> this article has already been processed").log()
            return
        else:
            # Try the author selectors in order until one yields text
            author = ""
            try:
                print("ok")
                author = response.doc("#source_baidu").text()
            except:
                author = ""
            if author == "":
                try:
                    author = response.doc("div.ws-infor > a:nth-child(2)").text()
                except:
                    author = ""
            if author == "":
                try:
                    author = response.doc("#endWebsite > a").text()
                except:
                    author = ""
            # Strip comment counts and bracket padding from the author string
            if " 评论" in author:
                author = author.split(" 评论")[0]
            if "[\xa0\xa0" in author:
                author = author.replace("[\xa0\xa0", "")
            if "\xa0\xa0]" in author:
                author = author.replace("\xa0\xa0]", "")
            print(author)
            source = SOURCE
            print("okok")
            title = Quote(response.doc('title').text()).DelQuote()

            # Normalize the publish date to "YYYY-MM-DD HH:MM:SS"
            publish_date = response.doc("#pubtime_baidu").text()
            print(publish_date)
            if "\xa0" in publish_date:
                publish_date = publish_date.replace("\xa0", " ")
            if publish_date == "":
                publish_date = response.doc("div.wzzzly > span:nth-child(2)").text()
                print(publish_date)
            if "/" in publish_date:
                publish_date = publish_date.replace("/", "-")
            if "年" in publish_date:
                publish_date = publish_date.replace("年", "-")
            if "月" in publish_date:
                publish_date = publish_date.replace("月", "-")
            if "日" in publish_date:
                publish_date = publish_date.replace("日", "")
            if ":" not in publish_date:
                publish_date = publish_date + " 00:00:00"
            if publish_date.count(":") == 1:
                publish_date = publish_date + ":00"
            if len(publish_date) > 19:
                publish_date = publish_date[:19]
            if len(publish_date) == 0:
                log(TABLE_NAME, "publish_date is null " + url).log()
                return  # Skip articles with no usable publish date
            if not Quote(publish_date).is_valid_date():
                log(TABLE_NAME, "Error publish_date =" + url).log()
                return  # Skip articles whose date failed to normalize
            print(publish_date)
            crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
            # Collect the article body, skipping style/script/table fragments
            content = ""
            for each in response.doc('#the_content > p').items():
                if len(each('style').text()) > 0:
                    continue
                if len(each('script').text()) > 0:
                    continue
                if len(each('table').text()) > 0:
                    continue
                if len(each.text()) > 0:
                    content += ' ' + each.text() + '<br>'
            if content == "":
                # Fall back to the alternative body container
                for each in response.doc('#Content > p').items():
                    if len(each('style').text()) > 0:
                        continue
                    if len(each('script').text()) > 0:
                        continue
                    if len(each('table').text()) > 0:
                        continue
                    if len(each.text()) > 0:
                        content += ' ' + each.text() + '<br>'
            content = Quote(content).ConverQuote()
            print(content)
            if len(content) > 0:
                print("ok")
                insertMysql(s[0], s[1], s[2], s[3], TABLE_NAME, url, "").insertSQLpage()  # Record the URL in the dedup table
                self.send_msg(source, title, publish_date, url, crawl_time, content, author)
    # Body handler: extract the article text from the page, assemble the record, and send it on
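    # Worked examples of the date-normalization chain above (traced by hand,
    # not produced by running the crawler):
    #   "2018/09/12 14:54"      -> "2018-09-12 14:54:00"  (slashes replaced, seconds appended)
    #   "2018年09月12日"         -> "2018-09-12 00:00:00"  (CJK date, midnight appended)
    #   "2018-09-12 14:54:59.0" -> "2018-09-12 14:54:59"  (truncated to 19 characters)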

    def send_msg(self, source, title, publish_date, url, crawl_time, content, author):
        # Dispatch path 1
        """
        Parameters:
            source: site name, defined at the top of the crawler
            title: article title
            publish_date: article publish time
            url: news URL
            crawl_time: crawl time, i.e. the current time
            content: crawled article body
        """
        # Build an MD5 digest of source + title + publish_date as the document id
        hl = hashlib.md5()
        strid = source + title + publish_date
        hl.update(strid.encode(encoding='utf-8'))
        # Use this when writing to ES directly:
        # _id = hl.hexdigest()
        # Use this when writing to ES through Kafka:
        id = hl.hexdigest()
        # Assemble the record to be sent to ES
        data = {
            "url": url,
            "title": title,
            "publish_date": publish_date,
            "source": source,
            "content": content,
            "author": author,
            "crawl_time": crawl_time,
            "id": id  # Only needed when writing to ES through Kafka
        }
        print(data)
        # Change 1: switched from writing to ES directly to pushing records through the Kafka queue
        # Push the record to Kafka; the topic ("news") is fixed and must be created by you
        Kafka(s[6], "news", data).SendToKafka()
        # es = Elasticsearch([{"host": "192.168.31.161", "port": 9200, "timeout": 15000}])
        # es.index(index="news", doc_type="news", body=data, id=_id)
        # Exact parameters depend on your deployment
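
For reference, a minimal standalone sketch of the dedup-id scheme used by send_msg above: the document id is the MD5 hex digest of source + title + publish_date, so re-crawling the same article always yields the same id and updates the existing ES document instead of duplicating it. The helper name make_doc_id is mine, not the template's; the template computes the digest inline.

import hashlib

def make_doc_id(source, title, publish_date):
    # Same recipe as send_msg: MD5 over the concatenated identity fields
    strid = source + title + publish_date
    return hashlib.md5(strid.encode('utf-8')).hexdigest()

# The same article always maps to the same id:
assert make_doc_id("财经网", "示例标题", "2018-09-12 14:54:59") == \
       make_doc_id("财经网", "示例标题", "2018-09-12 14:54:59")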
Pyspider crawler template (for my own use), for sites whose list pages need JSON parsing.
Reposted from www.cnblogs.com/songdongdong6/p/10075546.html