Pyspider crawler template (personal use) - for sites whose data has to be parsed as JSON

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-09-12 14:54:59
# Project: www_caijing_com_cn

"""
============================================= Crawler details ===================================================
Site name: 财经网 (Caijing)
Site domain: www_caijing_com_cn
URL to crawl: http://roll.caijing.com.cn/ajax_lists.php?modelid=0&time=0.45231726534172245

"""
from pyspider.libs.base_handler import *
import os
import sys
import time
import json
import random
import pathlib
import hashlib
from elasticsearch import Elasticsearch
# Basic Python packages used by this spider ##################
s = open('/home/git/PyspiderProfile/Retile.txt').read().split("@")
# Load the profile file: database connection settings as well as the basic Kafka settings
# Load the shared helper modules ############################
sys.path.append('/home/git/PyspiderProfile')
from GT import itemExiste_news
from GT import log
from GT import Quote
from GT import insertMysql
from GT import Mysql
from GT import Sentiment
from GT import Kafka

AGENT='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
SOURCE="财经网 "  # Site name
TABLE_NAME="www_caijing_com_cn"  # The dedicated database table for this site
URL_LIST={"http://roll.caijing.com.cn/ajax_lists.php?modelid=0&time=0.45231726534172245"}
class Handler(BaseHandler):
    crawl_config = {
        "headers": {
            "Cache-Control": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": AGENT,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "DNT": "1",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8"}
    }

    @every(minutes=3)
    def on_start(self):
        Mysql(s[0],s[1],s[2],s[3],TABLE_NAME).createpage()  # Create the table used for deduplication
        for URL in URL_LIST:
            self.crawl(URL, callback=self.index_page)

    @config(age=60 * 60)
    def index_page(self, response):
        myjson=response.text
        for each in json.loads(myjson):
            self.crawl(each["url"], callback=self.detail_page)

    # Handle entries from the list API: point each link at the concrete news article
    def detail_page(self, response):
        if not response.ok:
            return
        url = response.url  # Article URL
        bRepeated = itemExiste_news(s[0],s[1],s[2],s[3],TABLE_NAME,url).itemExisted()  # Deduplicate by article URL
        if bRepeated:
            log(TABLE_NAME,"repeat"+url+"-->this article has already been processed").log()
            return
        else:
            author=""
            try:
                print("ok")
                author= response.doc("#source_baidu").text()
            except:
                author = ""
            if author=="":
                try:
                    author=response.doc("div.ws-infor > a:nth-child(2)").text()
                except:
                    author = ""
            if author=="":
                try:
                    author=response.doc("#endWebsite > a").text()
                except:
                    author = ""
            if "评论" in author:
                author=author.split("评论")[0]
                if "[\xa0\xa0" in author:
                    author=author.replace("[\xa0\xa0","")
                if "\xa0\xa0]" in author:
                    author=author.replace("\xa0\xa0]","")
            print(author)
            source=SOURCE
            print("okok")
            title=Quote(response.doc('title').text()).DelQuote()

            publish_date=""

            publish_date=response.doc("#pubtime_baidu").text()
            print(publish_date)
            if "\xa0" in publish_date:
                publish_date=publish_date.replace("\xa0"," ")
            if publish_date=="":
                publish_date=response.doc("div.wzzzly > span:nth-child(2)").text()
            print(publish_date)
            # Normalize date separators: "/" and the Chinese markers 年/月/日 become "-" (or are dropped)
            if "/" in publish_date:
                publish_date=publish_date.replace("/","-")
            if "年" in publish_date:
                publish_date=publish_date.replace("年","-")
            if "月" in publish_date:
                publish_date=publish_date.replace("月","-")
            if "日" in publish_date:
                publish_date=publish_date.replace("日","")
            if ":" not in publish_date:
                publish_date=publish_date+" 00:00:00"
            if publish_date.count(":")==1:
                publish_date=publish_date+":00"
            if len(publish_date)>19:
                publish_date = publish_date[:19]
            if 0==len(publish_date):
                publish_date = '1970-01-01 00:00:00'
                log(TABLE_NAME,"publish_date is null "+url).log()
                return
            if not Quote(publish_date).is_valid_date():
                log(TABLE_NAME,"Error publish_date ="+url).log()
                publish_date = '1970-01-01 00:00:00'
                return
            print(publish_date)
            crawl_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
            content=""
            for each in response.doc('#the_content > p').items():
                if len(each('style').text())>0:
                    continue
                if len(each('script').text())>0:
                    continue
                if len(each('table').text())>0:
                    continue
                if len(each.text())>0:
                    content +=  '    ' + each.text() + '<br>'
            if content=="":
                for each in response.doc('#Content > p').items():
                    if len(each('style').text())>0:
                        continue
                    if len(each('script').text())>0:
                        continue
                    if len(each('table').text())>0:
                        continue
                    if len(each.text())>0:
                        content +=  '    ' + each.text() + '<br>'
            content = Quote(content).ConverQuote()
            print(content)
            if len(content) > 0:
                print("ok")
                insertMysql(s[0],s[1],s[2],s[3],TABLE_NAME,url,"").insertSQLpage()  # Record the URL in the dedup table
                self.send_msg(source,title,publish_date,url,crawl_time,content,author)
    # Body-processing module: extract the article body from the page, build the proper structure, and send it
    def send_msg(self,source,title,publish_date,url,crawl_time,content,author):
        # Sending, case 1
        """
        Parameters:
        source: site name, defined at the top of the spider
        title: article title
        publish_date: article publish time
        url: news article URL
        crawl_time: crawl time, i.e. the current time
        content: the crawled article body
        """
        # Create an MD5 hash object
        hl = hashlib.md5()
        strid = source+title+publish_date
        hl.update(strid.encode(encoding='utf-8'))
        # Use this when writing data directly into ES
        #_id=hl.hexdigest()
        # Use this when writing data into ES through Kafka
        id=hl.hexdigest()
        # The document that will be sent into ES
        data = {
            "url": url,
            "title": title,
            "publish_date":publish_date,
            "source":source,
            "content":content,
            "author":author,
            "crawl_time":crawl_time,
            "id":id    # Only needed when writing into ES through Kafka
        }
        print(data)
        # Change 1: switched from writing the data directly to pushing it through the Kafka queue
        # Kafka insert: define your own topic; here the topic is set directly to "news"
        Kafka(s[6],"news",data).SendToKafka()
        #es = Elasticsearch([{"host":"192.168.31.161","port":9200,"timeout":15000}])
        #es.index( index="news", doc_type="news", body=data, id=_id)
    # The exact parameters depend on your own setup
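
For reference, below is a minimal standalone sketch of the JSON handling the title refers to. It is an assumption-laden illustration, not part of the spider: it assumes the ajax_lists.php endpoint returns a JSON array of objects that each carry a "url" field (which is exactly what index_page above relies on), treats the "time" query parameter as a simple cache-buster, and uses the requests library only because it is convenient outside pyspider; the real spider fetches through self.crawl.

# Minimal sketch (assumptions noted above): fetch the list endpoint and parse it as JSON.
import json
import requests  # illustration only; the spider itself uses pyspider's self.crawl

LIST_URL = "http://roll.caijing.com.cn/ajax_lists.php?modelid=0&time=0.45231726534172245"

resp = requests.get(LIST_URL, timeout=10)
items = json.loads(resp.text)   # same parsing step as index_page: json.loads(response.text)

for item in items:
    # Each entry is expected to expose the article URL under the "url" key
    print(item.get("url"))

If the feed ever changes shape, json.loads will still succeed as long as the response is valid JSON, but the "url" lookup in index_page would need to be adjusted.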


Reposted from www.cnblogs.com/songdongdong6/p/10075546.html