Crawling CSRC Penalty Announcements

Target listing page: http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re


class CfSpider(CrawlSpider):
    name = 'cf'
    allowed_domains = ['csrc.gov.cn']
    start_urls = ['http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm']

    rules = (
        # Follow links to individual announcement detail pages.
        Rule(LinkExtractor(allow=r'/G\d+/\d+/t\d+_\d+\.htm'), callback='parse_item'),
        # Rule(LinkExtractor(allow=r'/3300/3313/index_7401_.*?\.htm'), follow=True),
        # The pagination rule above did not work, so start_requests is overridden instead.
    )

    def start_requests(self):
        # The listing is paginated: page 0 is index_7401.htm, and later pages
        # are index_7401_1.htm, index_7401_2.htm, ..., up to page 66.
        current_page = 0
        while current_page < 67:
            if current_page == 0:
                next_url = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm'
            else:
                url = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401_{}.htm'
                next_url = url.format(current_page)
            yield scrapy.Request(
                url=next_url,
                callback=self.parse,  # CrawlSpider's parse applies the rules above
            )
            current_page += 1

    def parse_item(self, response):
        item = dict()
        item["title"] = response.xpath("//span[@id='lTitle']/text()").extract_first()
        # The publication date (e.g. 2020年05月22日) sits in a bare <span>,
        # so it is extracted from the raw HTML with a regex.
        item["pub_title"] = re.findall(r"<span>(20\d+年\d{2}月\d{2}日)</span>", response.body.decode(), re.S)
        item["pub_title"] = item["pub_title"][0] if item["pub_title"] else None
        item["index_number"] = response.xpath("//table[@id='headContainer']//tr[1]//td[@colspan='2']//td[1]/text()").extract_first()
        item["href"] = response.url
        yield item
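
The `allow` pattern in the Rule is easy to get wrong, so it can help to sanity-check the regex against a detail-page path by hand. The sample path below is hypothetical, made up to match the shape of CSRC announcement URLs rather than copied from the site:

import re

# Hypothetical announcement path, shaped like the URLs the rule targets.
sample = "/G00306212/202005/t20200522_376849.htm"
pattern = r"/G\d+/\d+/t\d+_\d+\.htm"
print(bool(re.search(pattern, sample)))  # True -> the Rule would follow this link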

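To try the spider without a full Scrapy project, a minimal runner sketch using Scrapy's CrawlerProcess could look like the following. The output file name and the FEEDS export settings are my assumptions (FEEDS requires Scrapy 2.1+), not part of the original post:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # Assumed export settings: write items as JSON lines to a local file.
    "FEEDS": {"csrc_items.jl": {"format": "jsonlines"}},
})
process.crawl(CfSpider)  # CfSpider as defined above
process.start()          # blocks until the crawl finishes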
Reposted from www.cnblogs.com/nuochengze/p/12944333.html