Crawling CSRC Penalty Announcements

Target listing page: http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re


class CfSpider(CrawlSpider):
    name = 'cf'
    allowed_domains = ['csrc.gov.cn']
    start_urls = ['http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm']

    rules = (
        # Follow links to individual announcement detail pages.
        Rule(LinkExtractor(allow=r'/G\d+/\d+/t\d+_\d+\.htm'), callback='parse_item'),
        # Rule(LinkExtractor(allow=r'/3300/3313/index_7401_.*?\.htm'), follow=True),
        # The pagination rule above did not work, so start_requests is overridden instead.
    )

    def start_requests(self):
        # The listing is paginated: page 0 is index_7401.htm, and later pages
        # are index_7401_1.htm, index_7401_2.htm, ..., up to page 66.
        current_page = 0
        while current_page < 67:
            if current_page == 0:
                next_url = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm'
            else:
                url = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401_{}.htm'
                next_url = url.format(current_page)
            yield scrapy.Request(
                url=next_url,
                callback=self.parse,  # CrawlSpider's parse applies the rules above
            )
            current_page += 1

    def parse_item(self, response):
        item = dict()
        item["title"] = response.xpath("//span[@id='lTitle']/text()").extract_first()
        # The publication date (e.g. 2020年05月22日) sits in a bare <span>,
        # so it is extracted from the raw HTML with a regex.
        item["pub_title"] = re.findall(r"<span>(20\d+年\d{2}月\d{2}日)</span>", response.body.decode(), re.S)
        item["pub_title"] = item["pub_title"][0] if item["pub_title"] else None
        item["index_number"] = response.xpath("//table[@id='headContainer']//tr[1]//td[@colspan='2']//td[1]/text()").extract_first()
        item["href"] = response.url
        yield item
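
The `allow` pattern in the Rule is easy to get wrong, so it can help to sanity-check the regex against a detail-page path by hand. The sample path below is hypothetical, made up to match the shape of CSRC announcement URLs rather than copied from the site:

import re

# Hypothetical announcement path, shaped like the URLs the rule targets.
sample = "/G00306212/202005/t20200522_376849.htm"
pattern = r"/G\d+/\d+/t\d+_\d+\.htm"
print(bool(re.search(pattern, sample)))  # True -> the Rule would follow this link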

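To try the spider without a full Scrapy project, a minimal runner sketch using Scrapy's CrawlerProcess could look like the following. The output file name and the FEEDS export settings are my assumptions (FEEDS requires Scrapy 2.1+), not part of the original post:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # Assumed export settings: write items as JSON lines to a local file.
    "FEEDS": {"csrc_items.jl": {"format": "jsonlines"}},
})
process.crawl(CfSpider)  # CfSpider as defined above
process.start()          # blocks until the crawl finishes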
Reposted from www.cnblogs.com/nuochengze/p/12944333.html