Scrapy crawler walkthrough
Exercise 1: crawl job detail information from the Tencent careers site
Idea
First iterate over a list page to collect the detail-page URL of every position, then send a request to each detail page and extract the details from its response.
The URLs used below were all found via the browser's Inspect tool: Network -> XHR -> Request Headers -> Referer.
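Before writing the spider, it can help to confirm the structure of the JSON returned by the endpoint found under Network -> XHR. A minimal sketch using the requests library (the requests dependency and the hard-coded timestamp are taken from the captured request; the field names match those used in the spider below):
# quick standalone check of the list API found under Network -> XHR (not part of the project)
import json
import requests

url = ('https://careers.tencent.com/tencentcareer/api/post/Query'
       '?timestamp=1593307990592&countryId=&cityId=&bgIds=&productId=&categoryId='
       '&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn')
resp = requests.get(url)
data = json.loads(resp.text)
# each post carries PostId, RecruitPostName, CategoryName, ... (used later in the spider)
for post in data['Data']['Posts']:
    print(post['PostId'], post['RecruitPostName'])
If the site rejects plain requests, a browser-like User-Agent header (as configured later in settings.py) may be needed.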
Steps
Create the spider
# create the Scrapy project
scrapy startproject tencent
# enter the project directory
cd tencent
# create the spider
scrapy genspider hr tencent.com
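Running scrapy genspider hr tencent.com generates a skeleton spider; before any editing it typically looks roughly like this (exact content depends on the Scrapy version):
# -*- coding: utf-8 -*-
import scrapy


class HrSpider(scrapy.Spider):
    name = 'hr'
    allowed_domains = ['tencent.com']
    start_urls = ['http://tencent.com/']

    def parse(self, response):
        pass
The file below replaces this skeleton.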
Spider file hr.py
# -*- coding: utf-8 -*-
import scrapy
import json
# import the Item class that defines the data fields
from tencent.items import TencentItem


# https://careers.tencent.com/search.html cannot be used as the start URL because it does not contain the target content.
# Inspecting Network -> XHR shows that the list data actually comes from:
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1593306598402&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1593307990592&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=5&pageSize=10&language=zh-cn&area=cn
# Detail page API:
# https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1593306836293&postId=1123175307965108224&language=zh-cn
# Idea: iterate over the list pages to collect each position's detail-page URL,
# then request every detail page and extract the details from its response.
class HrSpider(scrapy.Spider):
    name = 'hr'
    allowed_domains = ['tencent.com']
    # one_urls = 'https://careers.tencent.com/search.html?index={}'
    one_urls = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1593307990592&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    job_detail_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1593306836293&postId={}&language=zh-cn'
    start_urls = [one_urls.format(1)]

    def parse(self, response):
        # crawl the first 10 list pages
        for page in range(1, 11):
            url = self.one_urls.format(page)
            # send a request for each list page
            # callback: custom callback that parses the response
            yield scrapy.Request(url=url, callback=self.parse_fun)
        # scrapy.Request(url, callback=None, method='GET', headers=None, body=None,
        #                cookies=None, meta=None, encoding='utf-8', priority=0,
        #                dont_filter=False, errback=None, flags=None)
        # Commonly used parameters:
        # callback: which parse function the response of this URL is handed to
        # meta: passes data between parse functions; meta also carries some built-in
        #       information such as the download delay and the request depth
        # dont_filter: tells Scrapy not to de-duplicate this URL. Scrapy filters
        #              duplicate URLs by default, so this is useful for URLs that
        #              must be requested more than once

    def parse_fun(self, response):
        datas = json.loads(response.text)
        for data in datas['Data']['Posts']:
            job_id = data['PostId']
            job_name = data['RecruitPostName']
            post_url = data['PostURL']  # detail page URL provided by the API (kept for reference)
            # item object that stores the job details
            item = TencentItem()
            item['job_name'] = job_name
            # build the detail-page URL from the job id
            item['job_detail_url'] = self.job_detail_url.format(job_id)
            item['job_category_name'] = data['CategoryName']
            # request the detail page and hand the item along via meta
            yield scrapy.Request(url=item['job_detail_url'], callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        details = json.loads(response.text)
        item = response.meta.get('item')
        item['job_responsibility'] = details['Data']['Responsibility'].replace('\n', '').replace('\t', '').replace('\u2028', '')
        item['job_requirement'] = details['Data']['Requirement'].replace('\n', '').replace('\t', '').replace('\u2028', '')
        yield item
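A side note on the timestamp query parameter that appears in both API URLs: it is copied verbatim from the captured requests and looks like a millisecond cache-busting value. Assuming the API accepts any recent value (an assumption, not verified here), it could also be generated on the fly:
# sketch: build the detail URL with a fresh millisecond timestamp
# (assumes the API accepts any recent value; the captured hard-coded one also works)
import time

def build_detail_url(post_id):
    ts = int(time.time() * 1000)
    return ('https://careers.tencent.com/tencentcareer/api/post/ByPostId'
            '?timestamp={}&postId={}&language=zh-cn'.format(ts, post_id))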
settings.py: enable or adjust the relevant settings
# Override the default request headers: uncomment this setting
DEFAULT_REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2816.400',
}
# enable the item pipeline so TencentPipeline (below) receives the items
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}
# add: raise the log level to keep the output readable
LOG_LEVEL = 'WARNING'
File items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_name = scrapy.Field()
    job_detail_url = scrapy.Field()
    job_category_name = scrapy.Field()
    job_responsibility = scrapy.Field()
    job_requirement = scrapy.Field()
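TencentItem behaves like a dictionary whose keys are restricted to the declared fields, which catches typos early. A small standalone illustration (run from inside the project so the import resolves):
from tencent.items import TencentItem

item = TencentItem()
item['job_name'] = 'backend engineer'   # allowed: the field is declared above
print(dict(item))                       # {'job_name': 'backend engineer'}
# item['salary'] = '20k'                # would raise KeyError: 'salary' is not a declared field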
File pipelines.py
class TencentPipeline:
    def process_item(self, item, spider):
        print('pipelines TencentPipeline process_item')
        print(item)
        return item
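TencentPipeline above only prints the items. To persist them, a common pattern is to append one JSON object per line; a minimal sketch of such a pipeline (the file name tencent_jobs.jsonl is an arbitrary choice), which would also need its own entry in ITEM_PIPELINES:
import json


class TencentSavePipeline:
    """Sketch: append each item to a JSON-lines file."""

    def open_spider(self, spider):
        # called once when the spider starts
        self.file = open('tencent_jobs.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(line + '\n')
        return item

    def close_spider(self, spider):
        # called once when the spider finishes
        self.file.close()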
Launcher file start.py
from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'hr'])
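cmdline.execute(['scrapy', 'crawl', 'hr']) is equivalent to running scrapy crawl hr in the project directory. If you prefer to start the spider through Scrapy's Python API instead, a sketch using CrawlerProcess:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from tencent.spiders.hr import HrSpider

process = CrawlerProcess(get_project_settings())
process.crawl(HrSpider)
process.start()  # blocks until the crawl is finished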
Exercise 2: Sunshine government affairs platform
Requirement
Crawl the detail content of every public inquiry.
Idea
First get the title of each inquiry on the list page together with the URL of its detail page.
Locate the start URL and check whether the page source already contains the target information; in this case it does. The first two list pages are:
http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1
http://wz.sun0769.com/political/index/politicsNewest?id=1&page=2
1. Find the li tags.
2. Inside each li, get the href and the title.
3. Follow the detail page and extract the full content.
4. Handle pagination. (The XPath expressions used in these steps can be checked outside the spider first; see the sketch right after this list.)
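The XPaths can be verified with parsel, the library behind response.xpath. A sketch, assuming the first list page has been saved locally as page1.html (an arbitrary file name):
from parsel import Selector

# page1.html: a locally saved copy of
# http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1
with open('page1.html', encoding='utf-8') as f:
    sel = Selector(text=f.read())

for li in sel.xpath('//ul[@class="title-state-ul"]/li'):
    title = li.xpath('./span[3]/a/text()').get()
    href = li.xpath('./span[3]/a/@href').get()
    print(title, href)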
Create the project
scrapy startproject yangguang
cd yangguang
# create the spider (this step mirrors exercise 1)
scrapy genspider ygjz wz.sun0769.com
settings.py
# Configure item pipelines: enable the pipeline so it receives the items
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'yangguang.pipelines.YangguangPipeline': 300,
}
# raise the log level to keep the output readable
LOG_LEVEL = 'WARNING'
Write the spider file ygjz.py
# -*- coding: utf-8 -*-
import scrapy
# import the Item class that defines the data fields
from yangguang.items import YangguangItem


class YgjzSpider(scrapy.Spider):
    name = 'ygjz'
    allowed_domains = ['wz.sun0769.com']
    url_page = 'http://wz.sun0769.com/political/index/politicsNewest?id=1&page={}'
    start_urls = [url_page.format(1)]
    url_head = 'http://wz.sun0769.com'

    def parse(self, response):
        # the target information is in the page source, so XPath works directly on the response
        list_li = response.xpath('//ul[@class="title-state-ul"]/li')
        for li in list_li:
            item = YangguangItem()
            item['title'] = li.xpath('./span[3]/a/text()').extract_first()
            item['href'] = self.url_head + li.xpath('./span[3]/a/@href').extract_first()
            # follow the detail page, passing the item along via meta
            yield scrapy.Request(url=item['href'], callback=self.parse_detail,
                                 meta={'item': item})
        # pagination: the second <a> in the paging box is the "next page" link;
        # check it before concatenating, so the last page does not raise a TypeError
        next_href = response.xpath('//div[@class="mr-three paging-box"]/a[2]/@href').extract_first()
        if next_href:
            yield scrapy.Request(url=self.url_head + next_href, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta.get('item')
        detail = response.xpath('//div[@class="details-box"]/pre/text()').extract_first()
        item['content'] = detail
        yield item  # hand the item to the pipeline
File pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import re


class YangguangPipeline:
    def process_item(self, item, spider):
        # strip the Windows line breaks from the detail text
        item['content'] = re.sub(r'\r\n', '', item['content'])
        print(item)
        return item
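If a detail page has no details-box, content arrives as None and the re.sub call above raises a TypeError. A variant of the pipeline that drops such items using Scrapy's DropItem exception (a sketch, not required by the exercise):
import re

from scrapy.exceptions import DropItem


class YangguangPipeline:
    def process_item(self, item, spider):
        # discard items whose detail text could not be extracted
        if not item.get('content'):
            raise DropItem('missing content: {}'.format(item.get('href')))
        item['content'] = re.sub(r'\r\n', '', item['content'])
        print(item)
        return item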
File items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class YangguangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    href = scrapy.Field()
    content = scrapy.Field()
Project launcher file start.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time     : 2020/6/29 11:29
# @Author   : GodSpeed
# @File     : start.py
# @Software : PyCharm
from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'ygjz'])
Summary of using Scrapy
1. Analyse the pages and determine the start URL.
2. Create the project and generate the spider.
3. In settings.py set the log level and enable the pipeline.
4. Define the data fields of the items in items.py.
5. Implement the crawl logic in the spider file: yield, xpath, yield scrapy.Request, callback, meta, extract_first().
6. Implement the data handling in pipelines.py.
The whole pattern is condensed into the skeleton right after this list.
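Condensed into one place, the list-page -> detail-page pattern used in both exercises looks roughly like the skeleton below (myproject, MyItem, the URLs and the XPaths are placeholders, not a real project):
import scrapy

from myproject.items import MyItem  # hypothetical project and item


class ListDetailSpider(scrapy.Spider):
    """Skeleton of the list-page -> detail-page pattern used above."""
    name = 'list_detail'
    start_urls = ['https://example.com/list?page=1']  # placeholder

    def parse(self, response):
        # 1. extract one item per entry on the list page
        for entry in response.xpath('//li'):
            item = MyItem()
            item['title'] = entry.xpath('./a/text()').extract_first()
            detail_url = response.urljoin(entry.xpath('./a/@href').extract_first())
            # 2. follow the detail page, carrying the item in meta
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        # 3. follow the next list page, if any
        next_href = response.xpath('//a[@class="next"]/@href').extract_first()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_detail(self, response):
        item = response.meta.get('item')
        item['content'] = response.xpath('//div[@class="content"]/text()').extract_first()
        yield item  # hand the finished item to the pipeline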
Exercise 3: crawl the WeChat mini-program community
Mini-program community URL
http://www.wxapp-union.com/portal.php?mod=list&catid=1
Create the project
scrapy startproject wxapp_union
Create the spider (the -t crawl option generates a CrawlSpider template with link-extraction rules)
cd wxapp_union
scrapy genspider -t crawl cwl_wxapp wxapp-union.com
cwl_wxapp.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import WxappUnionItem


class CwlWxappSpider(CrawlSpider):
    name = 'cwl_wxapp'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=1&page=1']
    # list pages look like:
    # http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=6
    # detail pages look like:
    # http://www.wxapp-union.com/article-5667-1.html
    # http://www.wxapp-union.com/article-5663-1.html
    rules = (
        # list pages: no callback, just keep following them to discover more links
        Rule(LinkExtractor(allow=r'http://www.wxapp-union.com/portal.php\?mod=list&catid=1&page=\d+'), follow=True),
        # detail pages: hand each matched article to parse_item
        Rule(LinkExtractor(allow=r'http://www.wxapp-union.com/article-\d+-1.html'), callback='parse_item'),
    )

    def parse_item(self, response):
        item = WxappUnionItem()
        item['title'] = response.xpath('//div[@class="cl"]/h1/text()').extract_first()
        item['p_date'] = response.xpath('//p[@class="authors"]/span/text()').extract_first()
        yield item
items.py
import scrapy


class WxappUnionItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    p_date = scrapy.Field()
pipelines.py
class WxappUnionPipeline:
    def process_item(self, item, spider):
        print(item)
        return item
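To persist the articles instead of only printing them, one option is Scrapy's built-in CsvItemExporter. A sketch of a variant pipeline (the file name wxapp_articles.csv is an arbitrary choice), which would also need to be registered in ITEM_PIPELINES:
from scrapy.exporters import CsvItemExporter


class WxappCsvPipeline:
    """Sketch: export every item to a CSV file."""

    def open_spider(self, spider):
        # the exporter expects a binary file object
        self.file = open('wxapp_articles.csv', 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
A simpler alternative, without any pipeline code, is Scrapy's feed export: scrapy crawl cwl_wxapp -o articles.csv.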
settings.py
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wxapp_union.pipelines.WxappUnionPipeline': 300,
}
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True   # commented out so the crawler is not restricted by robots.txt
LOG_LEVEL = 'WARNING'
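Like the previous two exercises, this project can be launched with a small start.py that wraps scrapy crawl:
from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'cwl_wxapp'])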