Scrapy crawler walkthrough
Exercise 1: crawl job detail information from the Tencent careers site
Idea
First iterate over a list page to collect the detail-page URL of every position, then send a request to each detail page and extract the details from its response.
The URLs used below were all found via the browser's Inspect tool: Network -> XHR -> Request Headers -> Referer.
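Before writing the spider, it can help to confirm the structure of the JSON returned by the endpoint found under Network -> XHR. A minimal sketch using the requests library (the requests dependency and the hard-coded timestamp are taken from the captured request; the field names match those used in the spider below):
# quick standalone check of the list API found under Network -> XHR (not part of the project)
import json
import requests

url = ('https://careers.tencent.com/tencentcareer/api/post/Query'
       '?timestamp=1593307990592&countryId=&cityId=&bgIds=&productId=&categoryId='
       '&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn')
resp = requests.get(url)
data = json.loads(resp.text)
# each post carries PostId, RecruitPostName, CategoryName, ... (used later in the spider)
for post in data['Data']['Posts']:
    print(post['PostId'], post['RecruitPostName'])
If the site rejects plain requests, a browser-like User-Agent header (as configured later in settings.py) may be needed.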
Steps
Create the spider
# create the Scrapy project
scrapy startproject tencent
# enter the project directory
cd tencent
# create the spider
scrapy genspider hr tencent.com
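Running scrapy genspider hr tencent.com generates a skeleton spider; before any editing it typically looks roughly like this (exact content depends on the Scrapy version):
# -*- coding: utf-8 -*-
import scrapy


class HrSpider(scrapy.Spider):
    name = 'hr'
    allowed_domains = ['tencent.com']
    start_urls = ['http://tencent.com/']

    def parse(self, response):
        pass
The file below replaces this skeleton.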
Spider file hr.py
# -*- coding: utf-8 -*-
import scrapy
import json
# import the Item class that defines the data fields
from tencent.items import TencentItem


# https://careers.tencent.com/search.html cannot be used as the start URL because it does not contain the target content.
# Inspecting Network -> XHR shows that the list data actually comes from:
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1593306598402&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1593307990592&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=5&pageSize=10&language=zh-cn&area=cn
# Detail page API:
# https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1593306836293&postId=1123175307965108224&language=zh-cn
# Idea: iterate over the list pages to collect each position's detail-page URL,
# then request every detail page and extract the details from its response.
class HrSpider(scrapy.Spider):
    name = 'hr'
    allowed_domains = ['tencent.com']
    # one_urls = 'https://careers.tencent.com/search.html?index={}'
    one_urls = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1593307990592&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    job_detail_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1593306836293&postId={}&language=zh-cn'
    start_urls = [one_urls.format(1)]

    def parse(self, response):
        # crawl the first 10 list pages
        for page in range(1, 11):
            url = self.one_urls.format(page)
            # send a request for each list page
            # callback: custom callback that parses the response
            yield scrapy.Request(url=url, callback=self.parse_fun)
        # scrapy.Request(url, callback=None, method='GET', headers=None, body=None,
        #                cookies=None, meta=None, encoding='utf-8', priority=0,
        #                dont_filter=False, errback=None, flags=None)
        # Commonly used parameters:
        # callback: which parse function the response of this URL is handed to
        # meta: passes data between parse functions; meta also carries some built-in
        #       information such as the download delay and the request depth
        # dont_filter: tells Scrapy not to de-duplicate this URL. Scrapy filters
        #              duplicate URLs by default, so this is useful for URLs that
        #              must be requested more than once

    def parse_fun(self, response):
        datas = json.loads(response.text)
        for data in datas['Data']['Posts']:
            job_id = data['PostId']
            job_name = data['RecruitPostName']
            post_url = data['PostURL']  # detail page URL provided by the API (kept for reference)
            # item object that stores the job details
            item = TencentItem()
            item['job_name'] = job_name
            # build the detail-page URL from the job id
            item['job_detail_url'] = self.job_detail_url.format(job_id)
            item['job_category_name'] = data['CategoryName']
            # request the detail page and hand the item along via meta
            yield scrapy.Request(url=item['job_detail_url'], callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        details = json.loads(response.text)
        item = response.meta.get('item')
        item['job_responsibility'] = details['Data']['Responsibility'].replace('\n', '').replace('\t', '').replace('\u2028', '')
        item['job_requirement'] = details['Data']['Requirement'].replace('\n', '').replace('\t', '').replace('\u2028', '')
        yield item
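A side note on the timestamp query parameter that appears in both API URLs: it is copied verbatim from the captured requests and looks like a millisecond cache-busting value. Assuming the API accepts any recent value (an assumption, not verified here), it could also be generated on the fly:
# sketch: build the detail URL with a fresh millisecond timestamp
# (assumes the API accepts any recent value; the captured hard-coded one also works)
import time

def build_detail_url(post_id):
    ts = int(time.time() * 1000)
    return ('https://careers.tencent.com/tencentcareer/api/post/ByPostId'
            '?timestamp={}&postId={}&language=zh-cn'.format(ts, post_id))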
settings.py: enable or adjust the relevant settings
# Override the default request headers: uncomment this setting
DEFAULT_REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2816.400',
}
# enable the item pipeline so TencentPipeline (below) receives the items
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}
# add: raise the log level to keep the output readable
LOG_LEVEL = 'WARNING'
File items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_name = scrapy.Field()
    job_detail_url = scrapy.Field()
    job_category_name = scrapy.Field()
    job_responsibility = scrapy.Field()
    job_requirement = scrapy.Field()
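TencentItem behaves like a dictionary whose keys are restricted to the declared fields, which catches typos early. A small standalone illustration (run from inside the project so the import resolves):
from tencent.items import TencentItem

item = TencentItem()
item['job_name'] = 'backend engineer'   # allowed: the field is declared above
print(dict(item))                       # {'job_name': 'backend engineer'}
# item['salary'] = '20k'                # would raise KeyError: 'salary' is not a declared field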
File pipelines.py
class TencentPipeline:
    def process_item(self, item, spider):
        print('pipelines TencentPipeline process_item')
        print(item)
        return item
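TencentPipeline above only prints the items. To persist them, a common pattern is to append one JSON object per line; a minimal sketch of such a pipeline (the file name tencent_jobs.jsonl is an arbitrary choice), which would also need its own entry in ITEM_PIPELINES:
import json


class TencentSavePipeline:
    """Sketch: append each item to a JSON-lines file."""

    def open_spider(self, spider):
        # called once when the spider starts
        self.file = open('tencent_jobs.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(line + '\n')
        return item

    def close_spider(self, spider):
        # called once when the spider finishes
        self.file.close()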
Launcher file start.py
from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'hr'])
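cmdline.execute(['scrapy', 'crawl', 'hr']) is equivalent to running scrapy crawl hr in the project directory. If you prefer to start the spider through Scrapy's Python API instead, a sketch using CrawlerProcess:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from tencent.spiders.hr import HrSpider

process = CrawlerProcess(get_project_settings())
process.crawl(HrSpider)
process.start()  # blocks until the crawl is finished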
Exercise 2: Sunshine government affairs platform
Requirement
Crawl the detail content of every public inquiry.
Idea
First get the title of each inquiry on the list page together with the URL of its detail page.
Locate the start URL and check whether the page source already contains the target information; in this case it does. The first two list pages are:
http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1
http://wz.sun0769.com/political/index/politicsNewest?id=1&page=2
1. Find the li tags.
2. Inside each li, get the href and the title.
3. Follow the detail page and extract the full content.
4. Handle pagination. (The XPath expressions used in these steps can be checked outside the spider first; see the sketch right after this list.)
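The XPaths can be verified with parsel, the library behind response.xpath. A sketch, assuming the first list page has been saved locally as page1.html (an arbitrary file name):
from parsel import Selector

# page1.html: a locally saved copy of
# http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1
with open('page1.html', encoding='utf-8') as f:
    sel = Selector(text=f.read())

for li in sel.xpath('//ul[@class="title-state-ul"]/li'):
    title = li.xpath('./span[3]/a/text()').get()
    href = li.xpath('./span[3]/a/@href').get()
    print(title, href)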
Create the project
scrapy startproject yangguang
cd yangguang
# create the spider (this step mirrors exercise 1)
scrapy genspider ygjz wz.sun0769.com
settings.py
# Configure item pipelines: enable the pipeline so it receives the items
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'yangguang.pipelines.YangguangPipeline': 300,
}
# raise the log level to keep the output readable
LOG_LEVEL = 'WARNING'
Write the spider file ygjz.py
# -*- coding: utf-8 -*-
import scrapy
# import the Item class that defines the data fields
from yangguang.items import YangguangItem


class YgjzSpider(scrapy.Spider):
    name = 'ygjz'
    allowed_domains = ['wz.sun0769.com']
    url_page = 'http://wz.sun0769.com/political/index/politicsNewest?id=1&page={}'
    start_urls = [url_page.format(1)]
    url_head = 'http://wz.sun0769.com'

    def parse(self, response):
        # the target information is in the page source, so XPath works directly on the response
        list_li = response.xpath('//ul[@class="title-state-ul"]/li')
        for li in list_li:
            item = YangguangItem()
            item['title'] = li.xpath('./span[3]/a/text()').extract_first()
            item['href'] = self.url_head + li.xpath('./span[3]/a/@href').extract_first()
            # follow the detail page, passing the item along via meta
            yield scrapy.Request(url=item['href'], callback=self.parse_detail,
                                 meta={'item': item})
        # pagination: the second <a> in the paging box is the "next page" link;
        # check it before concatenating, so the last page does not raise a TypeError
        next_href = response.xpath('//div[@class="mr-three paging-box"]/a[2]/@href').extract_first()
        if next_href:
            yield scrapy.Request(url=self.url_head + next_href, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta.get('item')
        detail = response.xpath('//div[@class="details-box"]/pre/text()').extract_first()
        item['content'] = detail
        yield item  # hand the item to the pipeline
File pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import re


class YangguangPipeline:
    def process_item(self, item, spider):
        # strip the Windows line breaks from the detail text
        item['content'] = re.sub(r'\r\n', '', item['content'])
        print(item)
        return item
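If a detail page has no details-box, content arrives as None and the re.sub call above raises a TypeError. A variant of the pipeline that drops such items using Scrapy's DropItem exception (a sketch, not required by the exercise):
import re

from scrapy.exceptions import DropItem


class YangguangPipeline:
    def process_item(self, item, spider):
        # discard items whose detail text could not be extracted
        if not item.get('content'):
            raise DropItem('missing content: {}'.format(item.get('href')))
        item['content'] = re.sub(r'\r\n', '', item['content'])
        print(item)
        return item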
File items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class YangguangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    href = scrapy.Field()
    content = scrapy.Field()
Project launcher file start.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time     : 2020/6/29 11:29
# @Author   : GodSpeed
# @File     : start.py
# @Software : PyCharm
from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'ygjz'])
Summary of using Scrapy
1. Analyse the pages and determine the start URL.
2. Create the project and generate the spider.
3. In settings.py set the log level and enable the pipeline.
4. Define the data fields of the items in items.py.
5. Implement the crawl logic in the spider file: yield, xpath, yield scrapy.Request, callback, meta, extract_first().
6. Implement the data handling in pipelines.py.
The whole pattern is condensed into the skeleton right after this list.
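Condensed into one place, the list-page -> detail-page pattern used in both exercises looks roughly like the skeleton below (myproject, MyItem, the URLs and the XPaths are placeholders, not a real project):
import scrapy

from myproject.items import MyItem  # hypothetical project and item


class ListDetailSpider(scrapy.Spider):
    """Skeleton of the list-page -> detail-page pattern used above."""
    name = 'list_detail'
    start_urls = ['https://example.com/list?page=1']  # placeholder

    def parse(self, response):
        # 1. extract one item per entry on the list page
        for entry in response.xpath('//li'):
            item = MyItem()
            item['title'] = entry.xpath('./a/text()').extract_first()
            detail_url = response.urljoin(entry.xpath('./a/@href').extract_first())
            # 2. follow the detail page, carrying the item in meta
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        # 3. follow the next list page, if any
        next_href = response.xpath('//a[@class="next"]/@href').extract_first()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_detail(self, response):
        item = response.meta.get('item')
        item['content'] = response.xpath('//div[@class="content"]/text()').extract_first()
        yield item  # hand the finished item to the pipeline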
Exercise 3: crawl the WeChat mini-program community
Mini-program community URL
http://www.wxapp-union.com/portal.php?mod=list&catid=1
Create the project
scrapy startproject wxapp_union
Create the spider (the -t crawl option generates a CrawlSpider template with link-extraction rules)
cd wxapp_union
scrapy genspider -t crawl cwl_wxapp wxapp-union.com
cwl_wxapp.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import WxappUnionItem


class CwlWxappSpider(CrawlSpider):
    name = 'cwl_wxapp'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=1&page=1']
    # list pages look like:
    # http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=6
    # detail pages look like:
    # http://www.wxapp-union.com/article-5667-1.html
    # http://www.wxapp-union.com/article-5663-1.html
    rules = (
        # list pages: no callback, just keep following them to discover more links
        Rule(LinkExtractor(allow=r'http://www.wxapp-union.com/portal.php\?mod=list&catid=1&page=\d+'), follow=True),
        # detail pages: hand each matched article to parse_item
        Rule(LinkExtractor(allow=r'http://www.wxapp-union.com/article-\d+-1.html'), callback='parse_item'),
    )

    def parse_item(self, response):
        item = WxappUnionItem()
        item['title'] = response.xpath('//div[@class="cl"]/h1/text()').extract_first()
        item['p_date'] = response.xpath('//p[@class="authors"]/span/text()').extract_first()
        yield item
items.py
import scrapy


class WxappUnionItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    p_date = scrapy.Field()
pipelines.py
class WxappUnionPipeline:
    def process_item(self, item, spider):
        print(item)
        return item
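To persist the articles instead of only printing them, one option is Scrapy's built-in CsvItemExporter. A sketch of a variant pipeline (the file name wxapp_articles.csv is an arbitrary choice), which would also need to be registered in ITEM_PIPELINES:
from scrapy.exporters import CsvItemExporter


class WxappCsvPipeline:
    """Sketch: export every item to a CSV file."""

    def open_spider(self, spider):
        # the exporter expects a binary file object
        self.file = open('wxapp_articles.csv', 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
A simpler alternative, without any pipeline code, is Scrapy's feed export: scrapy crawl cwl_wxapp -o articles.csv.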
settings.py
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wxapp_union.pipelines.WxappUnionPipeline': 300,
}
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True   # commented out so the crawler is not restricted by robots.txt
LOG_LEVEL = 'WARNING'
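Like the previous two exercises, this project can be launched with a small start.py that wraps scrapy crawl:
from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'cwl_wxapp'])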