Setup process:
Install the dependency:
pip install scrapy
Create a project:
scrapy startproject projectname
Create a spider (crawl task):
cd projectname
scrapy genspider spidername website.com
where website.com is the site you want to crawl.
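For reference, genspider creates a minimal spider skeleton under projectname/spiders/, roughly like the one below (the exact template varies with the Scrapy version):

import scrapy

class SpidernameSpider(scrapy.Spider):
    name = "spidername"
    allowed_domains = ["website.com"]
    start_urls = ["https://website.com/"]

    def parse(self, response):
        pass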
Settings changes (in settings.py):
# Item pipelines
ITEM_PIPELINES = {
    'Recruitment.pipelines.RecruitmentPipeline': 300,
}
# Whether to send our custom cookies
COOKIES_ENABLED = True
# Whether to obey robots.txt rules
ROBOTSTXT_OBEY = False
Note: COOKIES_ENABLED = True means the custom cookies we attach to requests will actually be used; ROBOTSTXT_OBEY controls whether the crawler obeys the site's robots.txt.
If you need to debug, create a run.py file in the project root directory (alongside scrapy.cfg) and launch it under your IDE's debugger. The layout then follows the standard Scrapy structure, with run.py added at the top level:

Recruitment/
    scrapy.cfg
    run.py
    Recruitment/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            zhaopin.py

The content of run.py is:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2023/4/4 17:44
# @Author  : sparkle_code_guy
from scrapy import cmdline

# 'zhaopin' is the name of the spider to run
cmdline.execute(['scrapy', 'crawl', 'zhaopin'])
To see the list of spiders in the project, run on the command line:
scrapy list
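The command prints the registered spider names, one per line; for this project the output is simply:

zhaopin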
The spider file is shown below. start_requests is where the crawl begins (Scrapy supplies a default built from start_urls, but here we override it to attach cookies and headers); parse is the callback that parses each response and must be implemented.
import scrapy


class ZhaopinSpider(scrapy.Spider):
    name = 'zhaopin'
    allowed_domains = ['zhaopin.com']

    def start_requests(self):
        # placeholder: paste the raw Cookie header copied from your logged-in browser
        cookies = "your cookie"
        self.cookies = {i.split("=", 1)[0]: i.split("=", 1)[1] for i in cookies.split("; ")}
        self.headers = {
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        }
        yield scrapy.Request(
            'https://www.zhaopin.com/',
            callback=self.parse,
            cookies=self.cookies,
            headers=self.headers
        )

    def parse(self, response):
        # the cookies attached to the first request are reused for every city query
        start_city = 480
        end_city = 950
        print("start crawling")
        for i in range(start_city, end_city):
            print("city ID:", i)
            url_city = "https://sou.zhaopin.com/?jl={0}".format(i)
            yield scrapy.Request(
                url=url_city,
                callback=self.parse_page,
                cookies=self.cookies,
                headers=self.headers
            )

    def parse_page(self, response):
        next_page = response.xpath(
            "//div[@class='pagination clearfix']//div[@class='pagination__pages']"
            "//button[@class='btn soupager__btn soupager__btn--disable']/text()").extract_first()
        if next_page == "下一页":  # the "next page" button is disabled: single result page
            print("crawling single page:")
            yield scrapy.Request(
                url=response.request.url,
                callback=self.parse_zp,
                cookies=self.cookies,
                headers=self.headers
            )
        elif response.xpath(
                "//div[@class='sou-main']//div[@class='sou-main__center clearfix']"
                "//div[@class='positionList-hook']//div[@class='page-empty__tips']"
                "//span/text()").extract_first() is not None:
            print("no results:", response.request.url)
            return
        else:
            print("crawling paginated results:")
            for page in range(1, 39):  # result pages 1..38
                url_page = response.request.url + "&p={0}".format(page)
                yield scrapy.Request(
                    url=url_page,
                    callback=self.parse_zp,
                    cookies=self.cookies,
                    headers=self.headers
                )

    def parse_zp(self, response):
        list_body = response.xpath("//div[@class='joblist-box__item clearfix']")
        print("URL:", response.request.url)
        for body in list_body:
            item = {}  # fresh dict per job posting so yielded items do not share state
            # job title
            item['title'] = body.xpath(
                ".//div[@class='iteminfo__line iteminfo__line1']"
                "//div[@class='iteminfo__line1__jobname']"
                "//span[@class='iteminfo__line1__jobname__name']/text()").extract_first()
            list_li = body.xpath(".//div[@class='iteminfo__line iteminfo__line2']//ul//li")
            # education requirement
            item['Education'] = list_li[2].xpath("./text()").extract_first()
            # job location
            item['job_location'] = list_li[0].xpath("./text()").extract_first()
            # required working experience
            item['job_time'] = list_li[1].xpath("./text()").extract_first()
            # salary
            money = body.xpath(
                ".//div[@class='iteminfo__line iteminfo__line2']"
                "//div[@class='iteminfo__line2__jobdesc']//p/text()").extract_first()
            item['money'] = money.split() if money else None
            # welfare / requirement tags
            info = body.xpath(
                ".//div[@class='iteminfo__line iteminfo__line3']"
                "//div[@class='iteminfo__line3__welfare']//div")
            info_list = []
            for i in info:
                info_list.append(i.xpath("./text()").extract_first())
            item['job_info'] = " ".join([t for t in info_list if t])
            # company name
            item['Company_name'] = body.xpath(
                ".//div[@class='iteminfo__line iteminfo__line1']"
                "//div[@class='iteminfo__line1__compname']"
                "//span[@class='iteminfo__line1__compname__name']/text()").extract_first()
            company = body.xpath(
                ".//div[@class='iteminfo__line iteminfo__line2']"
                "//div[@class='iteminfo__line2__compdesc']//span")
            # company size
            item['company_number'] = company[1].xpath("./text()").extract()
            # company type
            item['company_type'] = company[0].xpath("./text()").extract()
            yield item
The cookie in the code must be replaced with your own: log in to the site in a browser, then copy the Cookie value from the developer tools.
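For reference, the copied Cookie header is a single string of "; "-separated key=value pairs; a minimal sketch of converting it into the dict that scrapy.Request expects (the values below are placeholders):

raw_cookie = "name1=value1; name2=value2"  # placeholder: paste your own Cookie header
# split("=", 1) keeps values intact even when they themselves contain "="
cookies = {kv.split("=", 1)[0]: kv.split("=", 1)[1] for kv in raw_cookie.split("; ")}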
items.py in the project directory defines the structure of the parsed content. To use it, you must construct and fill the item yourself in parse, either dict-style (key by key) or via the class constructor; otherwise the definition has no effect. A sketch follows.
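As a sketch, an items.py for this project could declare one Field per key used in parse_zp (the class name RecruitmentItem is an assumption):

import scrapy

class RecruitmentItem(scrapy.Item):
    title = scrapy.Field()
    Education = scrapy.Field()
    job_location = scrapy.Field()
    job_time = scrapy.Field()
    money = scrapy.Field()
    job_info = scrapy.Field()
    Company_name = scrapy.Field()
    company_number = scrapy.Field()
    company_type = scrapy.Field()

In the spider, item = {} would then become item = RecruitmentItem(); the key-by-key assignments stay the same.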
pipelines.py in the project directory processes each item yielded by the parse methods, typically saving it to a database or a file.
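A minimal sketch of such a pipeline, along the lines of the JSON-lines example in the Scrapy docs (the file name items.jl is an arbitrary choice):

import json

class RecruitmentPipeline:
    def open_spider(self, spider):
        self.file = open('items.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # write each item as one JSON line; return it so later pipelines still receive it
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item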
middlewares.py in the project directory hooks into the request/response cycle when fetching pages; it is usually where you add plugins for extra functionality. This project does not use it.
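For illustration only, since this project does not use one: a downloader middleware that rotates the User-Agent per request might look like the sketch below (the class name and agent list are made up); it would be enabled via DOWNLOADER_MIDDLEWARES in settings.py:

import random

class RandomUserAgentMiddleware:
    # hypothetical list; fill in real browser User-Agent strings
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
        return None  # None tells Scrapy to continue handling the request normally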
To configure an individual spider separately, e.g. with its own pipeline, add a custom_settings class attribute to that spider; it overrides the project-wide settings.py for that spider only, as below:
import scrapy

class ZhipinSpider(scrapy.Spider):
    # name is the spider's task name
    name = "boss"
    allowed_domains = ["www.zhipin.com"]
    current_page = 1  # starting page number
    start_urls = [
        "https://www.zhaopin.com/",
    ]
    custom_settings = {
        "ITEM_PIPELINES": {
            'tutorial.pipelines.ZhipinPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'tutorial.middlewares.ZhipinMiddleware': 299,
            # 'tutorial.middlewares.ProxyMiddleware': 301
        },
        "DEFAULT_REQUEST_HEADERS": {
            'Accept': 'application/json',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 9.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Mobile Safari/537.36',
            'Referer': 'https://www.zhipin.com/',
            'X-Requested-With': "XMLHttpRequest",
            "cookie": "lastCity=101020100; JSESSIONID=; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1532401467,1532435274,1532511047,1532534098; __c=1532534098; __g=-; __l=l=%2Fwww.zhipin.com%2F&r=; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101020100-p100103%2F; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1532581213; __a=4090516.1532500938.1532516360.1532534098.11.3.7.11"
        }
    }
To read the configuration values from the project's settings.py in your own code (e.g. from a standalone script), use:
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
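For example, to read values defined above (BOT_NAME is a built-in setting; the prints are just for demonstration):

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('BOT_NAME'))            # the project name from settings.py
print(settings.getbool('ROBOTSTXT_OBEY'))  # False, as configured above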