需求
抓取需求
1、 按地区抓取
2、 抓取职位名称,薪酬,学历要求,工作年限要求,发布时间,公司名称,所属行业
代理
代码注释很详细,这里不再逐行解释;没有可用代理时请慎用本爬虫,否则 IP 容易被封。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-16 11:04:59
# Project: hunting_recruit
from pyspider.libs.base_handler import *
import re
import datetime
from pymongo import MongoClient
# Connect to the local MongoDB instance used to persist scraped job records.
DB_NAME = 'research'
DB_COL = 'hunting_recruit'
# Fix: `client` was referenced below but never created. MongoClient() with no
# args connects to localhost:27017 — presumably the intended "offline" DB;
# TODO(review): confirm host/port against the deployment environment.
client = MongoClient()
db = client[DB_NAME]
col = db[DB_COL]
class Handler(BaseHandler):
    """pyspider crawler for liepin.com job listings.

    Crawl path: industry filter -> sub-industry -> city -> district ->
    paginated result pages. For each listing it extracts the job title,
    salary, education requirement, experience requirement, publish time,
    company name, company industry, feedback time and welfare tags, then
    upserts the record into MongoDB via ``on_result``.
    """

    crawl_config = {
        "headers": {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
        },
        # Crawling liepin.com without a working proxy is risky (see the note
        # at the top of the file) — this assumes a local forward proxy.
        "proxy": "http://localhost:6666"
    }

    url = 'https://www.liepin.com/zhaopin'

    def format_date(self, date):
        """Parse a 'YYYYMMDD' digit string into a naive datetime."""
        return datetime.datetime.strptime(date, '%Y%m%d')

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: re-crawl the listing root once per day."""
        self.crawl(self.url, callback=self.index_page)

    @config(age=60)
    def index_page(self, response):
        """Walk the industry filter and schedule one crawl per sub-industry."""
        page = response.etree
        base_url = 'https://www.liepin.com'
        # Top-level industry list.
        industry_list = page.xpath("//dd[@data-param='industries']/ul/li")
        for each in industry_list:
            title = each.xpath("./span/text()")[0]
            print('-------', title, '--------')
            # Sub-industries under this industry.
            sub_list = each.xpath("./div[@class='sub-industry']/a")
            for sub in sub_list:
                belonging = sub.xpath("./text()")[0]
                print(belonging)
                link_url = base_url + sub.xpath("./@href")[0]
                # Carry the sub-industry name through the crawl chain.
                save = {"belonging": belonging}
                self.crawl(link_url, callback=self.parse_city, save=save)

    @config(age=60)
    def parse_city(self, response):
        """Schedule one crawl per city within the current sub-industry."""
        page = response.etree
        base_url = 'https://www.liepin.com'
        # City list; slice off the first ("nationwide") and last ("other") entries.
        city_list = page.xpath("//dd[@data-param='city']/a")[1:-1]
        for each in city_list:
            city = each.xpath("./text()")[0]
            print(city)
            link_url = base_url + each.xpath("./@href")[0]
            save = {"belonging": response.save["belonging"], "city": city}
            self.crawl(link_url, callback=self.parse_district, save=save)

    @config(age=60)
    def parse_district(self, response):
        """Schedule one crawl per district within the current city."""
        page = response.etree
        base_url = 'https://www.liepin.com'
        # District list for the current city.
        district_list = page.xpath("//dd[@data-param='dqs']/a")
        for each in district_list:
            district = each.xpath("./text()")[0]
            print(district)
            link_url = base_url + each.xpath("./@href")[0]
            save = {"belonging": response.save["belonging"],
                    "city": response.save["city"],
                    "district": district}
            self.crawl(link_url, callback=self.parse_detail, save=save)

    @config(age=60)
    def parse_detail(self, response):
        """Discover the page count from the 'last page' link and fan out."""
        page = response.etree
        # Fix: a single-page result set has no '末页' (last page) link; the
        # original unconditional [0] raised IndexError and dropped the page.
        tail_links = page.xpath(u"//a[@title='末页']/@href")
        if tail_links:
            tail_url = tail_links[0]
            print(tail_url)
            page_num = int(re.findall(r'&curPage=(\d+)', tail_url)[0])
        else:
            page_num = 0
        print(page_num)
        # curPage appears to be 0-based and the tail link holds the last page
        # index, so iterate 0..page_num inclusive (the original range(page_num)
        # skipped the final page). TODO(review): confirm against the site.
        for each in range(page_num + 1):
            page_url = response.url + '&curPage={}'.format(each)
            self.crawl(page_url, callback=self.parse_page, save=response.save)

    def parse_page(self, response):
        """Extract one result record per listing on a result page."""
        page = response.etree
        # Listing items on this result page.
        contents = page.xpath("//ul[@class='sojob-list']/li")
        for each in contents:
            try:
                # Job title.
                position_name = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/h3/a/text()")[0].strip()
                print(position_name)
                # Salary.
                salary = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='condition clearfix']/span[@class='text-warning']/text()")[0]
                print(salary)
                # Education requirement.
                education = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='condition clearfix']/span[@class='edu']/text()")[0]
                print(education)
                # Work-experience requirement.
                experience = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='condition clearfix']/span[last()]/text()")[0]
                print(experience)
                # Publish time: keep only the digits (-> 'YYYYMMDD').
                public_time = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='time-info clearfix']/time/@title")[0]
                public_time = ''.join(re.findall(r'\d+', public_time))
                print(public_time)
                # Company name.
                company = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='company-info nohover']/p[@class='company-name']/a/@title")[0]
                print(company)
                # Industry the company belongs to.
                company_belong = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='company-info nohover']/p[@class='field-financing']/span/a/text()")[0]
                print(company_belong)
                # Feedback/response time shown on the listing.
                time_delay = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='time-info clearfix']/span/text()")[0]
                print(time_delay)
                # Welfare tags, joined with '-'.
                welfare = '-'.join(each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='company-info nohover']/p[@class='temptation clearfix']/span/text()"))
                print(welfare)
                print('------------------------------')
                result = {"belonging": response.save["belonging"],
                          "city": response.save["city"],
                          "district": response.save["district"],
                          "position_name": position_name,
                          "salary": salary,
                          "education": education,
                          "experience": experience,
                          "public_time": self.format_date(public_time),
                          "company": company,
                          "update_time": datetime.datetime.now(),
                          "company_belong": company_belong,
                          "time_delay": time_delay,
                          "welfare": welfare}
                yield result
            except (IndexError, ValueError):
                # Best-effort: a listing missing an expected field (IndexError
                # from [0]) or with an unparseable date (ValueError from
                # strptime) is skipped rather than aborting the whole page.
                # Narrowed from a bare `except: pass` that hid all errors.
                continue

    def on_result(self, result):
        """Upsert a scraped record into MongoDB, deduplicated on key fields."""
        if result is None:
            return
        # Natural key: same position at the same company/location/date is one row.
        update_key = {
            'position_name': result['position_name'],
            'public_time': result['public_time'],
            'city': result['city'],
            'district': result['district'],
            'company': result['company'],
            'belonging': result['belonging']
        }
        # `Collection.update` was deprecated and removed in pymongo 4.x;
        # update_one with upsert has the same effect for a single document.
        col.update_one(update_key, {'$set': result}, upsert=True)