Scraping Tencent job postings with Scrapy

Create the project:
scrapy startproject tencent
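
With a default Scrapy install, this command generates roughly the following layout (the exact files can vary slightly by version):

tencent/
    scrapy.cfg            # deployment configuration
    tencent/
        __init__.py
        items.py          # item definitions
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py   # spiders live in this package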

Write items.py: define a TencentItem class with one field per column we want to capture.

import scrapy

class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # position name
    positionname = scrapy.Field()
    # link to the detail page
    positionlink = scrapy.Field()
    # position category
    positionType = scrapy.Field()
    # number of openings
    peopleNum = scrapy.Field()
    # work location
    workLocation = scrapy.Field()
    # publish date
    publishTime = scrapy.Field()

Create a spider based on the basic Spider class:

scrapy genspider tencentPosition "tencent.com"
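
Depending on the Scrapy version, genspider emits a skeleton roughly like the one below; the class name is derived from the spider name given on the command line. The finished spider that follows renames the spider to "tencent", which is the name scrapy crawl will use.

# -*- coding: utf-8 -*-
import scrapy

class TencentpositionSpider(scrapy.Spider):
    name = 'tencentPosition'
    allowed_domains = ['tencent.com']
    start_urls = ['http://tencent.com/']

    def parse(self, response):
        pass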

The completed tencentPosition.py:

# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem

class TencentpositionSpider(scrapy.Spider):
    name = "tencent"
    allowed_domains = ["tencent.com"]

    url = "http://hr.tencent.com/position.php?&start="
    offset = 0

    start_urls = [url + str(offset)]

    def parse(self, response):
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            # initialize an item object
            item = TencentItem()

            # position name
            item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
            # link to the detail page
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
            # position category
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            # number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            # publish date
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            yield item

        # After processing a page, increment self.offset by 10, build the URL of
        # the next page, and send a new request whose response is again handled
        # by self.parse; the offset check stops the crawl at the last page.
        if self.offset < 1680:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
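
Note that extract()[0] raises IndexError whenever a row lacks the expected cell. A more defensive variant of a field lookup (a sketch using extract_first(), available since Scrapy 1.0) looks like this:

# returns the first match, or the default instead of raising IndexError
item['positionname'] = each.xpath("./td[1]/a/text()").extract_first(default="")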

The item pipeline, pipelines.py:

import json

class TencentPipeline(object):
    def __init__(self):
        # open the output file once, when the pipeline is created
        self.filename = open("tencent.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # write each item as one JSON object per line
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        self.filename.close()
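
For a simple dump like this, Scrapy's built-in feed exports can stand in for a hand-written pipeline; the -o flag writes all scraped items to a file in one go:

scrapy crawl tencent -o tencent.json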

Register the pipeline in the settings file:

ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}


Add default request headers via

DEFAULT_REQUEST_HEADERS


settings.py
BOT_NAME = 'tencent'

SPIDER_MODULES = ['tencent.spiders']
NEWSPIDER_MODULE = 'tencent.spiders'

ROBOTSTXT_OBEY = True

DOWNLOAD_DELAY = 2

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}

ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}
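
Run the spider from the project root; scraped items accumulate in tencent.json:

scrapy crawl tencent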


Reposted from www.cnblogs.com/wanglinjie/p/9210850.html