Scraping Tencent's job postings with a Scrapy CrawlSpider

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from TencentSpider.items import TencentspiderItem,TencentDetailItem

class TencentSpider(CrawlSpider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']  # restricts the crawl to this domain; without it,
    # links on other sites that happen to match the rules below would be followed and crawled too
    start_urls = ['https://hr.tencent.com/position.php?&start=0']
    """
        LinkExtractor(allow='start=\d+')返回的是一个列表,Rule依次发送请求,并且继续跟进,调用指定函数去处理
    """
    rules = [
        # Uncomment to paginate the list pages and parse each row with parse_tencent:
        # Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_tencent', follow=True),
        Rule(LinkExtractor(allow=r'position_detail\.php'), callback='parse_info', follow=True)
    ]


    def parse_tencent(self, response):
        link_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")  # an XPath union selects rows of both classes
        for each in link_list:
            item = TencentspiderItem()
            item['position_name'] = each.xpath("./td[1]/a/text()").extract_first()
            item['position_link'] = each.xpath("./td[1]/a/@href").extract_first()
            # extract_first() returns None instead of raising IndexError when the cell is empty,
            # so the fallback below can actually fire
            position_type = each.xpath("./td[2]/text()").extract_first()
            if not position_type:
                position_type = "empty"
            item['position_type'] = position_type
            item['position_need'] = each.xpath("./td[3]/text()").extract_first()
            item['position_place'] = each.xpath("./td[4]/text()").extract_first()
            item['position_time'] = each.xpath("./td[5]/text()").extract_first()
            yield item


    """这个可以将 本页面中的链接都取出来进去将数据爬下来"""
    def parse_info(self,response):
        """可以将"""
        item = TencentDetailItem()
        # select the text node, not the whole element, so we store just the job title
        item['position_name'] = response.xpath('//*[@id="sharetitle"]/text()').extract_first()

        yield item
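
The two item classes imported at the top live in TencentSpider/items.py. The original post does not show that file; here is a minimal sketch, assuming only the field names that the callbacks above actually assign:

import scrapy

class TencentspiderItem(scrapy.Item):
    # one row of the position list page
    position_name = scrapy.Field()
    position_link = scrapy.Field()
    position_type = scrapy.Field()
    position_need = scrapy.Field()
    position_place = scrapy.Field()
    position_time = scrapy.Field()

class TencentDetailItem(scrapy.Item):
    # data scraped from a position_detail.php detail page
    position_name = scrapy.Field()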


"""其他设置和其他文章没有太大的区别"""

Reposted from blog.csdn.net/chasejava/article/details/79520729