Tencent recruitment information crawler

# -*- coding: utf-8 -*-

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bs4 import BeautifulSoup
# import json


class TtSpider(CrawlSpider):
    name = 'tt'
    allowed_domains = ['tencent.com']
    start_urls = ['https://hr.tencent.com/position.php']
    count = 1

    rules = (
        Rule(LinkExtractor(allow=r'/position_detail\.php\?id=\d+1&keywords=&tid=0&lid=0'),
             callback='parse_item', ),  # if the allow pattern is wrong, the responses below come back empty
        # None of the "copy" options in the Firefox or Chrome inspector context menu yields
        # a correct pattern here (other browsers untested)
        # Even if the allow pattern leaves out the site address https://hr.tencent.com/,
        # the link is completed automatically before matching
        # Rule(LinkExtractor(allow=r'position_detail\.php\?id=\d+&keywords=&tid=0&lid=0'),\
        #      callback='parse_item', ),
        Rule(LinkExtractor(allow=r'/position\.php\?&start=\d+#a'),
             follow=True, ),
    )
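    # Illustration (URLs shaped after the patterns above, not taken from a live page):
    # the first rule matches detail pages such as position_detail.php?id=...1&keywords=&tid=0&lid=0
    # and hands them to parse_item; the second matches pagination links such as
    # position.php?&start=10#a and only follows them to discover more detail links.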

    def parse_item(self, response):
        item = {}
        # parse the detail page; the "xml" feature needs lxml installed
        content = BeautifulSoup(response.body, "xml")
        # job title
        temp_data = content.find(attrs={"id": "sharetitle"})
        if temp_data is not None:
            item["标题"] = temp_data.string
        # each label (e.g. "工作地点:") is a text node; the value sits in the
        # next sibling of the tag that contains the label
        temp_data = content.find(text="工作地点:")
        item["工作地点"] = temp_data.parent.next_sibling
        temp_data = content.find(text="职位类别:")
        item["职位类别"] = temp_data.parent.next_sibling
        temp_data = content.find(text="招聘人数:")
        item["招聘人数"] = temp_data.parent.next_sibling
        item["工作职责"] = []
        temp_data = content.find(attrs={"class": "squareli"})
        for string_data in temp_data:
            if string_data.string is not None:
                # print(type(string_data.string))
                item["工作职责"].extend(string_data.string)
                item["工作职责"].extend("\n")

        item["工作要求"] = []
        temp_data = content.find_all(attrs={"class": "squareli"})[1]
        for string_data in temp_data:
            if string_data.string is not None:
                # print(string_data.string)
                item["工作要求"].extend(string_data.string)
                item["工作要求"].extend("\n")
        with open("./腾讯招聘爬虫.txt", "a", encoding="utf8") as file:
            file.write("*" * 10 + "第" + str(self.count) + "个招聘信息" + "*" * 10 + "\n")
            file.write("标题:" + "".join(item["标题"]) + '\n')
            file.write("工作地点:" + "".join(item["工作地点"]) + '\n')
            file.write("职位类别:" + "".join(item["职位类别"]) + '\n')
            file.write("招聘人数:" + "".join(item["招聘人数"]) + '\n')
            file.write("工作职责:" + "\n" + "".join(item["工作职责"]))
            file.write("工作要求:" + "\n" + "".join(item["工作要求"]) + "\n" * 3)
            self.count += 1
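
To try the spider, one way (a usage sketch, assuming the file sits in the spiders/ directory of an otherwise standard Scrapy project) is to run it by name from the project root:

    scrapy crawl tt

Each posting is then appended to 腾讯招聘爬虫.txt in the working directory as one numbered block, in exactly the order of the write() calls above, roughly:

    **********第1个招聘信息**********
    标题:<job title>
    工作地点:<location>
    职位类别:<category>
    招聘人数:<headcount>
    工作职责:
    <one responsibility per line>
    工作要求:
    <one requirement per line>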

Note: the program dies while scraping the 283rd posting, and I do not know how to debug it. If anyone knows why, please advise; thanks in advance.
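
One unverified guess about that crash: if a detail page is missing one of the labels, or marks it up slightly differently, content.find(text=...) returns None and the .parent access right after it raises AttributeError, which kills that request. A minimal defensive sketch along those lines, using a hypothetical helper name get_field that is not part of the code above:

    def get_field(content, label):
        # Hypothetical helper: return the text that follows a label node such as "工作地点:",
        # or an empty string when the label or its value is missing, so one malformed
        # detail page cannot abort the whole crawl.
        node = content.find(text=label)
        if node is None or node.parent is None:
            return ""
        value = node.parent.next_sibling
        if value is None:
            return ""
        # a NavigableString is already a str; for a Tag, fall back to its text content
        return value if isinstance(value, str) else value.get_text()

parse_item could then build the item with calls like get_field(content, "工作地点:") and skip writing a record whenever the title lookup comes back empty.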

Reposted from blog.csdn.net/qq_15054345/article/details/86942479