爬取腾讯招聘信息写入TXT文件

"""
    需求:爬取腾讯招聘数据(职位名称,职位类别,人数,地点, 工作要求,工作职责)
    url = "http://hr.tencent.com/position.php?&start="
"""
import requests
from lxml import etree

# Site root; prepended to the hrefs scraped from the listing page to build
# detail-page URLs.
BASE_DOMAIN = "http://hr.tencent.com/"
# Request headers sent with every requests.get call in this module; the
# User-Agent is a desktop Chrome string.
HEADERS = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
}
# Listing URL for the keyword "python" at offset start=0 (i.e. the first page).
BASE_URL = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start=0"


def parse_detail_page(url, timeout=10):
    """Fetch one position detail page and extract its fields.

    Args:
        url: Absolute URL of the detail page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with, in this order, the keys ``work_name``, ``work_place``,
        ``work_category`` and ``work_lack_number`` (each the first text node
        of the matching table cell) followed by ``work_duty`` and
        ``work_require`` (each a list of every text node found under the
        first and second ``<ul class="squareli">`` respectively).

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        IndexError: If the page does not contain the expected elements.
    """
    # Without a timeout a single stalled connection would hang the crawl.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    html = etree.HTML(response.text)

    summary_row = "//tr[@class='c bottomline']"
    more_info = html.xpath("//ul[@class='squareli']")

    # Key order is significant: callers iterate the dict to write records.
    return {
        'work_name': html.xpath("//tr[@class='h']/td/text()")[0],
        'work_place': html.xpath(summary_row + "/td[1]/text()")[0],
        'work_category': html.xpath(summary_row + "/td[2]/text()")[0],
        'work_lack_number': html.xpath(summary_row + "/td[3]/text()")[0],
        'work_duty': more_info[0].xpath(".//text()"),
        'work_require': more_info[1].xpath(".//text()"),
    }


def get_detail_urls(url, timeout=10):
    """Return the absolute detail-page URLs linked from one listing page.

    Args:
        url: URL of the listing page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of strings, each ``BASE_DOMAIN`` followed by an href taken
        from the listing table.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # Fetch the page that was asked for; previously the argument was ignored
    # and the first listing page was downloaded on every call.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    html = etree.HTML(response.text)
    # NOTE(review): only rows with class 'even' are matched; if the listing
    # also uses another row class (e.g. 'odd') those links are skipped —
    # confirm against the page markup.
    hrefs = html.xpath("//tr[@class='even']//a/@href")
    return [BASE_DOMAIN + href for href in hrefs]


def _format_position(position):
    """Render one position dict as the text record stored in the output file."""
    lines = []
    for key, value in position.items():
        if key in ('work_duty', 'work_require'):
            # These fields hold lists of text nodes; keep the "key :[...]" layout.
            lines.append('{} :{}'.format(key, value))
        else:
            lines.append('{}:{}'.format(key, value))
    # One line per field, then three blank lines separating records.
    return ''.join(line + '\n' for line in lines) + '\n' * 3


def spider(page_count=4, output_path='tecentRecruit.txt'):
    """Crawl the listing pages and append every position to a text file.

    For each listing page the detail URLs are collected, each detail page is
    parsed, the resulting dict is printed, and a text record is appended to
    ``output_path``.

    Args:
        page_count: Number of listing pages to crawl, starting from the
            first. The listing offset advances by 10 per page. (The original
            code carried a "43" note beside this value — presumably the total
            number of pages available at the time; verify before raising it.)
        output_path: File that records are appended to (UTF-8).
    """
    base_url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a"
    # Open the output once for the whole crawl rather than once per record.
    with open(output_path, 'a', encoding='utf-8') as f:
        for page in range(page_count):
            url = base_url.format(page * 10)
            for detail_url in get_detail_urls(url):
                position = parse_detail_page(detail_url)
                print(position)
                f.write(_format_position(position))


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    spider()

猜你喜欢

转载自blog.csdn.net/qq_42029527/article/details/83181219