Python 3 crawler: fetching Lagou job listings with requests

import requests, json, time, tablib


def send_ajax_request(data: dict):
    # POST one page of search criteria; return the parsed JSON, or {} on
    # any failure so callers can treat the result uniformly.
    try:
        ajax_response = session.post(url=ajax_url,
                                     params={"needAddtionalResult": "false", "city": city},
                                     data=data,
                                     headers=ajax_headers,
                                     timeout=timeout)
        if ajax_response.status_code == 200:
            return ajax_response.json()
        return {}
    except (requests.RequestException, ValueError):  # network error or non-JSON body
        return {}
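
# For reference, each call above amounts to roughly this exchange (values
# taken from the settings in __main__ and the form data built in run()):
#
#   POST https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&city=成都
#   headers: Origin/Referer pointing at the search page, plus session cookies
#   form data: first=false, pn=<page number>, kd=python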


def get_job_info(info_dic: dict):
    # Default to empty containers so a failed request (send_ajax_request
    # returns {}) doesn't raise AttributeError on the chained .get() calls.
    job_list = info_dic.get("content", {}).get("positionResult", {}).get("result", [])

    for jobInfoDict in job_list:
        # Keep this key order in sync with dataset.headers below so that
        # dic.values() lines up with the Excel columns.
        fields = ("companyId", "companyFullName", "positionName", "workYear",
                  "education", "salary", "jobNature", "companySize", "city",
                  "district", "createTime")
        dic = {field: jobInfoDict.get(field) for field in fields}
        if is_save_txtfile:
            yield json.dumps(dic, ensure_ascii=False)
        else:
            yield dic.values()
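
# For reference, the response shape get_job_info() assumes (inferred from the
# keys it reads; the live payload carries many more fields):
#
#   {"content": {"positionResult": {"result": [
#       {"companyId": ..., "companyFullName": ..., "positionName": ...,
#        "workYear": ..., "education": ..., "salary": ..., ...}]}}}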


def save_to_file(json_data):
    # Append one JSON line per job to the text file opened in __main__.
    for data in json_data:
        f.write(data + "\n")


def save_to_excel(list_data):
    # Accumulate rows in the tablib Dataset; it is written out once at the end.
    for line in list_data:
        dataset.append(line)


def run():
    # Walk the first 30 result pages; pn is the page number, kd the keyword.
    for i in range(1, 31):
        data = {
            "first": "false",
            "pn": i,
            "kd": job_name  # reuse the keyword defined in __main__
        }
        info_dic = send_ajax_request(data)
        data = get_job_info(info_dic)
        if is_save_txtfile:
            save_to_file(data)
        else:
            save_to_excel(data)
        print("正在保存数据")
        time.sleep(sleeptime)


if __name__ == '__main__':
    session = requests.Session()
    job_name = "python"
    city = "成都"
    timeout = 5
    sleeptime = 10
    doc_url = "https://www.lagou.com/jobs/list_{job_name}".format(job_name=job_name)
    session.headers["User-Agent"] = ("Mozilla/5.0 (Windows NT 10.0; WOW64) "
                                     "AppleWebKit/537.36 (KHTML, like Gecko) "
                                     "Chrome/72.0.3626.121 Safari/537.36")
    session.headers["Host"] = "www.lagou.com"

    # Hit the HTML search page first: the session collects the cookies that
    # Lagou checks before it answers positionAjax.json, and the final URL
    # doubles as the Referer for the Ajax call below.
    doc_response = session.get(url=doc_url, params={"city": city})

    ajax_headers = {
        "Origin": "https://www.lagou.com",
        "Referer": doc_response.url
    }

    ajax_url = "https://www.lagou.com/jobs/positionAjax.json?=false"

    is_save_txtfile = False  # True: JSON lines in jobinfo.txt; False: Excel via tablib

    if not is_save_txtfile:
        dataset = tablib.Dataset()
        dataset.headers = ["companyId", "companyFullName", "positionName", "workYear",
                           "education", "salary", "jobNature", "companySize", "city",
                           "district", "createTime"]

    f = open("jobinfo.txt", "a", encoding="utf-8")
    try:
        run()
    except Exception as e:
        print("Something went wrong:", e)
    finally:
        if is_save_txtfile:
            f.close()
        else:
            with open("jobInfo.xls", "wb") as f:
                f.write(dataset.xls)
                f.flush()
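
A note on the Excel step: tablib's dataset.xls attribute only works when the
xls format backend is installed (pip install "tablib[xls]", which pulls in
xlwt). The same export can be written more explicitly through export(); a
minimal equivalent sketch:

    with open("jobInfo.xls", "wb") as fp:
        fp.write(dataset.export("xls"))  # same bytes as dataset.xls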

Reposted from www.cnblogs.com/zhuchunyu/p/10765945.html