药物不良反应数据库信息的下载

需求:如图,获取不良反应数据库中每个药品的不良反应相关信息(药品通用名称、适应症和不良反应)。

点击详细信息之后

分析页面请求,发现列表数据是通过ajax(POST)请求加载的,因此爬取分为三步:

  •   第一步,从列表页中获取每个药品详细页面的url(url中包含药品ID)
  •   第二步,拿到详细页面的url,下载页面
  •   第三步,提取页面中的适应症和不良反应,并将数据写入文件

代码

# -*- coding: utf-8 -*-

"""
@Datetime: 2019/1/11
@Author: Zhang Yafei
"""
import json
import numpy
import os

from gevent import monkey
monkey.patch_all()
import gevent
from urllib.parse import urljoin
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor
from lxml.etree import HTML


# Detail-page URLs; the done() callback appends to this list.
url_list = []
# One dict per parsed drug page; parse() appends to this list.
drug_list = []


def task(page):
    """POST the listing query for one result page and return the response.

    :param page: zero-based page index; sent as ``ec_pd`` and, incremented
        by one, as ``ec_p``.
    :return: the ``requests`` response object, unparsed.
    """
    form = {
        'method': 'list',
        'ec_i': 'ec',
        'ec_crd': 200,
        'ec_p': page + 1,
        'ec_rd': 200,
        'ec_pd': page,
    }
    user_agent = ('Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36')
    return requests.post(
        'http://pharm.ncmi.cn/dataContent/dataSearch.do?did=6',
        headers={'User-Agent': user_agent},
        data=form,
    )


def done(future,*args,**kwargs):
    """Future callback: pull detail-page links out of a listing response.

    Reads the response from ``future.result()``, extracts the hrefs in the
    fourth column of the ``ec_table`` table, drops the first match, and
    appends the resulting absolute URLs to the module-level ``url_list``.
    Extra positional and keyword arguments are accepted and ignored.
    """
    tree = HTML(future.result().text)
    links = tree.xpath('//table[@id="ec_table"]//tr/td[4]/a/@href')
    url_list.extend(
        urljoin('http://pharm.ncmi.cn', 'dataContent/' + link)
        for link in links[1:]
    )


def main():
    """Fetch the first listing page, then download and parse each drug on it.

    The original version handed the detail-page URLs straight to ``parse``,
    which opens its argument as a local file, so every call failed.  Each
    page is now saved with ``task1`` to ``html/<index>.html`` and that file
    is parsed.

    NOTE: ``task1`` does not overwrite an existing file, so a stale
    ``html/<index>.html`` left by an earlier run is parsed as-is.
    """
    origin_url = 'http://pharm.ncmi.cn/dataContent/dataSearch.do?did=6'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    data = {
        'method': 'list',
        'ec_i': 'ec',
        'ec_crd': 200,
        'ec_p': 1,
        'ec_rd': 200,
        'ec_pd': 0,
    }
    response = requests.post(origin_url, headers=headers, data=data)
    tree = HTML(response.text)
    # The first matched href is skipped (presumably not a data row -- confirm
    # against the live page).
    hrefs = tree.xpath('//table[@id="ec_table"]//tr/td[4]/a/@href')[1:]
    # Resulting URLs look like:
    # http://pharm.ncmi.cn/dataContent/dataSearch.do?method=viewpage&id=145511&did=6
    # A local name is used so the module-level url_list is not shadowed.
    detail_urls = [
        urljoin('http://pharm.ncmi.cn', 'dataContent/' + href)
        for href in hrefs
    ]
    # task1 writes into html/, so make sure the directory exists first.
    os.makedirs('html', exist_ok=True)
    for i, url in enumerate(detail_urls):
        task1(i, url)
        parse('html/{}.html'.format(i))


def parse(file):
    """Extract one drug record from a saved detail page.

    Reads ``file`` as UTF-8 HTML, pulls the drug name (row 3), the adverse
    reaction (row 9) and the indication (second-to-last row) from the
    nested table inside the first form table, and appends a dict with
    those three fields to the module-level ``drug_list``.

    :param file: path to a locally saved detail page.
    :raises IndexError: if the drug-name or adverse-reaction cell has no
        text node.
    """
    with open(file=file, encoding='utf-8') as f:
        page = f.read()
    tree = HTML(text=page)
    drug_name = tree.xpath('//form/table[1]//table/tr[3]/td[2]/text()')[0].strip()
    adverse_reaction = tree.xpath('//form/table[1]//table/tr[9]/td[2]/text()')[0].strip()
    # A completely empty cell yields no text node at all; treat that the
    # same as a whitespace-only cell instead of raising IndexError.
    indication_nodes = tree.xpath('//form/table[1]//table/tr[last()-1]/td[2]/text()')
    indiction = indication_nodes[0].strip() if indication_nodes else ''
    if not indiction:
        # numpy.NAN was removed in NumPy 2.0; numpy.nan works everywhere.
        indiction = numpy.nan
    drug_dict = {
        '药品通用名称': drug_name,
        '不良反应': adverse_reaction,
        '适应症': indiction,
    }
    drug_list.append(drug_dict)
    print(file+'提取成功')


def task1(i, url):
    """Download ``url`` and save it as ``html/<i>.html`` unless it exists.

    The existence check now happens before the HTTP request, so pages that
    are already on disk cost no network round trip, and the ``html``
    directory is created on demand.

    :param i: index used to build the output file name.
    :param url: detail-page URL to fetch.
    """
    filename = 'html/{}.html'.format(i)
    if os.path.exists(filename):
        return
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    response = requests.get(url)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    # Steps 1-3 are kept commented out: they are one-off stages that
    # produce url.py and the html/ directory consumed by step 4.

    # 1. Collect every detail-page URL.
    # pool = ThreadPoolExecutor()
    # for page in range(37):
    #     v = pool.submit(task, page)
    #     v.add_done_callback(done)
    #
    # pool.shutdown(wait=True)
    # 2. Write the URLs to a file.
    # with open('url.py','w') as f:
    #     json.dump(url_list, f)

    # 3. Read the URLs back and download the pages.
    # with open('url.py') as f:
    #     url_list = json.load(f)
    # pool = ThreadPoolExecutor()
    # for i, url in enumerate(url_list):
    #     v = pool.submit(task1, i, url)
    #
    # pool.shutdown(wait=True)

    # 4. Parse the downloaded pages and write the results to Excel.
    # Start from an empty list so a missing html/ directory gives an empty
    # result instead of a NameError, and accumulate across directories
    # rather than keeping only the last one walked.
    file_list = []
    for base_path, folders, files in os.walk('html'):
        file_list.extend(os.path.join(base_path, x) for x in files)

    # Leaving the with-block waits for all submitted jobs to finish.
    with ThreadPoolExecutor() as pool:
        jobs = [(file, pool.submit(parse, file)) for file in file_list]

    # Report pages that failed to parse instead of dropping them silently.
    for file, job in jobs:
        error = job.exception()
        if error is not None:
            print(file + '提取失败: ' + repr(error))

    # Passing columns= fixes the column order and also works when no page
    # was parsed (df.loc with missing labels would raise KeyError).
    df = pd.DataFrame(data=drug_list, columns=['药品通用名称', '适应症', '不良反应'])
    # ExcelWriter.save() was removed in pandas 2.0; the context manager
    # writes and closes the file.  sheet_name is passed by keyword.
    with pd.ExcelWriter('adverse_reaction_database.xlsx') as writer:
        df.to_excel(writer, sheet_name='adverse_reaction', index=False)

  

猜你喜欢

转载自www.cnblogs.com/zhangyafei/p/10266642.html
今日推荐