一、爬取网页分析
爬取网站URL :http://scxk.nmpa.gov.cn:81/xk/
1、可以看出,页面采用表格进行数据呈现,在设计爬虫时可以想到,以每一家公司为单位,通过设置数组来存储数据信息。
2、打开浏览器抓包工具,对页面数据进行分析,可以看出页面采用ajax请求,返回json数据。
二、爬取思路
1、首先通过爬取该界面的信息,获取每一家公司的 id 信息。
2、获取到 id 号后,再对每家公司的详细信息逐条进行爬取。
三、源代码
import requests
import xlwt
def main():
    """Scrape NMPA cosmetics production-licence records and save them to .xls.

    Posts to the list endpoint (pages 1-5) to collect company IDs, then posts
    each ID to the detail endpoint, gathering one row of fields per company
    into ``datalist``, which is handed to ``saveData``.
    """
    baseurl = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    # Detail-record JSON keys, in spreadsheet column order.
    # NOTE(review): 15 keys are collected but saveData writes only 14
    # columns, so the last field ('xkDateStr') never reaches the sheet —
    # kept here to preserve the original datalist contents.
    fields = ('epsName', 'productSn', 'certStr', 'epsAddress',
              'epsProductAddress', 'businessLicenseNumber', 'legalPerson',
              'businessPerson', 'qualityPerson', 'qfManagerName', 'xkName',
              'rcManagerDepartName', 'rcManagerUser', 'xkDate', 'xkDateStr')
    datalist = []
    for page in range(1, 6):
        # Form payload for the paginated company-list endpoint.
        list_params = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': ''
        }
        page_json = requests.post(url=baseurl, data=list_params, headers=headers).json()
        for dic in page_json['list']:
            # Query the detail endpoint with the company ID from the list
            # page. (The original shadowed the builtin `id` and reused the
            # `data`/`response` names from the outer request here.)
            detail = requests.post(url=url, data={'id': str(dic['ID'])},
                                   headers=headers).json()
            datalist.append([detail[key] for key in fields])
    print(datalist)
    savepath = "NMPA化妆品生产许可信息数据.xls"
    saveData(datalist, savepath)
def saveData(datalist, savepath):
    """Write the scraped records to an .xls workbook.

    :param datalist: list of rows, each a list with at least 14 field values
                     in the same order as the column headers below.
    :param savepath: destination path of the Excel file.
    """
    print("开始保存...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('NMPA化妆品生产许可信息数据', cell_overwrite_ok=True)
    # Column headers; fixed the typo "日常见度管理人员" -> "日常监督管理人员"
    # (this column holds the rcManagerUser field).
    col = ("企业名称", "许可证编号", "许可项目", "企业住所", "生产地址",
           "社会信用代码", "法定代表人", "企业负责人", "质量负责人", "发证机关",
           "签发人", "日常监督管理机构", "日常监督管理人员", "有效期至")
    for i, header in enumerate(col):
        sheet.write(0, i, header)
    # Iterate over the rows actually scraped instead of a hard-coded 75,
    # which raised IndexError whenever fewer records came back.
    for i, row in enumerate(datalist):
        print("第%d条" % (i + 1))
        for j in range(len(col)):
            sheet.write(i + 1, j, row[j])
    book.save(savepath)
    print("保存完成")
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()
四、爬取结果