# 这里只写用到的工具和解题思路:授人以鱼不如授人以渔!
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
import time
import db
# Search endpoint; the (URL-encoded) company name is appended as the ``key`` value.
base_url = 'https://www.qcc.com/web/search?key='

# Browser-like request headers sent with every search request.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # 'br' is intentionally not advertised: requests only decodes Brotli
    # bodies when an optional Brotli library is installed, so offering it
    # can produce an undecodable response. gzip/deflate are always handled.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    # Placeholder value: replace with the cookie of your own logged-in session.
    'Cookie': '自己的cookie',
    'Host': 'www.qcc.com',
    'Referer': 'https://www.qcc.com/',
    'TE': 'Trailers',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
}

# Module-level holder for the fields of the most recently parsed company.
companyInfo = {}

# Turn on while developing: enables reading/writing the cache.txt response cache.
debug = False
# Helper that creates a file (and its parent folders) on demand.
def create_file(filename):
    """Create an empty UTF-8 file at ``filename``, creating parent folders first.

    An existing file is left untouched (it is not truncated).

    :param filename: path of the file to create; may be a bare file name
        without any directory component.
    :return: None
    """
    # os.path.dirname yields '' for a bare file name. The previous
    # ``filename[0:filename.rfind("/")]`` slice turned 'cache.txt' into
    # 'cache.tx' (rfind returns -1) and created a stray directory of that name.
    path = os.path.dirname(filename)
    if path and not os.path.isdir(path):  # create missing parent folders
        os.makedirs(path)
    if not os.path.isfile(filename):  # create the file only when absent
        with open(filename, mode="w", encoding="utf-8"):
            pass
def fileWrite(content):
    """Overwrite ``cache.txt`` in the current working directory with ``content``.

    :param content: text to store; it is written as UTF-8.
    :return: None
    """
    # Mode 'w' creates the file when it is missing, so no separate creation
    # step is required. The ``with`` block closes the handle even if the
    # write raises.
    with open('cache.txt', 'w', encoding='utf-8') as f:
        f.write(content)
def fileOpen(fileName='cache.txt'):
    """Return the UTF-8 text stored in ``fileName``.

    :param fileName: path of the file to read; defaults to ``cache.txt``.
    :return: the file's full text, or ``False`` when the path does not exist.
    """
    # Check the path that was actually requested; the earlier version always
    # tested the hard-coded 'cache.txt' regardless of ``fileName``.
    if not os.path.exists(fileName):
        return False
    # ``with`` guarantees the handle is closed once the text has been read.
    with open(fileName, 'r', encoding='utf-8') as f:
        return f.read()
def getCompanyData(companyName=''):
    """Search for ``companyName`` and return the summary of the top result.

    In debug mode a non-empty ``cache.txt`` is parsed instead of issuing a
    request, and fresh responses are written back to that cache.

    :param companyName: company name to search for.
    :return: a new dict with the keys ``title``, ``detailUrl``, ``status``,
        ``faren``, ``ziben``, ``tel``, ``email`` and ``www`` on success;
        the string ``'未找到相关信息'`` when the page contains no result rows;
        ``None`` when the request or the parsing fails (the error is printed).
    """
    # Local import so this function brings its own dependency into scope.
    from urllib.parse import quote

    proxies = {}  # empty mapping: no proxy overrides are passed to requests

    # --- Step 1: obtain the search-result HTML ---------------------------
    try:
        text = fileOpen()
        if text and debug:
            print('缓存读取成功')
        else:
            # Percent-encode the name so characters such as '&' or '#'
            # cannot alter the query string.
            url = base_url + quote(companyName, safe='')
            # A timeout keeps the interactive loop from hanging forever on
            # an unresponsive server.
            response = requests.get(
                url, headers=headers, proxies=proxies, timeout=10)
            # Decode every response as UTF-8 (previously this was applied
            # only on the error path).
            response.encoding = 'utf-8'
            if response.status_code != 200:
                print(response.status_code)
                print('ERROR')
                # Do not cache or parse an error page.
                return None
            if debug:
                fileWrite(response.text)  # store the response in the cache file
            text = response.text
        xmlContent = BeautifulSoup(text, 'lxml')
    except Exception as err:
        print(err)
        # Without parsed content there is nothing to extract; previously
        # execution fell through and hit a NameError on ``xmlContent``.
        return None

    # --- Step 2: extract the fields of the first (top-ranked) row --------
    try:
        rows = xmlContent.find_all('tr')
        if not rows:
            # Indexing an empty result list used to raise IndexError, which
            # made this "not found" answer unreachable.
            return '未找到相关信息'
        firstList = rows[0]  # top-ranked entry of the search list
        companyDesc = firstList.find_all('td')[2]  # cell with the company summary
        titleLink = companyDesc.find('a', class_='title')
        vals = companyDesc.find_all('span', class_='val')
        mailLinks = companyDesc.find_all('a', attrs={"title": "发送邮件"})
        siteLinks = companyDesc.find_all('a', attrs={"title": "进入官网"})

        # A fresh dict per call, so results returned earlier are not
        # overwritten by later lookups.
        info = {
            'title': titleLink.text,  # company title
            'detailUrl': titleLink.attrs['href'],  # link to the detail page
            'status': companyDesc.find('span', class_='text-success').text,  # status text
            'faren': vals[0].text,  # legal representative
            'ziben': vals[1].text,  # registered capital
            'tel': vals[3].text,  # phone number
            'email': mailLinks[0].text,  # e-mail address
            'www': siteLinks[0].attrs['href'] if siteLinks else '',  # website, if listed
        }
        # Keep the module-level dict in sync for code that reads it directly.
        companyInfo.update(info)
        db.insert(info)  # persist to the database
        return info
    except Exception as err:
        # Any missing element (unexpected page layout) ends up here.
        print(err)
        return None
if __name__ == '__main__':
    # Interactive lookup loop: keep prompting until an empty line is entered.
    while True:
        name = input('请输入你想查询的公司名称:')
        if not name:
            break
        try:
            result = getCompanyData(name)
            print(result)
        except Exception as err:
            print(err)
        # Pause for a second between consecutive lookups.
        time.sleep(1)
# 复制完请顺手一键三联。