爬取某免费招标网站,不涉及版权。直接上代码:
import requests
import json
# from pprint import pprint
import time
# from lxml import etree
import re
import sqlite3
import datetime
class sql_con:
    """Thin wrapper around one SQLite connection/cursor for the 招标信息 table.

    Every helper follows the same pattern: store the statement text in
    ``self.sqlcode``, execute it through :meth:`dosql`, then call :meth:`run`
    to fetch any rows and commit.

    Values are always passed as bound ``?`` parameters, never interpolated
    into the SQL text, so scraped strings containing quotes cannot break or
    alter the statement.
    """

    def __init__(self, sqlpath):
        """Open (or create) the SQLite database at *sqlpath*."""
        self.conn = sqlite3.connect(sqlpath)
        self.cur = self.conn.cursor()

    def dosql(self, cmd, params=()):
        """Execute *cmd* on the shared cursor.

        Args:
            cmd: SQL text, optionally containing ``?`` placeholders.
            params: Sequence of values bound to the placeholders. Defaults to
                no parameters, so existing ``dosql(cmd)`` calls keep working.
        """
        self.cur.execute(cmd, params)

    def close(self):
        """Close the cursor and then the connection."""
        self.cur.close()
        self.conn.close()

    def run(self):
        """Fetch the rows of the last statement, commit, and return the rows."""
        self.results = self.cur.fetchall()
        self.conn.commit()
        return self.results

    def newtb(self):
        """Create the 招标信息 table if it does not exist yet."""
        self.sqlcode = """
        create table if not exists 招标信息
        (序号 varchar(60),项目名称 varchar(20),公告类型 varchar(60),预算 varchar(20),招标单位 varchar(20),开标日期 varchar(20),中标单位 varchar(20),中标金额 varchar(20),发布时期 varchar(20))
        """
        self.dosql(self.sqlcode)
        return self.run()

    def add(self, *args):
        """Insert one row into 招标信息.

        Args:
            *args: Exactly nine values, in table column order (序号, 项目名称,
                公告类型, 预算, 招标单位, 开标日期, 中标单位, 中标金额, 发布时期).

        Raises:
            sqlite3.ProgrammingError: If the number of values is not nine.
        """
        self.sqlcode = """
        insert into 招标信息 values(?,?,?,?,?,?,?,?,?)
        """
        self.dosql(self.sqlcode, args)
        return self.run()

    def update(self, *args):
        """Set 发布时期 on the rows whose 项目名称 matches.

        Args:
            *args: Two values: the new 发布时期, then the 项目名称 to match.
        """
        # The table has no ``title`` column; titles are stored in 项目名称.
        self.sqlcode = """
        update 招标信息 set 发布时期 = ? where 项目名称 = ?
        """
        self.dosql(self.sqlcode, args)
        return self.run()

    def select(self):
        """Return every row of 招标信息 as a list of tuples."""
        self.sqlcode = """
        SELECT * FROM "招标信息"
        """
        self.dosql(self.sqlcode)
        return self.run()
def getnow():
    """Return the current local time as a 'YYYY-MM-DD HH:MM:SS' string."""
    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
def init_db():
    """Open 招标信息.db, make sure the table exists, and return the wrapper.

    A placeholder row (序号 "0", seven "测试" fields, current timestamp) is
    inserted on every call.
    """
    db = sql_con('招标信息.db')
    db.newtb()
    placeholder_row = ("0",) + ("测试",) * 7 + (getnow(),)
    db.add(*placeholder_row)
    return db
# Endpoint that getNewData() posts the search form to.
url = "https://www.jianyu360.com/front/pcAjaxReq"

# Request headers sent with every call.
# NOTE(review): these look like they were copied from a browser session, so
# the Cookie value is presumably tied to that session and may expire - confirm.
# NOTE(review): "Content-Length" is a fixed value here; HTTP clients normally
# compute it from the body - confirm it is still needed.
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Length": "148",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": (
        "SESSIONID=fe09da4429766a80c0f24ca85ca4ee04e708cea7; "
        "SESSIONID=fe09da4429766a80c0f24ca85ca4ee04e708cea7; "
        "UM_distinctid=16f215748375be-0d067d4eb3ecdd-2393f61-1fa400-16f21574838848; "
        "CNZZDATA1261815924=1070106432-1576810612-%7C1576810612; "
        "Hm_lvt_72331746d85dcac3dac65202d103e5d9=1576812366; "
        "Hm_lpvt_72331746d85dcac3dac65202d103e5d9=1576812701"
    ),
    "Host": "www.jianyu360.com",
    "Origin": "https://www.jianyu360.com",
    "Referer": "https://www.jianyu360.com/jylab/supsearch/index.html",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
}

# Form fields posted with each request; "industry" is filled in per call by
# getIndustry() / getNewData(). Key order is kept as in the original form.
data = dict(
    pageNumber="20",
    reqType="lastNews",
    searchvalue="",
    area="",
    subtype="",
    publishtime="",
    selectType="title",
    minprice="",
    maxprice="",
    industry="",
    tabularflag="Y",
)
def _parse_industry_list(html):
    """Extract the industry filter strings from the search page source.

    The page is expected to embed ``var industrylist = {...}`` immediately
    followed by a line starting with ``var sortArray``. Each key of that JSON
    object maps to a list of names; every key yields one string of the form
    ``"key_name1,key_name2,..."``.

    Args:
        html: Full text of the search page.

    Returns:
        One comma-joined string per key, or ``[]`` when the script variable
        is not found.

    Raises:
        ValueError: If the captured text is not valid JSON.
    """
    # Accept both "\r\n" and "\n" line endings before "var sortArray".
    found = re.search(r"var industrylist = (.*?)\r?\nvar sortArray", html, re.S)
    if found is None:
        return []
    industries = json.loads(found.group(1))
    return [
        ",".join(key + "_" + info for info in value)
        for key, value in industries.items()
    ]


def getIndustry(timeout=30):
    """Download the search page and return its industry filter strings.

    Args:
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        A list of ``"key_name1,key_name2,..."`` strings, one per top-level
        industry key, or ``[]`` when the page does not contain the list.

    Raises:
        requests.RequestException: On connection errors or timeout.
        ValueError: If the embedded industry list is not valid JSON.
    """
    # Post a copy of the shared form so the module-level dict is not mutated.
    form = dict(data, industry="")
    r = requests.post(
        "https://www.jianyu360.com/jylab/supsearch/index.html",
        headers=headers,
        data=form,
        timeout=timeout,
    )
    r.encoding = r.apparent_encoding
    return _parse_industry_list(r.text)
def getNewData(industry, timeout=30):
    """Fetch the latest records for one industry filter.

    Args:
        industry: Value sent as the ``industry`` form field.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The value stored under ``"list"`` in the JSON reply, or ``None`` when
        the reply is not a JSON object or has no ``"list"`` entry.

    Raises:
        requests.RequestException: On connection errors or timeout.
        ValueError: If the reply body is not valid JSON.
    """
    # Post a copy of the shared form so the module-level dict is not mutated.
    form = dict(data, industry=industry)
    r = requests.post(url, data=form, headers=headers, timeout=timeout)
    r.encoding = r.apparent_encoding
    payload = json.loads(r.text)
    if not isinstance(payload, dict):
        return None
    return payload.get("list")
def main():
    """Crawl the latest records of every industry and store them in SQLite.

    For each industry filter string the newest records are fetched, and each
    record that is a dict is inserted into the database with its id, title,
    industry and publish time; the remaining columns are left empty.
    """
    industry_list = getIndustry()
    db = init_db()
    try:
        print(industry_list)
        for industry in industry_list:
            print(industry)
            records = getNewData(str(industry))
            print(records)
            if not isinstance(records, list):
                continue
            for record in records:
                if not isinstance(record, dict):
                    continue
                record_id = record['_id']  # stored in column 序号
                title = record['title']  # stored in column 项目名称
                # NOTE(review): the "industry" field is stored in the 公告类型
                # column - confirm this is the intended mapping.
                notice_type = record["industry"]
                # "publishtime" is treated as epoch seconds and formatted in
                # local time; stored in column 发布时期.
                publish_time = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.localtime(int(record["publishtime"])),
                )
                db.add(record_id, title, notice_type, "", "", "", "", "", publish_time)
    finally:
        # Always release the SQLite connection, even if a request or insert fails.
        db.close()


if __name__ == '__main__':
    main()