# 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/beijiafei/article/details/80157642
import requests
import re
import xlwt
def getHTMLText(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The decoded response body, or ``None`` when anything goes wrong
        (the error text is printed instead of being raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn HTTP error statuses into exceptions
        # Decode using the encoding detected from the body itself.
        response.encoding = response.apparent_encoding
        return response.text
    except Exception as e:
        # Best-effort fetch: a single bad page must not abort the whole run.
        print(str(e))
        return None
def parsePage(ilt, html):
    """Extract the daily weather records from *html* and append them to *ilt*.

    Each field is collected with its own regular expression, giving one list
    per field; the lists are then combined position by position into rows of
    ``[date, high, low, weather, wind direction, wind force, AQI,
    AQI description, AQI level]``. Dates have ``-`` replaced by ``/``.

    Args:
        ilt: List that receives one 9-item list per parsed day (mutated
            in place).
        html: Text to scan. Anything that is not a ``str`` (for example the
            ``None`` produced by a failed download) is reported and skipped.

    Returns:
        None. Prints "异常" when *html* is unusable or when some field was
        found fewer times than the date field, in which case only the rows
        that have every field are appended.
    """
    if not isinstance(html, str):
        print("异常")
        return

    patterns = (
        "ymd:'(.*?)',",
        "bWendu:'(.*?)℃',",
        "yWendu:'(.*?)℃',",
        "tianqi:'(.*?)',",
        "fengxiang:'(.*?)',",
        ",fengli:'(.*?)'",
        "aqi:'(.*?)',",
        "aqiInfo:'(.*?)',",
        ",aqiLevel:'(.*?)'",
    )
    columns = [re.findall(pattern, html) for pattern in patterns]

    # The date is present for every day, so it is the reference column.
    columns[0] = [day.replace("-", "/") for day in columns[0]]

    # zip stops at the shortest column, so a row is only emitted when all
    # nine fields are available for that position.
    ilt.extend(list(fields) for fields in zip(*columns))

    if any(len(column) < len(columns[0]) for column in columns):
        print("异常")
def printGoodsList(ilt, filename="西宁.xls"):
    """Write the rows of *ilt* into a new workbook and save it to disk.

    Args:
        ilt: Sequence of rows; every row is a sequence of cell values. Row
            ``i`` / item ``j`` is written to cell ``(i, j)`` of "sheet1".
        filename: Path of the workbook to save. Defaults to the name the
            script has always used.

    Returns:
        None. The only effect is the file written by ``Workbook.save``.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("sheet1")
    for row_index, row in enumerate(ilt):
        for col_index, value in enumerate(row):
            sheet.write(row_index, col_index, value)
    workbook.save(filename)
def buildHistoryUrl(year, month, base="http://tianqi.2345.com/t/wea_history/js/"):
    """Return the history-data URL for one month of station 52866.

    Args:
        year: Four-digit year.
        month: Month number, 1-12.
        base: Prefix shared by every URL.

    Returns:
        The URL string for that month.
    """
    if year == 2016 and month in (1, 2):
        # These two months use an irregular address: no month directory and
        # no zero padding. The string is kept exactly as the script has
        # always built it, which yields a double slash after the base.
        # NOTE(review): confirm the server still expects this form.
        return base + "/52866_" + str(year) + str(month) + ".js"
    stamp = "{}{:02d}".format(year, month)
    return base + stamp + "/52866_" + stamp + ".js"


if __name__ == "__main__":
    start_url = "http://tianqi.2345.com/t/wea_history/js/"  # prefix shared by every URL
    list0 = []
    list0.append(["日期", "最高气温", "最低气温", "天气", "风向", "风力", "空气质量指数", "空气质量评价", "空气质量等级"])
    for i in range(2016, 2019):  # 2016, 2017 and 2018
        # Only January-April are requested for 2018; other years are complete.
        last_month = 4 if i == 2018 else 12
        for j in range(1, last_month + 1):
            url = buildHistoryUrl(i, j, start_url)
            try:
                html = getHTMLText(url)
                # parsePage appends to list0 in place.
                parsePage(list0, html)
            except Exception as e:
                # Skip just this month rather than dropping the rest of the year.
                print(str(e))
    printGoodsList(list0)
# 刚学的知识,不足之处,欢迎大家前来评论。