python爬取天气网历史数据

采用 Python 的 bs4 和 urllib，从网站 http://www.tianqihoubao.com/lishi/beijing/month/201710.html 爬取了 2011 年 1 月到 2017 年 11 月 13 日的天气数据，主要包含以下四个字段：data（时间）、weather（天气）、temperature（温度）、wind（风力风向）。

'''
Created on 2017-11-13


@author: chen
'''
from bs4 import BeautifulSoup
from urllib.request import urlopen
#import xlsxwriter
import csv


#通过css获取所有月份的链接
def get_all_weather_url():
    """Yield the attribute dict of every month link on the history index page.

    Downloads the Beijing page for 2011-01, locates the ``<div class="months">``
    element and yields ``tag.attrs`` for each ``<a>`` tag inside it.

    Yields:
        dict: the HTML attributes of one ``<a>`` tag.
    """
    # Read the body inside a ``with`` block so the HTTP response is closed
    # right away instead of staying open while this generator is suspended.
    with urlopen("http://www.tianqihoubao.com/lishi/beijing/month/201101.html") as response:
        page_html = response.read()
    # Parse the downloaded HTML with the lxml parser.
    bs_obj = BeautifulSoup(page_html, "lxml")
    # The month links all live inside the div with CSS class "months".
    months = bs_obj.find("div", {"class": "months"})
    for month in months.find_all("a"):
        # Hand back every attribute of the anchor tag as a dict.
        yield month.attrs
#处理得到的url链接        
def get_page_url_weather():
    """Yield ``(weather_url, title)`` for each link from get_all_weather_url().

    The link's ``href`` is resolved against the site root with ``urljoin`` so
    that root-relative hrefs do not produce a doubled slash and absolute hrefs
    are passed through unchanged. Links without an ``href`` are skipped; a
    missing ``title`` is reported as an empty string.

    Yields:
        tuple[str, str]: absolute page URL and the link's title text.
    """
    from urllib.parse import urljoin

    base_url = "http://www.tianqihoubao.com/"
    for link_attrs in get_all_weather_url():
        href = link_attrs.get("href")
        if not href:
            # Nothing to fetch for an anchor that carries no target.
            continue
        yield urljoin(base_url, str(href)), link_attrs.get("title", "")
#通过每个月份的链接获取天气数据
def get_weather_data():
    """Yield one ``(data, weather, temperature, wind)`` tuple per table row.

    For every URL produced by get_page_url_weather() the page is downloaded,
    its first ``<table>`` is located, and the text of the first four ``<td>``
    cells of each ``<tr>`` is yielded with all whitespace removed.

    Pages without a ``<table>`` and rows with fewer than four ``<td>`` cells
    are skipped instead of raising.

    Yields:
        tuple[str, str, str, str]: cleaned text of the first four cells.
    """
    # Remember fetched URLs so a page linked more than once is downloaded only
    # once. (Original note: later real-time updating is not implemented yet.)
    seen_urls = set()
    for url, _title in get_page_url_weather():
        if url in seen_urls:
            continue
        seen_urls.add(url)

        # Close the HTTP response as soon as the body has been read.
        with urlopen(url) as response:
            page_html = response.read()

        table = BeautifulSoup(page_html, "lxml").table
        if table is None:
            # No table on this page, so there are no rows to extract.
            continue

        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 4:
                # Not a full data row; indexing cells[0..3] would fail.
                continue
            # split()/join drops every whitespace character (spaces, tabs,
            # "\r", "\n") that the page layout puts inside the cell text.
            yield tuple("".join(cell.get_text().split()) for cell in cells[:4])
def main(output_path=r"C:\Users\chen\Desktop\北京天气数据每日更新.csv"):
    """Write every row from get_weather_data() to a CSV file.

    Args:
        output_path: destination CSV file; defaults to the original
            hard-coded desktop path so existing ``main()`` calls still work.

    A progress line is printed after each row and a final line when done.
    """
    # NOTE: the original author reported that writing an .xlsx file with
    # xlsxwriter did not work, which is why CSV output is used here.
    # newline="" stops the csv module from emitting blank lines between rows.
    # An explicit encoding keeps the Chinese text from depending on the
    # platform's default encoding; "utf-8-sig" prepends a BOM.
    with open(output_path, "w", newline="", encoding="utf-8-sig") as file:
        writer = csv.writer(file)
        for count, row in enumerate(get_weather_data(), start=1):
            writer.writerow(list(row))
            print("第" + str(count) + "次写入成功")
        print("写入完毕")
# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

以上是爬取天气网数据的全部源码，希望大神指教，谢谢！！

python爬取天气网历史数据

猜你喜欢