手撸爬虫(天气预报)

代码:

import random
import time
import urllib.request
import urllib.response
import pymysql
from lxml import etree
import schedule


# Upper bound (seconds) for each HTTP request so a stalled server cannot
# block the calling scheduler loop indefinitely.
_FETCH_TIMEOUT_SECONDS = 30

# Pool of browser User-Agent strings; each request picks one from its own
# index range (see sprider).
_USER_AGENTS = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
]

# moji.com page: container holding the current temperature (<em>) and the
# current description (<b>).
_CURRENT_BOX_XPATH = "//div[@class='left']//div[@class='wea_weather clearfix']"
# weather.com.cn page: forecast descriptions and forecast temperatures.
_FORECAST_DES_XPATH = "//div[@class='t']//ul[@class='clearfix']/li/p[@class='wea']/text()"
_FORECAST_TEM_XPATH = "//div[@class='t']//ul[@class='clearfix']//p[@class='tem']/span//text()"

# Placeholders are filled in by the driver, which escapes every value.
_INSERT_SQL = """INSERT INTO weather(
               placeme,currentwerme,minmaxme,currentdesme,forecastdesme,
               placeyou,currentweryou,minmaxyou,currentdesyou,forecastdesyou,
               updatedate
               )
               VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""


def _fetch_tree(url, user_agent):
    """Download *url* with the given User-Agent and return the parsed lxml HTML tree."""
    request = urllib.request.Request(url=url, headers={'User-Agent': user_agent})
    # The context manager closes the HTTP response even if read()/decode() fails.
    with urllib.request.urlopen(request, timeout=_FETCH_TIMEOUT_SECONDS) as response:
        html = response.read().decode('utf-8')
    return etree.HTML(html)


def _require_nodes(nodes, count, what, url):
    """Return *nodes* if it has at least *count* entries, else raise IndexError with context."""
    if len(nodes) < count:
        raise IndexError(
            "expected at least {0} {1} value(s) from {2}, found {3}".format(
                count, what, url, len(nodes)))
    return nodes


def _current_conditions(tree, url):
    """Return (temperature + "℃", description) parsed from a moji.com page tree."""
    temps = _require_nodes(
        tree.xpath(_CURRENT_BOX_XPATH + "/em/text()"), 1, "current temperature", url)
    descs = _require_nodes(
        tree.xpath(_CURRENT_BOX_XPATH + "/b/text()"), 1, "current description", url)
    # str() turns lxml's string subclass into a plain str before it reaches the DB driver.
    return str(temps[0]) + "℃", str(descs[0])


def _forecast(tree, url):
    """Return (description, temperature range) parsed from a weather.com.cn page tree.

    The description joins the first two forecast texts with "转"; the range
    joins the first two temperature texts with "/" and appends "℃".
    """
    descs = _require_nodes(tree.xpath(_FORECAST_DES_XPATH), 2, "forecast description", url)
    temps = _require_nodes(tree.xpath(_FORECAST_TEM_XPATH), 2, "forecast temperature", url)
    description = str(descs[0]) + "转" + str(descs[1])
    temp_range = str(temps[0]) + "/" + str(temps[1]) + "℃"
    return description, temp_range


def _store_weather(row):
    """Insert one row (11 values, in column order of _INSERT_SQL) into the weather table.

    Database errors during the insert are logged and rolled back rather than
    raised, so a failed write does not abort the caller. The connection is
    always closed.
    """
    import logging

    # Keyword arguments: host, user, password, database (PyMySQL 1.0+ no
    # longer accepts these positionally).
    db = pymysql.connect(host="****", user="****", password="****", database="****",
                         port=3306, charset='utf8')
    try:
        with db.cursor() as cursor:
            # Parameterized execution: scraped text is escaped by the driver
            # instead of being spliced into the SQL string.
            cursor.execute(_INSERT_SQL, row)
        db.commit()
    except Exception:
        # Best-effort write: record the failure and undo the transaction.
        logging.exception("failed to insert weather row; rolling back")
        db.rollback()
    finally:
        db.close()


def sprider():
    """Scrape current and forecast weather for two places and store one row in MySQL.

    Current temperature/description come from tianqi.moji.com; forecast
    description and temperature range come from www.weather.com.cn. The
    values for both places plus the current Unix timestamp are inserted as a
    single row into the ``weather`` table.

    Raises:
        urllib.error.URLError: if a page cannot be fetched (including timeout).
        UnicodeDecodeError: if a page body is not valid UTF-8.
        IndexError: if an expected element is missing from a page.

    Errors during the database insert are logged and rolled back, not raised.
    """
    currenturl1 = "https://tianqi.moji.com/weather/china/jilin/baishan"  # Baishan, current conditions
    currenturl2 = "https://tianqi.moji.com/weather/china/sichuan/yingshan-county"  # Yingshan, current conditions
    minmaxurl1 = "http://www.weather.com.cn/weather1d/101060901.shtml"  # Baishan, forecast
    minmaxurl2 = "http://www.weather.com.cn/weather1d/101270503.shtml"  # Yingshan, forecast
    place1 = "白山"
    place2 = "营山"

    # Fetch all four pages first, each with a User-Agent drawn from its own
    # slice of the pool.
    current_tree1 = _fetch_tree(currenturl1, _USER_AGENTS[random.randint(0, 3)])
    current_tree2 = _fetch_tree(currenturl2, _USER_AGENTS[random.randint(4, 6)])
    forecast_tree1 = _fetch_tree(minmaxurl1, _USER_AGENTS[random.randint(7, 9)])
    forecast_tree2 = _fetch_tree(minmaxurl2, _USER_AGENTS[random.randint(9, 11)])

    currentwer1, currentdes1 = _current_conditions(current_tree1, currenturl1)
    currentwer2, currentdes2 = _current_conditions(current_tree2, currenturl2)
    forecastdes1, minmax1 = _forecast(forecast_tree1, minmaxurl1)
    forecastdes2, minmax2 = _forecast(forecast_tree2, minmaxurl2)

    # Unix timestamp in seconds, sent to the database as a string literal.
    date = str(int(time.time()))

    _store_weather((
        place1, currentwer1, minmax1, currentdes1, forecastdes1,
        place2, currentwer2, minmax2, currentdes2, forecastdes2,
        date,
    ))


def _run_sprider_safely():
    """Run sprider once, logging any failure instead of letting it propagate.

    schedule does not catch exceptions raised by jobs, so an uncaught error
    from a single scrape (network failure, changed page layout) would escape
    schedule.run_pending() and terminate the loop below.
    """
    import logging

    try:
        sprider()
    except Exception:
        logging.exception("sprider run failed; waiting for the next scheduled run")


# Run the scraper every two hours.
schedule.every(120).minutes.do(_run_sprider_safely)
while True:
    # Execute any job whose scheduled time has arrived, then idle briefly.
    schedule.run_pending()
    time.sleep(1)

服务器端运行

ps -ef |grep python //查看运行的python程序
kill -9 进程号(PID)  //杀掉进程(PID 为 ps -ef 输出的第二列)
nohup python3 -u test.py > test.log 2>&1 & //后台运行(输出日志)
//需要注意这里用python3 和pip3
nohup python3 -u test.py >/dev/null 2>&1&  //后台运行(不输出日志)

猜你喜欢

转载自blog.csdn.net/qq_35416214/article/details/106231434
今日推荐