爬取各城市天气网址小记

花了蛮长时间爬取的这个数据,网站都看腻了,记录一下过程。

  • 网站

中国天气网。

  • 爬取的数据

精确到县区:省名、市名、县区名及对应的天气网址。如河北、石家庄、长安区。

  • 爬取关键步骤

获取各分区的网址 → 再获取各分区内各省的网址 → 再获取各省下市及县区的网址。

  • 上图

这就是要爬取的分区网址,get 它进入下一步;页面里给的是相对路径,记得补上 http://www.weather.com.cn 前缀拼成完整网址。

 

这就是要爬取的省级数据了,在这里即可得到各省的网址,get 它进入下一步。

 

这里第一个标记就是要的市级数据了,处理它。

第二个标记就是县区数据,解决它。

OK!

  • 附上一份 C++ 与 Qt 的代码。

数据库中省、市、县区三张表采用同一个结构(id、名称、网址、外键 id)。

#ifndef WIDGET_H
#define WIDGET_H

#include <QWidget>
#include<vector>
using namespace std;
class Widget : public QWidget
{
    Q_OBJECT

public:
    Widget(QWidget *parent = 0);
    ~Widget();
    QString m_str;
    short provinceCount;//用于唯一指定省的ID
short cityCount;//用于唯一指定市的ID
short countyCount;//用于唯一指定县区的ID
    vector<QString>regionUrls;//储存含有省数据的各区网址
    vector<QString>provinceUrls;//储存各省的网址

signals:
void start();   public slots:
    void btnClicked();//获取含有省级行政单位数据的各区网址
    void btn1Clicked();//获取各省的网址
void btn2Clicked();//获取相应省的各县区的天气网址

};

#endif // WIDGET_H


#include "widget.h"
#include<QXmlStreamReader>
#include <QNetworkReply>
#include <QNetworkRequest>
#include <QRegularExpression>
#include <QSqlDatabase>
#include <QSqlQuery>
#include<qpushbutton.h>
#include<QVBoxLayout>
#include <QFile>
using namespace std;
/**
 * Builds the UI (three buttons stacked vertically, one per crawl stage) and
 * opens the default ODBC connection used by the crawl slots.
 *
 * The ID counters are zeroed in the initializer list so they are valid even
 * when the database cannot be opened and the constructor returns early.
 *
 * @param parent optional parent widget, forwarded to QWidget.
 */
Widget::Widget(QWidget *parent)
    : QWidget(parent),
      provinceCount(0),
      cityCount(0),
      countyCount(0)
{
    resize(400,400);

    QPushButton *btn  = new QPushButton(QStringLiteral("获取分区Url"));
    QPushButton *btn1 = new QPushButton(QStringLiteral("获取各省级区域url"));
    QPushButton *btn2 = new QPushButton(QStringLiteral("获取各省市县区url"));

    // Pointer-to-member connections are checked at compile time, unlike the
    // string-based SIGNAL/SLOT form, which only fails at run time.
    connect(btn,  &QPushButton::clicked, this, &Widget::btnClicked);
    connect(btn1, &QPushButton::clicked, this, &Widget::btn1Clicked);
    connect(btn2, &QPushButton::clicked, this, &Widget::btn2Clicked);

    // The layout is installed on this widget; widgets added to it are
    // reparented to this widget, so Qt deletes the buttons with the window.
    QVBoxLayout *vlay = new QVBoxLayout(this);
    vlay->addWidget(btn);
    vlay->addWidget(btn1);
    vlay->addWidget(btn2);

    QSqlDatabase db = QSqlDatabase::addDatabase("QODBC");
    db.setDatabaseName("qtmanager");
    db.setHostName("127.0.0.1");
    db.setUserName("sa");
    db.setPassword("*******");
    if (!db.open()) {
        qDebug()<<QStringLiteral("打开数据库失败!");
        return;  // without a connection the statement below could only fail
    }
    qDebug()<<QStringLiteral("打开数据库成功!");

    // The QSqlQuery(QString) constructor executes the statement immediately.
    // Presumably "myqq" is the database holding the province/city/county
    // tables written by the slots -- TODO confirm.
    QSqlQuery query("use myqq");
    if (!query.isActive())
        qDebug()<<QStringLiteral("切换数据库失败!");
}

// Nothing to release by hand here; the destructor has no work of its own.
Widget::~Widget() = default;

//获取含有省数据的各区网址
void Widget::btnClicked()
{

    QNetworkAccessManager*manager=new QNetworkAccessManager();
    QUrl url;
    url.setUrl("http://www.weather.com.cn/textFC/hb.shtml");
    QNetworkRequest request(url);
    request.setHeader(QNetworkRequest::ContentTypeHeader,
                      "application/x-www-form-urlencoded");
    QNetworkReply*reply=manager->get(request);

    connect(manager,&QNetworkAccessManager::finished,this,[=](){
        m_str=reply->readAll();
        //匹配获取含有省数据的各区网址
        QRegularExpression dateRegex("(?:<li ><span><a href=\")(/textFC/.+)(?:\">.+)(?:</a></span></li>)");
        QRegularExpressionMatchIterator dateItr = dateRegex.globalMatch(m_str);
        const vector<QString>::iterator iter=regionUrls.end();
        regionUrls.insert(iter,"http://www.weather.com.cn/textFC/hb.shtml");
        while (dateItr.hasNext()) {
            const vector<QString>::iterator iter=regionUrls.end();
            QRegularExpressionMatch m=dateItr.next();
            regionUrls.insert(iter,"http://www.weather.com.cn"+m.captured(1));
        }
        foreach (QString v, regionUrls) {
            qDebug()<<v;
        }
    });
}
//获取各省的网址
void Widget::btn1Clicked()
{
    if(regionUrls.empty())
        return;
    foreach (QString url,regionUrls) {
        unsigned* cityCount=new unsigned(0);//局部变量有风险,不能使用,于是临时指针
        QNetworkAccessManager*manager=new QNetworkAccessManager();
        QNetworkRequest request((QUrl(url)));
        request.setHeader(QNetworkRequest::ContentTypeHeader,
                          "application/x-www-form-urlencoded");
        QNetworkReply*reply=manager->get(request);

        connect(manager,&QNetworkAccessManager::finished,this,[=](){
            QByteArray data;
            data.resize(1024);
            data=reply->readLine(1024);
            while (!data.isEmpty()) {
                QRegularExpression lQC("^(?:<div class=\"lQCity\">)$");//获取其数量,减少多余匹配

                //匹配各省的天气网址
                QRegularExpression tar("^(?:<a href=\")(/textFC/.+)(?:\" target=\"_blank\">)(.+)(?:</a>)$");
                QRegularExpressionMatch lQCItr;
                QRegularExpressionMatch tarItr;
                lQCItr = lQC.match(QString::fromUtf8(data.data()));
                tarItr = tar.match(QString::fromUtf8(data.data()));
                if(lQCItr.hasMatch()){
                    data=reply->readLine(1024);
                    data=reply->readLine(1024);
                    QRegularExpression ctName("^(<li><a href=\".+\">.+</a></li>)$");
                    QRegularExpressionMatch ctItr = ctName.match(QString::fromUtf8(data.data()));

                    while(ctItr.hasMatch()){//获取其数量,减少多余匹配
                        (*cityCount)++;
                        data=reply->readLine(1024);
                        ctItr = ctName.match(QString::fromUtf8(data.data()));
                    }
                }
                else if(tarItr.hasMatch()){
                    for(unsigned i=0;i<*cityCount;){
                        QSqlQuery query;
                        tarItr = tar.match(QString::fromUtf8(data.data()));
                        if(tarItr.hasMatch()){//获取各省网址
                            provinceUrls.push_back(("http://www.weather.com.cn"+tarItr.captured(1)));
                            qDebug()<<QStringLiteral("市级名:")<<tarItr.captured(2)<<tarItr.captured(1);
                            query.prepare(QStringLiteral("insert into province values"
                                                         "(:id,:name,:path,:fkId)"));
                            query.bindValue(":id",++provinceCount);
                            query.bindValue(":name",tarItr.captured(2));
                            query.bindValue(":path",("http://www.weather.com.cn"+tarItr.captured(1)));
                            query.bindValue(":fkId",1);
                            if(!query.exec())
                                qDebug()<<QStringLiteral("爬取省url失败");
                            i++;
                        }
                        data=reply->readLine(1024);
                    }
                    break;//获取玩想要的数据结束读取
                }

                data=reply->readLine(1024);
            }
        });
    }

}

//获取相应省级行政单位的各县区的天气网址
void Widget::btn2Clicked()
{
    if(provinceUrls.empty())
        return;
    foreach (QString url, provinceUrls) {
        unsigned* ctCount=new unsigned(0);//局部变量有风险,不能使用,于是临时指针
        int* proId=new int(0);

        // qDebug()<<"start->";
        QNetworkAccessManager*manager=new QNetworkAccessManager();
        QNetworkRequest request((QUrl(url)));
        request.setHeader(QNetworkRequest::ContentTypeHeader,
                          "application/x-www-form-urlencoded");
        QNetworkReply*reply=manager->get(request);

        connect(manager,&QNetworkAccessManager::finished,this,[=](){
            QByteArray data;
            data.resize(1024);
            data=reply->readLine(1024);
            while (!data.isEmpty()) {
                QRegularExpression lQC("^(?:<div class=\"lQCity\">)$");//用于匹配获取其数量,减少多余匹配
                QRegularExpression proN("^(<a href=\"/textFC/.+.shtml\">)(.+)(</a>)$");//获取省级行政单位名
                //开始匹配各县区天气网址数据
                QRegularExpression province("^(?:<td width=\".+\" rowspan=\".+\" class=\"rowsPan\">)(.+)(?:</td>)$");
                QRegularExpressionMatch lQCItr;
                QRegularExpressionMatch proItr;
                QRegularExpressionMatch proNItr;
                lQCItr = lQC.match(QString::fromUtf8(data.data()));
                proItr = province.match(QString::fromUtf8(data.data()));
                proNItr = proN.match(QString::fromUtf8(data.data()));
                if(proItr.hasMatch()){
                    for(unsigned i=0;i<*ctCount;){
                        proItr =province.match(QString::fromUtf8(data.data()));
                        data=reply->readLine(1024);
                        if(proItr.hasMatch()){
                            i++;
                            short* tempCCount=new short;//要使用2次为了数据安全,使用临时指针
                            QSqlQuery query;
                            //插入数据到市表
                            query.prepare(QStringLiteral("insert into city "
                                                         "values(:id,:name,:url,:fkId)"));
                            *tempCCount=++cityCount;
                            query.bindValue(":id",*tempCCount);
                            query.bindValue(":name",proItr.captured(1));
                            query.bindValue(":url","");
                            query.bindValue(":fkId",*proId);
                            if(!query.exec())
                                qDebug()<<QStringLiteral("插入市记录失败");
                            //匹配各县区天气网址数据
                             data=reply->readLine(1024);
                            QRegularExpression checkName("^(?:<a href=\")(http://www.weather.com.cn/weather/.+shtml)"
                                                         "(?:\" target=\"_blank\">)(.+)(?:</a></td>)$");
                            QRegularExpressionMatch ckItr = checkName.match(QString::fromUtf8(data.data()));
                            if(ckItr.hasMatch()){//插入city的网址
                                query.prepare(QStringLiteral("update city set lookUrlPath=:url where id=:id"));
                                query.bindValue(":url",ckItr.captured(1));
                                query.bindValue(":id",*tempCCount);
                                if(!query.exec())
                                    qDebug()<<QStringLiteral("设置市的url失败");
                            }
                            while(true){
                                data=reply->readLine(1024);
                                ckItr = checkName.match(QString::fromUtf8(data.data()));
                                proItr = province.match(QString::fromUtf8(data.data()));
                                if(ckItr.hasMatch()){
                                    if(ckItr.captured(2)!=QStringLiteral("详情")){//过滤其他数据
                                        qDebug()<<ckItr.captured(2)<<ckItr.captured(1);
                                        QSqlQuery query;
                                        //插入数据
                                        query.prepare(QStringLiteral("insert into county "
                                                                     "values(:id,:name,:path,:fkId)"));
                                        query.bindValue(":id",++countyCount);
                                        query.bindValue(":name",ckItr.captured(2));
                                        query.bindValue(":path",ckItr.captured(1));
                                        query.bindValue(":fkId",*tempCCount);
                                        if(!query.exec())
                                            qDebug()<<QStringLiteral("爬取市或县区url失败");

                                    }
                                }else if(proItr.hasMatch())
                                    break;
                            }
                        }
                    }
                    break;//获取玩想要的数据结束读取
                }
                else if(proNItr.hasMatch()){//获取省的外键
                    QString proName=proNItr.captured(2);
                    QSqlQuery query;
                    //id 不能直接获取转换,需通过数据库管理器准换binary数据为int,再获取
                    query.prepare(QStringLiteral("SELECT CAST((select id from province where name=:name) as int)"));
                    query.bindValue(":name",proName);
                    if(!query.exec())
                        qDebug()<<QStringLiteral("查询 province ID failed");
                    while(query.next()){//获取记录数据
                        *proId=query.value(0).toInt();
                        qDebug()<<query.value(0).toInt();
                        qDebug()<<proName;
                    }
                }
                else if(lQCItr.hasMatch()){
                    data=reply->readLine(1024);
                    data=reply->readLine(1024);
                    QRegularExpression ctName("^(<li><a href=\".+\">.+</a></li>)$");
                    QRegularExpressionMatch ctItr = ctName.match(QString::fromUtf8(data.data()));

                    while(ctItr.hasMatch()){//获取其数量,减少多余匹配
                        (*ctCount)++;
                        data=reply->readLine(1024);
                        ctItr = ctName.match(QString::fromUtf8(data.data()));
                    }
                }

                data=reply->readLine(1024);
            }
        });
    }
}




  • 小记录
    // Attempt 1: try to detect connectivity with an asynchronous host lookup.
    QHostInfo::lookupHost("127.0.0.1",this,[=](QHostInfo host){
        if(host.error()!=QHostInfo::NoError)
            qDebug()<<QStringLiteral("网络连接失败!");// observed: cable plugged in but no internet -> 0, cable unplugged -> 1; i.e. this only detects whether the cable is connected
        qDebug()<<host.error();
    });
// Attempt 2: look at what a plain GET request returns.
QNetworkReply*reply=manager->get(request);
  reply->readall ;// observed: no cable -> "", wrong URL -> "", cable connected -> non-empty; judging by the crawl results this cannot detect the network state either

 看来这样不能检测网络有无变化,还需通过系统 API 实现,如 Windows 的 InternetGetConnectedState 函数;也就是先识别当前运行的操作系统,再调用该系统对应的 API。

 // Only plain (...) groups are numbered; (?:...) groups are non-capturing.
 // Here group 1 is the weather URL and group 2 is the link text.
 QRegularExpression checkName("^(?:<a href=\")(http://www.weather.com.cn/weather/.+shtml)"
"(?:\" target=\"_blank\">)(.+)(?:</a></td>)$"); 
QRegularExpressionMatch ckItr = checkName.match(QString::fromUtf8(data.data()));
ckItr.captured(2)

正则表达式第0组是全匹配,接下来依次是捕获组。

猜你喜欢

转载自blog.csdn.net/qq_40032304/article/details/103222397
今日推荐