花了蛮长时间爬取的这个数据,网站都看腻了,记录一下过程。
- 网站
中国天气网。
- 爬取的数据
精确到县区:省名、市名、县区名及对应的天气网址。如河北、石家庄、长安区。
- 爬取关键步骤
先获取各分区的网址 → 再获取各分区下各省的网址 → 最后获取各省下市及县区的网址。
- 上图
这就是要爬取的分区网址,拿到它之后进入下一步;页面里给出的是相对路径,记得补全为完整网址(加上 http://www.weather.com.cn 前缀)。
这就是要爬取的省级数据了,在这里即可得到各省的网址,拿到它之后进入下一步。
这里第一个标记就是要的市级数据了,处理它。
第二个标记就是县区数据,解决它。
OK!
- 附上 C++ 与 Qt 的代码。
数据库中省、市、县区三张表采用同一种结构(id、名称、网址、外键 id)。
#ifndef WIDGET_H
#define WIDGET_H
#include <QWidget>
#include<vector>
using namespace std;
class Widget : public QWidget
{
Q_OBJECT
public:
Widget(QWidget *parent = 0);
~Widget();
QString m_str;
short provinceCount;//用于唯一指定省的ID
short cityCount;//用于唯一指定市的ID
short countyCount;//用于唯一指定县区的ID
vector<QString>regionUrls;//储存含有省数据的各区网址
vector<QString>provinceUrls;//储存各省的网址
signals:
void start(); public slots:
void btnClicked();//获取含有省级行政单位数据的各区网址
void btn1Clicked();//获取各省的网址
void btn2Clicked();//获取相应省的各县区的天气网址
};
#endif // WIDGET_H
#include "widget.h"
#include<QXmlStreamReader>
#include <QNetworkReply>
#include <QNetworkRequest>
#include <QRegularExpression>
#include <QSqlDatabase>
#include <QSqlQuery>
#include<qpushbutton.h>
#include<QVBoxLayout>
#include <QFile>
using namespace std;
/**
 * Builds the three-button UI, wires each button to its crawl stage, opens the
 * ODBC database connection and resets the ID counters.
 *
 * @param parent  optional parent widget, forwarded to QWidget.
 */
Widget::Widget(QWidget *parent)
    : QWidget(parent)
    , provinceCount(0)
    , cityCount(0)
    , countyCount(0)
{
    resize(400, 400);

    QPushButton *regionBtn = new QPushButton(QStringLiteral("获取分区Url"));
    QPushButton *provinceBtn = new QPushButton(QStringLiteral("获取各省级区域url"));
    QPushButton *countyBtn = new QPushButton(QStringLiteral("获取各省市县区url"));

    // Pointer-to-member connections are verified at compile time, unlike the
    // string-based SIGNAL()/SLOT() form which only fails at run time.
    connect(regionBtn, &QPushButton::clicked, this, &Widget::btnClicked);
    connect(provinceBtn, &QPushButton::clicked, this, &Widget::btn1Clicked);
    connect(countyBtn, &QPushButton::clicked, this, &Widget::btn2Clicked);

    // The layout is installed on this widget and takes over the buttons.
    QVBoxLayout *buttonColumn = new QVBoxLayout(this);
    buttonColumn->addWidget(regionBtn);
    buttonColumn->addWidget(provinceBtn);
    buttonColumn->addWidget(countyBtn);

    QSqlDatabase db = QSqlDatabase::addDatabase("QODBC");
    db.setDatabaseName("qtmanager");
    db.setHostName("127.0.0.1");
    db.setUserName("sa");
    db.setPassword("*******");
    if (!db.open()) {
        qDebug() << QStringLiteral("打开数据库失败!");
        return;  // no point selecting a database on a connection that is not open
    }
    qDebug() << QStringLiteral("打开数据库成功!");

    // Switch the default connection to the database that holds the crawl tables.
    QSqlQuery selectDb;
    if (!selectDb.exec(QStringLiteral("use myqq")))
        qDebug() << QStringLiteral("切换数据库失败!");
}
// Nothing to release by hand: the widget holds no raw owning members.
Widget::~Widget() = default;
//获取含有省数据的各区网址
void Widget::btnClicked()
{
QNetworkAccessManager*manager=new QNetworkAccessManager();
QUrl url;
url.setUrl("http://www.weather.com.cn/textFC/hb.shtml");
QNetworkRequest request(url);
request.setHeader(QNetworkRequest::ContentTypeHeader,
"application/x-www-form-urlencoded");
QNetworkReply*reply=manager->get(request);
connect(manager,&QNetworkAccessManager::finished,this,[=](){
m_str=reply->readAll();
//匹配获取含有省数据的各区网址
QRegularExpression dateRegex("(?:<li ><span><a href=\")(/textFC/.+)(?:\">.+)(?:</a></span></li>)");
QRegularExpressionMatchIterator dateItr = dateRegex.globalMatch(m_str);
const vector<QString>::iterator iter=regionUrls.end();
regionUrls.insert(iter,"http://www.weather.com.cn/textFC/hb.shtml");
while (dateItr.hasNext()) {
const vector<QString>::iterator iter=regionUrls.end();
QRegularExpressionMatch m=dateItr.next();
regionUrls.insert(iter,"http://www.weather.com.cn"+m.captured(1));
}
foreach (QString v, regionUrls) {
qDebug()<<v;
}
});
}
//获取各省的网址
void Widget::btn1Clicked()
{
if(regionUrls.empty())
return;
foreach (QString url,regionUrls) {
unsigned* cityCount=new unsigned(0);//局部变量有风险,不能使用,于是临时指针
QNetworkAccessManager*manager=new QNetworkAccessManager();
QNetworkRequest request((QUrl(url)));
request.setHeader(QNetworkRequest::ContentTypeHeader,
"application/x-www-form-urlencoded");
QNetworkReply*reply=manager->get(request);
connect(manager,&QNetworkAccessManager::finished,this,[=](){
QByteArray data;
data.resize(1024);
data=reply->readLine(1024);
while (!data.isEmpty()) {
QRegularExpression lQC("^(?:<div class=\"lQCity\">)$");//获取其数量,减少多余匹配
//匹配各省的天气网址
QRegularExpression tar("^(?:<a href=\")(/textFC/.+)(?:\" target=\"_blank\">)(.+)(?:</a>)$");
QRegularExpressionMatch lQCItr;
QRegularExpressionMatch tarItr;
lQCItr = lQC.match(QString::fromUtf8(data.data()));
tarItr = tar.match(QString::fromUtf8(data.data()));
if(lQCItr.hasMatch()){
data=reply->readLine(1024);
data=reply->readLine(1024);
QRegularExpression ctName("^(<li><a href=\".+\">.+</a></li>)$");
QRegularExpressionMatch ctItr = ctName.match(QString::fromUtf8(data.data()));
while(ctItr.hasMatch()){//获取其数量,减少多余匹配
(*cityCount)++;
data=reply->readLine(1024);
ctItr = ctName.match(QString::fromUtf8(data.data()));
}
}
else if(tarItr.hasMatch()){
for(unsigned i=0;i<*cityCount;){
QSqlQuery query;
tarItr = tar.match(QString::fromUtf8(data.data()));
if(tarItr.hasMatch()){//获取各省网址
provinceUrls.push_back(("http://www.weather.com.cn"+tarItr.captured(1)));
qDebug()<<QStringLiteral("市级名:")<<tarItr.captured(2)<<tarItr.captured(1);
query.prepare(QStringLiteral("insert into province values"
"(:id,:name,:path,:fkId)"));
query.bindValue(":id",++provinceCount);
query.bindValue(":name",tarItr.captured(2));
query.bindValue(":path",("http://www.weather.com.cn"+tarItr.captured(1)));
query.bindValue(":fkId",1);
if(!query.exec())
qDebug()<<QStringLiteral("爬取省url失败");
i++;
}
data=reply->readLine(1024);
}
break;//获取玩想要的数据结束读取
}
data=reply->readLine(1024);
}
});
}
}
//获取相应省级行政单位的各县区的天气网址
void Widget::btn2Clicked()
{
if(provinceUrls.empty())
return;
foreach (QString url, provinceUrls) {
unsigned* ctCount=new unsigned(0);//局部变量有风险,不能使用,于是临时指针
int* proId=new int(0);
// qDebug()<<"start->";
QNetworkAccessManager*manager=new QNetworkAccessManager();
QNetworkRequest request((QUrl(url)));
request.setHeader(QNetworkRequest::ContentTypeHeader,
"application/x-www-form-urlencoded");
QNetworkReply*reply=manager->get(request);
connect(manager,&QNetworkAccessManager::finished,this,[=](){
QByteArray data;
data.resize(1024);
data=reply->readLine(1024);
while (!data.isEmpty()) {
QRegularExpression lQC("^(?:<div class=\"lQCity\">)$");//用于匹配获取其数量,减少多余匹配
QRegularExpression proN("^(<a href=\"/textFC/.+.shtml\">)(.+)(</a>)$");//获取省级行政单位名
//开始匹配各县区天气网址数据
QRegularExpression province("^(?:<td width=\".+\" rowspan=\".+\" class=\"rowsPan\">)(.+)(?:</td>)$");
QRegularExpressionMatch lQCItr;
QRegularExpressionMatch proItr;
QRegularExpressionMatch proNItr;
lQCItr = lQC.match(QString::fromUtf8(data.data()));
proItr = province.match(QString::fromUtf8(data.data()));
proNItr = proN.match(QString::fromUtf8(data.data()));
if(proItr.hasMatch()){
for(unsigned i=0;i<*ctCount;){
proItr =province.match(QString::fromUtf8(data.data()));
data=reply->readLine(1024);
if(proItr.hasMatch()){
i++;
short* tempCCount=new short;//要使用2次为了数据安全,使用临时指针
QSqlQuery query;
//插入数据到市表
query.prepare(QStringLiteral("insert into city "
"values(:id,:name,:url,:fkId)"));
*tempCCount=++cityCount;
query.bindValue(":id",*tempCCount);
query.bindValue(":name",proItr.captured(1));
query.bindValue(":url","");
query.bindValue(":fkId",*proId);
if(!query.exec())
qDebug()<<QStringLiteral("插入市记录失败");
//匹配各县区天气网址数据
data=reply->readLine(1024);
QRegularExpression checkName("^(?:<a href=\")(http://www.weather.com.cn/weather/.+shtml)"
"(?:\" target=\"_blank\">)(.+)(?:</a></td>)$");
QRegularExpressionMatch ckItr = checkName.match(QString::fromUtf8(data.data()));
if(ckItr.hasMatch()){//插入city的网址
query.prepare(QStringLiteral("update city set lookUrlPath=:url where id=:id"));
query.bindValue(":url",ckItr.captured(1));
query.bindValue(":id",*tempCCount);
if(!query.exec())
qDebug()<<QStringLiteral("设置市的url失败");
}
while(true){
data=reply->readLine(1024);
ckItr = checkName.match(QString::fromUtf8(data.data()));
proItr = province.match(QString::fromUtf8(data.data()));
if(ckItr.hasMatch()){
if(ckItr.captured(2)!=QStringLiteral("详情")){//过滤其他数据
qDebug()<<ckItr.captured(2)<<ckItr.captured(1);
QSqlQuery query;
//插入数据
query.prepare(QStringLiteral("insert into county "
"values(:id,:name,:path,:fkId)"));
query.bindValue(":id",++countyCount);
query.bindValue(":name",ckItr.captured(2));
query.bindValue(":path",ckItr.captured(1));
query.bindValue(":fkId",*tempCCount);
if(!query.exec())
qDebug()<<QStringLiteral("爬取市或县区url失败");
}
}else if(proItr.hasMatch())
break;
}
}
}
break;//获取玩想要的数据结束读取
}
else if(proNItr.hasMatch()){//获取省的外键
QString proName=proNItr.captured(2);
QSqlQuery query;
//id 不能直接获取转换,需通过数据库管理器准换binary数据为int,再获取
query.prepare(QStringLiteral("SELECT CAST((select id from province where name=:name) as int)"));
query.bindValue(":name",proName);
if(!query.exec())
qDebug()<<QStringLiteral("查询 province ID failed");
while(query.next()){//获取记录数据
*proId=query.value(0).toInt();
qDebug()<<query.value(0).toInt();
qDebug()<<proName;
}
}
else if(lQCItr.hasMatch()){
data=reply->readLine(1024);
data=reply->readLine(1024);
QRegularExpression ctName("^(<li><a href=\".+\">.+</a></li>)$");
QRegularExpressionMatch ctItr = ctName.match(QString::fromUtf8(data.data()));
while(ctItr.hasMatch()){//获取其数量,减少多余匹配
(*ctCount)++;
data=reply->readLine(1024);
ctItr = ctName.match(QString::fromUtf8(data.data()));
}
}
data=reply->readLine(1024);
}
});
}
}
- 小记录
QHostInfo::lookupHost("127.0.0.1",this,[=](QHostInfo host){
if(host.error()!=QHostInfo::NoError)
qDebug()<<QStringLiteral("网络连接失败!");//有网线没网0 没网线=1 也就是只能检测网线的连接
qDebug()<<host.error();
});
QNetworkReply*reply=manager->get(request);
reply->readall ;//没网线==“” 网址错误=“” 有网线!=“”;通过爬取状况来看,这个不能检测网络阿
看来这样无法检测网络状态的变化,还需要借助系统 API 实现,例如 Windows 的 InternetGetConnectedState 函数;也就是先识别程序所运行的操作系统,再调用相应系统的 API。
QRegularExpression checkName("^(?:<a href=\")(http://www.weather.com.cn/weather/.+shtml)"
"(?:\" target=\"_blank\">)(.+)(?:</a></td>)$");
QRegularExpressionMatch ckItr = checkName.match(QString::fromUtf8(data.data()));
ckItr.captured(2)
正则表达式第0组是全匹配,接下来依次是捕获组。