本文使用R语言 rvest爬取中国天气网所有城市未来七天天气数据并写入oracle数据库,其中包括了如何使用R语言连接oracle数据库,以及爬取时候的简单策略,最后对爬取到的数据组装成数据框并写入数据库,可以作为R语言初中级爱好者们很好的参考例子,当然这是我很久前写的代码,很多地方没有进行优化,比如使用了for循环,其实应该封装到函数中使用sapply来处理循环以提高效率。感兴趣的朋友想与我交流的话可以加群R语言&大数据分析456726635,或者加群Python & Spark 636866908。下面废话不多说了直接贴代码。
library(rvest)
library(ROracle)
library(curl)
library(stats)
library(lubridate)
library(xts)
library(zoo)
library(TTR)
library(forecast)
library(fGarch)
library(tseries)
library(FinTS)
library(rugarch)
# ---- Oracle connection settings ----
localdrv <- dbDriver("Oracle")
localhost <- "192.168.11.170"
localport <- 1521
sid <- "BIFORECAST"
# Assemble the connect descriptor from host, port and SID.
localstring <- paste0(
  "(DESCRIPTION=",
  "(ADDRESS=(PROTOCOL=tcp)(HOST=", localhost, ")(PORT=", localport, "))",
  "(CONNECT_DATA=(SID=", sid, ")))"
)
# Open the connection with user name / password authentication.
localcon <- dbConnect(
  localdrv,
  dbname = localstring,
  username = "bi",
  password = "bi"
)
# Step 1: define the helper functions
# Read the index page and return the href of every region link.
#
# Args:
#   url: address of the page that lists the regions.
# Returns:
#   Character vector with the href attribute of each matched <a> element.
read.region.url <- function(url) {
  # Download the page once and keep the parsed document; the earlier version
  # fetched the same page a second time even when the first attempt succeeded.
  web <- try(read_html(url, encoding = "utf8"), silent = TRUE)
  if (inherits(web, "try-error")) {
    # Single retry; if it fails too, the error propagates to the caller.
    web <- read_html(url, encoding = "utf8")
  }
  web %>%
    html_nodes("div.maptabboxinBox") %>%
    html_nodes("div.maptabbox") %>%
    html_nodes("h4") %>%
    html_nodes("a") %>%
    html_attr("href")
}
# Read one region page and return the absolute URLs of its province pages.
#
# Args:
#   region.url: address of a region page.
# Returns:
#   Character vector of unique province URLs, each prefixed with
#   "http://www.weather.com.cn"; character(0) when no link is found.
read.province.url <- function(region.url) {
  # Download the page once and keep the parsed document; the earlier version
  # fetched the same page a second time even when the first attempt succeeded.
  web <- try(read_html(region.url, encoding = "utf8"), silent = TRUE)
  if (inherits(web, "try-error")) {
    # Single retry; if it fails too, the error propagates to the caller.
    web <- read_html(region.url, encoding = "utf8")
  }
  hrefs <- web %>%
    html_nodes("div") %>%
    html_nodes("tr") %>%
    html_nodes("td.rowsPan") %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    unique()
  # Pasting a prefix onto a zero-length vector yields the bare prefix, which
  # would then be crawled as if it were a province page; return nothing instead.
  if (length(hrefs) == 0) {
    return(character(0))
  }
  paste0("http://www.weather.com.cn", hrefs)
}
# Read one province page and return the hrefs of its city links.
#
# Args:
#   province.url: address of a province page.
# Returns:
#   Character vector with the unique href attributes of the matched <a>
#   elements, exactly as they appear in the page.
read.city.url <- function(province.url) {
  # Download the page once and keep the parsed document; the earlier version
  # fetched the same page a second time even when the first attempt succeeded.
  web <- try(read_html(province.url, encoding = "utf8"), silent = TRUE)
  if (inherits(web, "try-error")) {
    # Single retry; if it fails too, the error propagates to the caller.
    web <- read_html(province.url, encoding = "utf8")
  }
  web %>%
    html_nodes("div") %>%
    html_nodes("tr") %>%
    html_nodes("td") %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    unique()
}
# Step 2: collect the URLs to crawl
# Entry page of the China Weather forecast site
url <- "http://www.weather.com.cn/forecast/index.shtml"
# Region links found on the entry page
region.url <- read.region.url(url)
# Province links: visit every region page and flatten the results
province.url <- region.url %>%
  lapply(read.province.url) %>%
  unlist()
# City links: visit every province page and flatten the results
city.urls <- province.url %>%
  lapply(read.city.url) %>%
  unlist()
# Step 3: crawl the forecast of every city and append it to Oracle

# Convert the "\n"-split text pieces of the forecast <li> entries into one
# data frame with the columns weather / temperature / wind.
#
# Args:
#   items: list of character vectors, one vector per forecast entry.
# Returns:
#   Data frame with one row per entry (zero rows when `items` is empty).
parse.forecast.items <- function(items) {
  rows <- lapply(items, function(j) {
    if (length(j) > 5) {
      # Six or more pieces: pieces 4 and 5 are pasted together as the
      # temperature and piece 6 is taken as the wind.
      data.frame(weather = j[3], temperature = paste0(j[4], j[5]), wind = j[6])
    } else {
      data.frame(weather = j[3], temperature = j[4], wind = j[5])
    }
  })
  if (length(rows) == 0) {
    return(data.frame(
      weather = character(0),
      temperature = character(0),
      wind = character(0)
    ))
  }
  # Every row uses the same column names, so rbind cannot stop with
  # "names do not match previous names" when entries differ in shape.
  do.call(rbind, rows)
}

# Uncomment to test on a handful of cities only.
# city.urls <- head(city.urls)
# Counter of cities written since the last pause.
ii <- 1
# `finally` closes the connection even when the crawl stops with an error.
tryCatch({
  for (i in city.urls) {
    print(ii)
    # Take a 20 second break after 1500 cities, then restart the counter.
    if (ii > 1500) {
      Sys.sleep(20)
      ii <- 1
    }
    print(i)
    # Fetch the page once, with a browser-like user agent. A city that
    # cannot be fetched is reported and skipped instead of stopping the crawl.
    web <- try(
      read_html(
        curl(i, handle = new_handle("useragent" = "Mozilla/5.0")),
        encoding = "utf8"
      ),
      silent = TRUE
    )
    if (inherits(web, "try-error")) {
      print("Error con111")
      next
    }
    # Breadcrumb block: the <a> texts and the <span> texts are both read from it.
    crumbs <- web %>%
      html_nodes("div") %>%
      html_nodes("div") %>%
      html_nodes("div") %>%
      html_nodes("div.crumbs.fl")
    link.text <- crumbs %>%
      html_nodes("a") %>%
      html_text()
    span.text <- crumbs %>%
      html_nodes("span") %>%
      html_text()
    # First link is the province; a second link, when present, is the city.
    province <- link.text[1]
    city <- if (length(link.text) > 1) link.text[2] else link.text
    # The last <span> confirms the city name. When it reads "城区" the name
    # taken from the links is kept. A missing <span> no longer raises
    # "argument is of length zero".
    city1 <- if (length(span.text) > 0) {
      span.text[length(span.text)]
    } else {
      NA_character_
    }
    if (!is.na(city1) && city1 != "城区") {
      city <- city1
    }
    if (length(city) == 0) {
      print("Error: city name not found")
      next
    }
    print(city)
    # One character vector per forecast <li>, split on runs of newlines.
    weather <- web %>%
      html_nodes("div") %>%
      html_nodes("ul.t.clearfix") %>%
      html_nodes("li") %>%
      html_text() %>%
      strsplit(split = "[\n]+")
    newweather <- parse.forecast.items(weather)
    if (nrow(newweather) == 0) {
      print("Error: forecast not found")
      next
    }
    # One date per forecast row, starting today; the row count is no longer
    # assumed to be exactly seven.
    date <- as.character(Sys.Date() + seq_len(nrow(newweather)) - 1)
    # Assemble the rows in the column layout used for both target tables.
    writedata <- data.frame(
      CITY = city,
      PERIOD_SDATE = date,
      WEATHER = newweather$weather,
      TEMPERATURE = newweather$temperature,
      WIND = newweather$wind,
      PROVINCE_NAME = province
    )
    print(writedata)
    # Append the rows to both tables.
    dbWriteTable(localcon, "WECHAT_NEW_WEATHER_FORECAST", writedata,
                 row.names = FALSE, append = TRUE, ora.number = FALSE)
    dbWriteTable(localcon, "WECHAT_NEW_WEATHER_FORECAST_TT", writedata,
                 row.names = FALSE, append = TRUE, ora.number = FALSE)
    ii <- ii + 1
  }
}, finally = dbDisconnect(localcon))