案例-浙江省律师综合管理平台信息爬取

爬取浙江省律师综合管理平台上的律师事务所及律师基本信息。截至2018-08-06,代码依然有效,直接运行即可。

library("RCurl")
library("XML")
library("dplyr")

# Law-firm crawl: request configuration
# Base address of the site; relative links found in the pages are appended
# to it later on.
Host_url <- "http://lsgl.zjsft.gov.cn"
# Endpoint that serves the paginated law-firm list.
request_url <- paste0(
  Host_url,
  "/zjlawyermanager/view/lawyers/LawyerOfficePageList/execute/lawofficeList.do"
)
# HTTP headers sent with every request. Origin is the site root and Referer
# is the list endpoint itself.
request_headers <- c(
  Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
  "Accept-Language" = "zh-CN,zh;q=0.9",
  Connection = "keep-alive",
  Origin = Host_url,
  Referer = request_url,
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
)

# Form body; pageNo selects the page of the list to request.
payload <- c(pageNo = 1)
# Collector for curl debug information.
d <- debugGatherer()
# Shared curl handle: routes debug output to the collector, follows
# redirects, and enables cookie handling through the cookiefile option.
chandle <- getCurlHandle(
  debugfunction = d$update,
  followlocation = TRUE,
  cookiefile = "",
  verbose = TRUE
)
# Issue one POST up front so the cookies are stored on the handle; the
# response body itself is not needed.
invisible(
  postForm(
    request_url,
    httpheader = request_headers,
    .params = payload,
    style = "POST",
    curl = chandle,
    .encoding = "utf-8"
  )
)

# Crawl the paginated law-firm list and collect, for every link found, the
# absolute URL of the detail page (href) and the link text (href_name).
# NOTE(review): the page count is hard-coded; confirm it still matches the
# site's pagination before running.
firm_page_count <- 100
# One slot per page; filled slots are combined once after the loop instead
# of growing a data frame with rbind() on every iteration.
firm_pages <- vector("list", firm_page_count)
for (i in seq_len(firm_page_count)) {
  payload["pageNo"] <- i
  tmp <- postForm(
    request_url,
    httpheader = request_headers,
    .params = payload,
    style = "POST",
    curl = chandle,
    .encoding = "utf-8"
  )
  rdhtml <- htmlParse(tmp)
  rm(tmp)
  # The XPath leaves out the tbody nodes.
  rd <- getNodeSet(rdhtml, "/html/body/table[4]/tr/td/table/tr/td[2]/table[3]/tr[1]/td[2]/table[2]/tr//a[@href]")
  # Skip pages without links: paste0(Host_url, NULL) would return a single
  # bogus URL and the resulting one-column frame would break rbind().
  if (length(rd) > 0) {
    firm_pages[[i]] <- data.frame(
      # href attribute of each link, made absolute
      href = paste0(Host_url, unlist(lapply(rd, xmlGetAttr, "href"))),
      # text content of each link
      href_name = unlist(lapply(rd, xmlValue)),
      stringsAsFactors = FALSE
    )
  } else {
    message("No links found on law-firm list page ", i)
  }
  rm(rd)
  # Pause between requests, as the lawyer crawl loops in this script do.
  Sys.sleep(runif(1, 0.5, 1.5))
}
# Starting from an empty data frame keeps the result a data.frame even when
# no page produced any link.
href_huizong <- do.call(
  rbind,
  c(list(data.frame()), Filter(Negate(is.null), firm_pages))
)
rm(firm_page_count, firm_pages)

### Crawl the detail page of every law firm
# For each link in href_huizong, fetch the page, read the detail table into
# two columns (CLASS, NAME) and tag every row with the firm's link text
# (HREF_NAME). The result is law_firm_huizong.
# Per-firm results are stored in a list and combined once after the loop
# instead of growing a data frame with rbind() on every iteration.
firm_details <- vector("list", length(href_huizong$href))
for (j in seq_along(href_huizong$href)) {
  tmp <- getURL(
    href_huizong$href[j],
    httpheader = request_headers,
    curl = chandle,
    .encoding = "utf-8"
  )
  rdhtml <- htmlParse(tmp)
  rm(tmp)
  detail_nodes <- getNodeSet(rdhtml, "/html/body/table[4]/tr/td/table/tr/td[2]/table[3]")
  # Guard against pages without the expected table: indexing an empty node
  # set would abort the whole crawl.
  if (length(detail_nodes) > 0) {
    rd <- readHTMLTable(
      detail_nodes[[1]],
      header = c("CLASS", "NAME"),
      stringsAsFactors = FALSE
    )
    # A zero-row table cannot be combined with the scalar HREF_NAME.
    if (is.data.frame(rd) && nrow(rd) > 0) {
      firm_details[[j]] <- data.frame(
        HREF_NAME = href_huizong$href_name[j],
        CLASS = rd$CLASS,
        NAME = rd$NAME,
        stringsAsFactors = FALSE
      )
    } else {
      message("Empty detail table at ", href_huizong$href[j])
    }
    rm(rd)
  } else {
    message("No detail table found at ", href_huizong$href[j])
  }
  rm(rdhtml, detail_nodes)
  # Pause between requests, as the lawyer crawl loops in this script do.
  Sys.sleep(runif(1, 0.5, 1.7))
}
# Starting from an empty data frame keeps the result a data.frame even when
# no detail page could be read.
law_firm_huizong <- do.call(
  rbind,
  c(list(data.frame()), Filter(Negate(is.null), firm_details))
)
rm(firm_details)

### Clean up special characters
# Replace every pair of non-breaking spaces in NAME with the separator "、".
law_firm_huizong$NAME <- gsub("\u00A0\u00A0", "、", law_firm_huizong$NAME)
# Then strip all remaining whitespace characters.
law_firm_huizong$NAME <- gsub("[[:space:]]", "", law_firm_huizong$NAME)
# Attach the detail-page URL to each row by matching HREF_NAME against the
# link text in href_huizong; rows without a match are kept.
# NOTE(review): the join key is the link text, so identical names would
# multiply rows; confirm the names are unique.
law_firm_huizong <- merge(
  law_firm_huizong,
  href_huizong,
  by.x = "HREF_NAME",
  by.y = "href_name",
  all.x = TRUE
)
### Write the result to local storage
# qmethod = "double" doubles embedded quotes (the CSV convention) rather
# than backslash-escaping them, which is the write.table() default.
write.table(
  law_firm_huizong,
  "law_firm_info.csv",
  append = FALSE,
  col.names = TRUE,
  row.names = FALSE,
  sep = ",",
  qmethod = "double"
)



# Crawl the lawyer list
# Point the shared request settings at the lawyer list endpoint; the Referer
# header is set to the same address.
request_url <- "http://lsgl.zjsft.gov.cn/zjlawyermanager/view/lawyers/LawyerPageList/execute/lawyersList.do"
request_headers["Referer"] <- request_url
payload <- c(pageNo = 1)
# NOTE(review): the page count is hard-coded; confirm it still matches the
# site's pagination before running.
lawyer_page_count <- 1346
# One slot per page; filled slots are combined once after the loop instead
# of growing a data frame with rbind() on every iteration.
lawyer_pages <- vector("list", lawyer_page_count)
for (i in seq_len(lawyer_page_count)) {
  payload["pageNo"] <- i
  tmp <- postForm(
    request_url,
    httpheader = request_headers,
    .params = payload,
    style = "POST",
    curl = chandle,
    .encoding = "utf-8"
  )
  rdhtml <- htmlParse(tmp)
  rm(tmp)
  rd <- getNodeSet(rdhtml, "/html/body/table[4]/tr/td/table/tr/td[2]/table[3]/tr[1]/td[2]/table[2]/tr/td[1]/a[@href]")
  # Skip pages without links: paste0(Host_url, NULL) would return a single
  # bogus URL and the resulting one-column frame would break rbind().
  if (length(rd) > 0) {
    lawyer_pages[[i]] <- data.frame(
      # href attribute of each link, made absolute
      href = paste0(Host_url, unlist(lapply(rd, xmlGetAttr, "href"))),
      # text content of each link
      href_name = unlist(lapply(rd, xmlValue)),
      stringsAsFactors = FALSE
    )
  } else {
    message("No links found on lawyer list page ", i)
  }
  rm(rd, rdhtml)
  # Random pause between requests.
  Sys.sleep(runif(1, 0.5, 1.5))
}
# Starting from an empty data frame keeps the result a data.frame even when
# no page produced any link.
lawyer_huizong <- do.call(
  rbind,
  c(list(data.frame()), Filter(Negate(is.null), lawyer_pages))
)
rm(lawyer_page_count, lawyer_pages)

# Crawl the detail page of every lawyer
# For each link in lawyer_huizong, fetch the page, read the detail table
# into two columns (CLASS, NAME) and tag every row with the link text
# (HREF_NAME). The combined result is written to lawer_info.csv.
# Per-lawyer results are stored in a list and combined once after the loop
# instead of growing a data frame with rbind() on every iteration.
lawyer_details <- vector("list", length(lawyer_huizong$href))
for (j in seq_along(lawyer_huizong$href)) {
  tmp <- getURL(
    lawyer_huizong$href[j],
    httpheader = request_headers,
    curl = chandle,
    .encoding = "utf-8"
  )
  rdhtml <- htmlParse(tmp)
  rm(tmp)
  detail_nodes <- getNodeSet(rdhtml, "/html/body/table[4]/tr/td/table/tr/td[2]/table[3]")
  # Guard against pages without the expected table: indexing an empty node
  # set would abort the whole crawl.
  if (length(detail_nodes) > 0) {
    rd <- readHTMLTable(
      detail_nodes[[1]],
      header = c("CLASS", "NAME"),
      stringsAsFactors = FALSE
    )
    # A zero-row table cannot be combined with the scalar HREF_NAME.
    if (is.data.frame(rd) && nrow(rd) > 0) {
      lawyer_details[[j]] <- data.frame(
        HREF_NAME = lawyer_huizong$href_name[j],
        CLASS = rd$CLASS,
        NAME = rd$NAME,
        stringsAsFactors = FALSE
      )
    } else {
      message("Empty detail table at ", lawyer_huizong$href[j])
    }
    rm(rd)
  } else {
    message("No detail table found at ", lawyer_huizong$href[j])
  }
  rm(rdhtml, detail_nodes)
  # Random pause between requests.
  Sys.sleep(runif(1, 0.5, 1.7))
}
# Starting from an empty data frame keeps the result a data.frame even when
# no detail page could be read.
lawer_huizong <- do.call(
  rbind,
  c(list(data.frame()), Filter(Negate(is.null), lawyer_details))
)
rm(lawyer_details)
# qmethod = "double" doubles embedded quotes (the CSV convention) rather
# than backslash-escaping them, which is the write.table() default.
write.table(
  lawer_huizong,
  "lawer_info.csv",
  append = FALSE,
  col.names = TRUE,
  row.names = FALSE,
  sep = ",",
  qmethod = "double"
)

猜你喜欢

转载自blog.csdn.net/qq_38984677/article/details/81461376
今日推荐