版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/panhaigang123/article/details/83686927
最近有一个任务:抓取如下图所示的网页数据。需要筛选出前一天的数据,逐页翻页抓取,并存入数据库。
如果只是抓取当前页的数据,没有查询条件和翻页,那就比较简单了;但这里既要选取前一天的数据,又要处理分页数据。
一开始的思路就想错了(最初想的是模拟触发查询按钮和翻页按钮),导致任务一度没有进展;后来在技术经理的协助下,改为直接向页面 POST startTime(日期)和 pageIndex(页码)参数,才得以搞定。
话不多说 直接贴出代码
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Properties;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
/**
 * Scheduled job that scrapes the previous day's monitoring records for every
 * company listed in {@code config/syqy.properties} and inserts them one row at a time.
 *
 * <p>The {@code value} property is read as {@code name,url} pairs separated by
 * {@code ;}. For each company the first request is used only to read the page
 * count (the {@code value} attribute of the fifth {@code <input>} on the page);
 * each page is then fetched separately and every data row of the
 * {@code .tb_ls} table is mapped onto a {@code WryVoluntarilyMonitor}.
 *
 * <p>Malformed config entries, pages that could not be fetched and rows that do
 * not have the expected 13 cells are skipped with a message on {@code System.err}
 * instead of aborting the whole run.
 *
 * @throws ClientProtocolException declared by the HTTP helpers this method calls
 * @throws IOException if the properties resource is missing or cannot be read
 * @throws ParseException if a row's detection time is not in {@code yyyy-MM-dd HH:mm:ss} form
 * @throws IllegalStateException if the {@code value} property is absent
 */
@Scheduled(cron = "0 0 03 * * ?") // run every day at 03:00
//@Scheduled(cron="0/10 * * * * ? ") // testing: run every 10 seconds
//@Scheduled(cron="0 */10 * * * ?") // testing: run every 10 minutes
@Transactional
public void getNotice() throws ClientProtocolException, IOException, ParseException {
    // The query date is "yesterday"; Calendar.getInstance() is already set to now.
    Calendar calendar = Calendar.getInstance();
    calendar.add(Calendar.DAY_OF_MONTH, -1);
    String queryDate = new SimpleDateFormat("yyyy-MM-dd").format(calendar.getTime());

    // Load the company list; the stream is closed as soon as it has been read.
    Properties properties = new Properties();
    try (InputStream in = WryVoluntarilyMonitorService.class.getClassLoader()
            .getResourceAsStream("config/syqy.properties")) {
        if (in == null) {
            throw new IOException("Classpath resource config/syqy.properties not found");
        }
        // Read as UTF-8 so that Chinese company names are not garbled.
        properties.load(new InputStreamReader(in, "utf-8"));
    }
    String property = properties.getProperty("value");
    if (property == null) {
        throw new IllegalStateException("Property 'value' is missing from config/syqy.properties");
    }

    String[] companies = property.split(";");
    System.out.println(companies.length);
    for (String company : companies) {
        // nameAndUrl[0] is the company name, nameAndUrl[1] is the company URL.
        String[] nameAndUrl = company.split(",");
        if (nameAndUrl.length < 2) {
            System.err.println("Skipping malformed company entry: " + company);
            continue;
        }
        String companyName = nameAndUrl[0];
        String companyUrl = nameAndUrl[1];

        String html = getHtmlByUrl(companyUrl, queryDate);
        if (html == null || "".equals(html)) {
            continue;
        }

        // The fifth <input> on the page carries the number of pages for the selected date.
        Elements inputs = Jsoup.parse(html).select("input");
        if (inputs.size() <= 4) {
            System.err.println("No page-count input found for " + companyUrl);
            continue;
        }
        int pageCount;
        try {
            pageCount = Integer.parseInt(inputs.get(4).attr("value"));
        } catch (NumberFormatException e) {
            System.err.println("Unreadable page count for " + companyUrl + ": " + e.getMessage());
            continue;
        }

        // Fetch each page and write its rows to the database.
        for (int page = 1; page <= pageCount; page++) {
            String pageHtml = getHtmlByUrlData(companyUrl, queryDate, page);
            if (pageHtml == null) {
                System.err.println("Could not fetch page " + page + " of " + companyUrl);
                continue;
            }
            Elements rows = Jsoup.parse(pageHtml).select(".tb_ls >tbody >tr");
            // Start at 1: the first row is the table header.
            for (int i = 1; i < rows.size(); i++) {
                Elements cells = rows.get(i).select(">td");
                // Skip the "no data" placeholder row and any row without all 13 columns.
                if (cells.size() < 13 || "暂无数据!".equals(cells.get(0).text())) {
                    continue;
                }
                WryVoluntarilyMonitor monitor = new WryVoluntarilyMonitor();
                monitor.setPkid(keyGenerator.getNext());
                monitor.setCompanyName(companyName);
                monitor.setDetectionPoint(cells.get(1).text());
                monitor.setDetectionTime(StringToDate(cells.get(2).text()));
                monitor.setDetectionProject(cells.get(3).text());
                monitor.setDetectionResult(cells.get(4).text());
                monitor.setStandardLimitingValue(cells.get(5).text());
                monitor.setUnit(cells.get(6).text());
                monitor.setIsStandards(cells.get(7).text());
                monitor.setExceedingMultiple(cells.get(8).text());
                // Columns 9 and 10 are read from the title attribute because the
                // visible cell text is too long to be captured completely.
                monitor.setEvaluationCriterion(cells.get(9).getElementsByTag("td").attr("title"));
                monitor.setEmissionsTo(cells.get(10).getElementsByTag("td").attr("title"));
                monitor.setEmissionsWay(cells.get(11).text());
                monitor.setRemarks(cells.get(12).text());
                super.insert(monitor);
            }
        }
    }
    System.out.println("执行成功");
}
/**
 * Converts a timestamp string into a {@link Date}.
 *
 * @param times text in {@code yyyy-MM-dd HH:mm:ss} form
 * @return the parsed date
 * @throws ParseException if {@code times} does not match the pattern
 */
public Date StringToDate(String times) throws ParseException {
    // A fresh formatter per call: SimpleDateFormat instances must not be shared across threads.
    return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(times);
}
/**
 * Posts {@code startTime=<date>} as a URL-encoded form body to {@code url} and
 * returns the response body as text.
 *
 * <p>This is a best-effort call: any failure while building or executing the
 * request is printed to standard error and reported to the caller as
 * {@code null}. Both the HTTP client and the response are closed before returning.
 *
 * @param url  address the form is posted to
 * @param date value sent as the {@code startTime} form field
 * @return the response body, or {@code null} when the status is not 200, the
 *         response has no body, or the request failed
 * @throws ClientProtocolException kept for signature compatibility; failures are caught internally
 * @throws IOException kept for signature compatibility; failures are caught internally
 */
public static String getHtmlByUrl(String url,String date) throws ClientProtocolException, IOException{
    String html = null;
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        HttpPost httppost = new HttpPost(url);
        StringEntity stringEntity = new StringEntity("startTime=" + date, "UTF-8");
        stringEntity.setContentType("application/x-www-form-urlencoded");
        httppost.setEntity(stringEntity);

        // Closing the response releases the underlying connection.
        try (CloseableHttpResponse response = httpClient.execute(httppost)) {
            // Only a 200 response is treated as success.
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK
                    && response.getEntity() != null) {
                InputStream content = response.getEntity().getContent();
                if (content != null) {
                    // The body is decoded by hand rather than with EntityUtils.toString,
                    // which the original author found could produce garbled text.
                    html = getStreamString(content);
                }
            }
        }
    } catch (Exception e) {
        // Deliberately best-effort: report the failure and let the caller see null.
        e.printStackTrace();
    }
    return html;
}
/**
 * Posts {@code startTime=<date>&pageIndex=<page>} as a URL-encoded form body to
 * {@code url} and returns the response body as text.
 *
 * <p>This is a best-effort call: any failure while building or executing the
 * request is printed to standard error and reported to the caller as
 * {@code null}. Both the HTTP client and the response are closed before returning.
 *
 * @param url  address the form is posted to
 * @param date value sent as the {@code startTime} form field
 * @param page value sent as the {@code pageIndex} form field
 * @return the response body, or {@code null} when the status is not 200, the
 *         response has no body, or the request failed
 * @throws ClientProtocolException kept for signature compatibility; failures are caught internally
 * @throws IOException kept for signature compatibility; failures are caught internally
 */
public static String getHtmlByUrlData(String url,String date,Integer page) throws ClientProtocolException, IOException{
    String html = null;
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        HttpPost httppost = new HttpPost(url);
        StringEntity stringEntity = new StringEntity("startTime=" + date + "&pageIndex=" + page, "UTF-8");
        stringEntity.setContentType("application/x-www-form-urlencoded");
        httppost.setEntity(stringEntity);

        // Closing the response releases the underlying connection.
        try (CloseableHttpResponse response = httpClient.execute(httppost)) {
            // Only a 200 response is treated as success.
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK
                    && response.getEntity() != null) {
                InputStream content = response.getEntity().getContent();
                if (content != null) {
                    // The body is decoded by hand rather than with EntityUtils.toString,
                    // which the original author found could produce garbled text.
                    html = getStreamString(content);
                }
            }
        }
    } catch (Exception e) {
        // Deliberately best-effort: report the failure and let the caller see null.
        e.printStackTrace();
    }
    return html;
}
/**
 * Reads an input stream to the end as UTF-8 text and returns it as a string.
 *
 * <p>The text is read line by line and every line is terminated with a single
 * {@code '\n'}, so {@code \r\n} line endings are normalised and the result of a
 * non-empty stream always ends with a newline. The stream is closed once it
 * has been consumed.
 *
 * @param tInputStream stream to read; may be {@code null}
 * @return the decoded text, or {@code null} when {@code tInputStream} is
 *         {@code null} or reading fails (the failure is printed to standard error)
 */
public static String getStreamString(InputStream tInputStream) {
    if (tInputStream == null) {
        return null;
    }
    String text = null;
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(tInputStream, "utf-8"))) {
        StringBuilder buffer = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            buffer.append(line).append('\n');
        }
        // Assigned before the reader is closed so a failing close() cannot discard the text.
        text = buffer.toString();
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return text;
}
循环
代码截图:为什么循环从 1 开始?因为第一行(下标 0)是表格的表头。
页面检查截图
读取title
贴出检查页面图:startTime 为选择的时间,pageIndex 为当前页码。