有搜索条件根据url抓取网页数据(java爬取网页数据)

最近有一个任务：抓取如下图所示的网页数据，要求获取前一天的数据，逐页抓取后存入数据库。
如果只是抓取当前页的数据，没有查询条件和分页，就比较简单；但这里要选取前一天的数据，还要处理分页数据。
一开始的思路就想错了（开始想的是触发查询按钮和翻页按钮），导致任务一度没有进展，后来在技术经理的协助下才搞定。
话不多说，直接贴出代码。
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Properties;

import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;



/**
	 * Scheduled job that scrapes the previous day's monitoring records for every
	 * configured company and inserts one row per record.
	 *
	 * <p>Companies are read from the {@code value} key of the classpath resource
	 * {@code config/syqy.properties}, formatted as {@code name,url;name,url;...}.
	 * For each company the first request is used only to read the page count;
	 * every page is then requested again by index and its table rows are stored.
	 *
	 * <p>A company whose first request fails, a malformed company entry, or a
	 * single page that cannot be fetched is skipped so that the remaining data
	 * is still collected.
	 *
	 * @throws ClientProtocolException declared by the HTTP helper methods
	 * @throws IOException if the company list resource is missing or unreadable
	 * @throws ParseException if a detection time is not in {@code yyyy-MM-dd HH:mm:ss} format
	 */
	@Scheduled(cron = "0 0 03 * * ?") // runs every day at 03:00
	// @Scheduled(cron="0/10 * *  * * ? ")  // for testing: every 10 seconds
	// @Scheduled(cron="0 */10 * * * ?") // for testing: every 10 minutes
	@Transactional
	public void getNotice() throws ClientProtocolException, IOException, ParseException {
		// Yesterday's date, formatted the way the remote query form expects it.
		Calendar calendar = Calendar.getInstance();
		calendar.add(Calendar.DAY_OF_MONTH, -1);
		String queryDate = new SimpleDateFormat("yyyy-MM-dd").format(calendar.getTime());

		// Load the company list; the reader uses UTF-8 so Chinese company names are decoded correctly.
		Properties properties = new Properties();
		try (InputStream in = WryVoluntarilyMonitorService.class.getClassLoader()
				.getResourceAsStream("config/syqy.properties")) {
			if (in == null) {
				throw new IOException("Classpath resource config/syqy.properties not found");
			}
			properties.load(new InputStreamReader(in, "utf-8"));
		}
		String property = properties.getProperty("value");
		if (property == null) {
			throw new IllegalStateException("Key 'value' is missing from config/syqy.properties");
		}

		String[] companies = property.split(";");
		System.out.println(companies.length);
		for (String company : companies) {
			// Each entry is "companyName,companyUrl".
			String[] nameAndUrl = company.split(",");
			if (nameAndUrl.length < 2) {
				System.err.println("Skipping malformed company entry: " + company);
				continue;
			}
			String companyName = nameAndUrl[0];
			String companyUrl = nameAndUrl[1];

			String firstPage = getHtmlByUrl(companyUrl, queryDate);
			if (firstPage == null || firstPage.isEmpty()) {
				continue;
			}

			// The fifth <input> of the result page is read as the number of pages for the selected date.
			String pageCountValue = Jsoup.parse(firstPage).select("input").get(4).attr("value");
			int pageCount = Integer.parseInt(pageCountValue);

			for (int page = 1; page <= pageCount; page++) {
				String pageHtml = getHtmlByUrlData(companyUrl, queryDate, page);
				if (pageHtml == null) {
					// The helper returns null on any HTTP failure; do not feed that to the parser.
					System.err.println("Skipping page " + page + " of " + companyUrl + ": no HTML returned");
					continue;
				}
				Elements rows = Jsoup.parse(pageHtml).select(".tb_ls >tbody >tr");
				// Start at 1: the first row is the table header.
				for (int i = 1; i < rows.size(); i++) {
					Elements cells = rows.get(i).select(">td");
					// An empty result set is rendered as a single placeholder row.
					if (cells.get(0).text().equals("暂无数据！")) {
						continue;
					}
					super.insert(toMonitorRecord(companyName, cells));
				}
			}
		}
		System.out.println("执行成功");
	}

	/**
	 * Maps the cells of one result-table row onto a new entity.
	 *
	 * @param companyName company the row belongs to
	 * @param cells the direct {@code td} children of the row; index 0 is the sequence number
	 * @return a populated entity with a freshly generated primary key
	 * @throws ParseException if the detection time cell cannot be parsed
	 */
	private WryVoluntarilyMonitor toMonitorRecord(String companyName, Elements cells) throws ParseException {
		WryVoluntarilyMonitor record = new WryVoluntarilyMonitor();
		record.setPkid(keyGenerator.getNext());
		record.setCompanyName(companyName);
		record.setDetectionPoint(cells.get(1).text());
		record.setDetectionTime(StringToDate(cells.get(2).text()));
		record.setDetectionProject(cells.get(3).text());
		record.setDetectionResult(cells.get(4).text());
		record.setStandardLimitingValue(cells.get(5).text());
		record.setUnit(cells.get(6).text());
		record.setIsStandards(cells.get(7).text());
		record.setExceedingMultiple(cells.get(8).text());
		// Evaluation criterion and emission destination are truncated in the visible
		// cell text, so the full value is read from the title attribute instead.
		record.setEvaluationCriterion(cells.get(9).getElementsByTag("td").attr("title"));
		record.setEmissionsTo(cells.get(10).getElementsByTag("td").attr("title"));
		record.setEmissionsWay(cells.get(11).text());
		record.setRemarks(cells.get(12).text());
		return record;
	}

	/**
	 * Converts a timestamp string into a {@link Date}.
	 *
	 * @param times timestamp text in {@code yyyy-MM-dd HH:mm:ss} format
	 * @return the parsed date, interpreted in the JVM's default time zone
	 * @throws ParseException if {@code times} does not start with a timestamp in that format
	 */
	public Date StringToDate(String times) throws ParseException {
		final String timestampPattern = "yyyy-MM-dd HH:mm:ss";
		// A fresh formatter per call: SimpleDateFormat instances must not be shared between threads.
		final SimpleDateFormat parser = new SimpleDateFormat(timestampPattern);
		final Date parsed = parser.parse(times);
		return parsed;
	}
	
	/**
	 * 根据URL和时间获得所有的html信息
	 * 
	 * @param url
	 * @return
	 * @throws IOException
	 * @throws ClientProtocolException
	 */

	public static String getHtmlByUrl(String url,String date) throws ClientProtocolException, IOException{
        String html = null;
        //创建httpClient对象
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response=null;
        try {
        	//以get方式请求该URL
            //HttpGet httpget = new HttpGet(url);
            HttpPost httppost = new HttpPost(url);
            String query="startTime="+date;
            StringEntity stringEntity = new StringEntity(query,"UTF-8");
            stringEntity.setContentType("application/x-www-form-urlencoded");
            httppost.setEntity(stringEntity);
            //CloseableHttpResponse response = httpClient.execute(httpget);
            response = httpClient.execute(httppost);
            //得到responce对象
            //HttpResponse responce = httpClient.execute(httpget);
            //返回码
            int resStatu = response.getStatusLine().getStatusCode();
            if (resStatu==HttpStatus.SC_OK) {//200正常  其他就不对
                //获得输入流
                InputStream entity = response.getEntity().getContent();
                if (entity!=null) {
                    //通过输入流转为字符串获得html源代码  注：可以获得实体，然后通过 EntityUtils.toString方法获得html
                	//但是有可能出现乱码，因此在这里采用了这种方式
                    html=getStreamString(entity);
                    // System.out.println(html);
                }
            }
        } catch (Exception e) {
            //System.out.println("访问【"+url+"】出现异常!");
            e.printStackTrace();
        } finally {
            //httpClient.getConnectionManager().shutdown();
            //response.close();
            try {
				httpClient.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
        }
        return html;
    }

	/**
	 * Sends a form POST carrying the query date and page index to {@code url}
	 * and returns the response body as text.
	 *
	 * <p>The body is decoded by {@code getStreamString} rather than
	 * {@code EntityUtils.toString}, which the original author found could yield
	 * garbled characters for this site.
	 *
	 * @param url address of the page to query
	 * @param date value sent as the {@code startTime} form field
	 * @param page value sent as the {@code pageIndex} form field
	 * @return the response body, or {@code null} when the status is not 200, the
	 *         response has no body, or any error occurs while requesting or reading
	 * @throws ClientProtocolException kept in the signature for callers; failures
	 *         inside the request are reported by returning {@code null}
	 * @throws IOException kept in the signature for callers; see above
	 */
	public static String getHtmlByUrlData(String url,String date,Integer page) throws ClientProtocolException, IOException{
		String html = null;
		// Both the client and the response are closed automatically, releasing the connection.
		try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
			HttpPost httppost = new HttpPost(url);
			StringEntity form = new StringEntity("startTime=" + date + "&pageIndex=" + page, "UTF-8");
			form.setContentType("application/x-www-form-urlencoded");
			httppost.setEntity(form);

			try (CloseableHttpResponse response = httpClient.execute(httppost)) {
				// Only a 200 response is treated as a usable page.
				if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK
						&& response.getEntity() != null) {
					InputStream content = response.getEntity().getContent();
					if (content != null) {
						html = getStreamString(content);
					}
				}
			}
		} catch (Exception e) {
			// Best effort: callers treat null as "no data for this request".
			e.printStackTrace();
		}
		return html;
	}
	
	
	/**
	 * Reads an input stream to the end and returns its content as a string.
	 *
	 * <p>The bytes are decoded as UTF-8. Every line terminator is normalised to
	 * {@code \n}, and a {@code \n} is appended after the last line as well. The
	 * stream is not closed here; it remains the caller's responsibility.
	 *
	 * @param tInputStream stream to read; may be {@code null}
	 * @return the decoded text, or {@code null} if the stream is {@code null} or
	 *         reading fails
	 */
	public static String getStreamString(InputStream tInputStream) {
		if (tInputStream == null) {
			return null;
		}
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(tInputStream, "utf-8"));
			// StringBuilder: the buffer never leaves this method, so no synchronisation is needed.
			StringBuilder text = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				text.append(line).append('\n');
			}
			return text.toString();
		} catch (IOException ex) {
			// Best effort: callers treat null as "nothing could be read".
			ex.printStackTrace();
			return null;
		}
	}
循环
代码截图：为什么从 1 开始循环？因为第一行是表格的表头。
页面检查截图
读取title
贴出页面检查图：startTime 是选择的时间，pageIndex 是当前页码。
有搜索条件根据url抓取网页数据(java爬取网页数据)

猜你喜欢