Crawling the latest bid information from the Sichuan Province public resources trading platform with a web crawler
Part I: Introduce the JSON dependency
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.4</version>
<classifier>jdk15</classifier>
</dependency>
Part II: Obtain a URLConnection for the requested URL
package com.svse.pachong;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;Import the org.apache.log4j.Logger;
/ **
* acquired by the connection request url URLConnection
* @author Lenovo
* @date dated 22 is 2019 years. 1 Day
* Description:
* /
public class open_url_test {
public static Logger Logger = Logger. the getLogger (open_url_test.class);
Boolean public OpenURL (String url_infor) throws Exception {
the URL = new new URL the URL (url_infor);
parent // connection classes, abstract
the URLConnection urlConnection url.openConnection = ();
// http connection class of
HttpURLConnection httpURLConnection = (HttpURLConnection) urlConnection;
/ * set the request method, the default is the GET (for attachment server repository must GET, POST will return if it is 405.Accessories migration process which must be POST, differentiated. ) * /
HttpURLConnection.setRequestMethod ( "GET");
// Set the character encoding httpURLConnection.setRequestProperty ( "Charset", "UTF-8");
// Open URL references to this resource communication link (if this has not been established connection).
code = HttpURLConnection int. getResponseCode ();
the System. OUT .println ( " code :" + code); // successful connection 200 is
the try {
the InputStream httpURLConnection.getInputStream inputStream = ();
the System. OUT .println ( " connection successful ") ;
Logger. info ( " open " + url_infor + " success! ");
return to true;
the catch (Exception Exception) {
. Logger info ( " open " + url_infor + " failed! ");
return to false;
}
}
}
Part III: Parse the desired data from the URL and return it in JSON format
package com.svse.pachong;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import net.sf.json.JSONException;
import net.sf.json.JSONObject;/ **
* url crawling through the resolution desired data, and returns json format
* @param urlString need a website url path crawling
* @return return json data results
* @throws IOException
* @throws JSONException
* /
class public readData {
public static the JSONObject readData (String the urlString) throws IOException, {JSONException
the InputStream IS = new new . the URL (the urlString) openStream ();
the try {
the BufferedReader RD = new new . the BufferedReader (new new the InputStreamReader (IS, the Charset the forName ( " UTF . 8 ")));
the StringBuilder SB = new new the StringBuilder ();
int cp;
while ((cp = rd.read()) != -1) {
sb.append((char) cp);
}
String jsonText = sb.toString();
JSONObject json = JSONObject.fromObject(jsonText);
return json;
} finally {
is.close();
}
}
}
Part IV: The crawler entry point
package com.svse.pachong;

import java.io.IOException;

import net.sf.json.JSONArray;
import net.sf.json.JSONException;
import net.sf.json.JSONObject;

/**
 * Entry point of the crawler: checks the endpoint is reachable, fetches the
 * announcement list, and prints the first few entries.
 *
 * @author lenovo
 * @date 2019-01-22
 */
public class Main {

    // NOTE(review): the published URL contained literal '×' characters, which are
    // the HTML entity '&times' rendered by the blog engine; reconstructed as the
    // original query parameters. Confirm against the live endpoint.
    static String urlString = "http://www.scggzy.gov.cn/Info/GetInfoListNew?keywords=&times=4&timeStart=&timeEnd=&province=&area=&businessType=&informationType=&industryType=&page=1&parm=1534929604640";

    /** Maximum number of entries to print. */
    private static final int MAX_ENTRIES = 10;

    public static void main(String[] args) {
        open_url_test oUrl = new open_url_test();
        try {
            if (oUrl.openurl(urlString)) {
                // readData is static — call it through the class instead of an
                // instance (removes the need for @SuppressWarnings("static-access")).
                JSONObject json = readData.readData(urlString);
                JSONObject ob = JSONObject.fromObject(json);
                String data = ob.get("data").toString(); // JSONObject -> String
                // The payload is a brace-wrapped list; re-wrap it as a JSON array.
                data = "[" + data.substring(1, data.length() - 1) + "]";
                JSONArray json2 = JSONArray.fromObject(data); // String -> JSONArray
                // Fix: bound the loop by the actual array size — the original
                // always iterated 10 times and threw on shorter pages.
                int limit = Math.min(MAX_ENTRIES, json2.size());
                for (int i = 0; i < limit; i++) {
                    JSONObject jsonObject = (JSONObject) json2.get(i);
                    System.out.println("--------------------------------------------");
                    System.out.println("项目: " + jsonObject.get("Title"));
                    System.out.println("时间: " + jsonObject.get("CreateDateStr"));
                    System.out.println(jsonObject.get("TableName"));
                    System.out.println(jsonObject.get("Link"));
                    System.out.println(jsonObject.get("province") + " "
                            + jsonObject.get("username") + " "
                            + jsonObject.get("businessType") + " "
                            + jsonObject.get("NoticeType"));
                }
            } else {
                System.out.println("解析数据失败!");
            }
        } catch (JSONException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            // openurl declares a bare Exception, so a broad catch is required here.
            e.printStackTrace();
        }
    }
}
Part V: Test results
At this point, the entire crawling task is complete!