快速入门爬虫开发jsoup

爬虫简介

1：爬虫是什么？

Robot、Crawler 是一段模拟人工浏览网页的代码

2：爬虫能干什么？

抓取互联网上的信息

3：爬虫分类

广义（通用）爬虫和定向爬虫

4：爬虫的技术发展

Socket 第一代：效率高，代码繁重

HttpURLConnection 第二代：OA项目后台跨域经常用

（前台跨域则使用 ajax + jsonp）

HttpClient 第三代

Jsoup 第四代

5：定向爬虫的实现

5.1主要抓取信息的类别

5.2抓取json信息

5.3抓取html文档

5.4信息提取

document文档

head

body

title

按id查找

获取标签的内容及属性值

获取a标签

获取a标签及href属性值

按标签及类属性查找

图片下载FileUtils

内容替换

爬虫开发

下载 jar 包 jsoup、json-simple、commons-io，放入项目中

工具类：

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;

/**
 * Convenience wrappers around json-simple's {@code JSONValue.parse(String)} that
 * return the parsed value already narrowed to the expected JSON container type.
 *
 * <p>Both methods report every failure the same way, by returning {@code null},
 * instead of mixing {@code null}, {@link NullPointerException} and
 * {@link ClassCastException} depending on what went wrong.
 */
public class MyUtil {

    /**
     * Parses the given text as a JSON object.
     *
     * @param str JSON text; may be {@code null}
     * @return the parsed object, or {@code null} when {@code str} is {@code null},
     *         when the parser yields no value, or when the top-level value is not
     *         a JSON object (for example an array or a number)
     */
    public static JSONObject parseToJSONObject(String str) {
        Object parsed = parse(str);
        return parsed instanceof JSONObject ? (JSONObject) parsed : null;
    }

    /**
     * Parses the given text as a JSON array.
     *
     * @param str JSON text; may be {@code null}
     * @return the parsed array, or {@code null} when {@code str} is {@code null},
     *         when the parser yields no value, or when the top-level value is not
     *         a JSON array
     */
    public static JSONArray parseToJSONArray(String str) {
        Object parsed = parse(str);
        return parsed instanceof JSONArray ? (JSONArray) parsed : null;
    }

    /**
     * Delegates to the parser, short-circuiting {@code null} input so the parser
     * is never handed a {@code null} string.
     */
    private static Object parse(String str) {
        return str == null ? null : JSONValue.parse(str);
    }

}

测试代码：

package jsoupDemo;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.commons.io.FileUtils;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import base.MyUtil;

/**
 * Small collection of jsoup demos: fetching a JSON endpoint, fetching an HTML
 * document, extracting elements from it (head, title, by id, by tag, by tag and
 * CSS classes) and downloading every image referenced by a page.
 *
 * <p>Every helper performs its own network request through
 * {@link #getDocument(String, HashMap)}; nothing is cached between calls.
 */
public class JsoupDemo {

    /** Connect/read timeout, in milliseconds, applied to every request made by this class. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Request parameters most recently built by {@link #getJsonObject()}; {@code null} before its first call. */
    static HashMap<String, String> param = null;

    /** Default address used by {@link #getJsonObject()} and {@link #main(String[])}. */
    static String url = "http://easy.escen.cn/";

    /** Directory that downloaded images are written to: {@code <working directory>/image}. */
    static String path = System.getProperty("user.dir") + "/image";

    static {
        // Create the download directory up front; only report success when the
        // directory was actually created by this call.
        File f = new File(path);
        if (!f.exists() && f.mkdirs()) {
            System.out.println("你姐夫的文件夹创建好的");
        }
    }

    /**
     * Requests {@link #url} with the query parameter {@code num=n}, parses the
     * response body as a JSON array and prints {@code name===age} for every JSON
     * object in it to {@code System.err}.
     *
     * @throws IllegalStateException if the response body is not a JSON array
     * @throws Exception             if the request itself fails
     */
    public static void getJsonObject() throws Exception {
        param = new HashMap<String, String>();
        param.put("num", "n");

        // jsoup rejects non-HTML/XML content types (such as application/json)
        // unless it is explicitly told to ignore the content type.
        Connection conn = Jsoup.connect(url).timeout(TIMEOUT_MILLIS).ignoreContentType(true);
        conn.data(param);
        Response res = conn.execute();

        JSONArray people = MyUtil.parseToJSONArray(res.body());
        if (people == null) {
            throw new IllegalStateException("Response from " + url + " is not a JSON array");
        }

        // The array elements are already parsed values, so there is no need to
        // serialise each one back to text and parse it a second time.
        for (Object item : people) {
            if (!(item instanceof JSONObject)) {
                continue; // only JSON objects carry name/age
            }
            JSONObject person = (JSONObject) item;
            System.err.println(person.get("name") + "===" + person.get("age"));
        }
    }

    /**
     * Fetches and parses the document at the given address.
     *
     * @param url   address to fetch
     * @param param form data; when {@code null} the page is requested with GET,
     *              otherwise the data is sent with POST
     * @return the parsed document
     * @throws Exception if the request fails
     */
    public static Document getDocument(String url, HashMap<String, String> param) throws Exception {
        Connection conn = Jsoup.connect(url).timeout(TIMEOUT_MILLIS);
        if (param == null) {
            return conn.get();
        }
        conn.data(param);
        return conn.post();
    }

    /**
     * Returns the {@code head} element of the fetched document.
     *
     * @param url   address to fetch
     * @param param form data, see {@link #getDocument(String, HashMap)}
     * @throws Exception if the request fails
     */
    public static Element getHead(String url, HashMap<String, String> param) throws Exception {
        return getDocument(url, param).head();
    }

    /**
     * Returns the title text of the fetched document.
     *
     * @param url   address to fetch
     * @param param form data, see {@link #getDocument(String, HashMap)}
     * @throws Exception if the request fails
     */
    public static String getTitle(String url, HashMap<String, String> param) throws Exception {
        return getDocument(url, param).title();
    }

    /**
     * Looks up an element by its {@code id} attribute.
     *
     * @param url   address to fetch
     * @param param form data, see {@link #getDocument(String, HashMap)}
     * @param id    id to look for
     * @return the matching element, or {@code null} when the document has none
     * @throws Exception if the request fails
     */
    public static Element getById(String url, HashMap<String, String> param, String id) throws Exception {
        return getDocument(url, param).getElementById(id);
    }

    /**
     * Returns the text content of the element with the given {@code id}.
     *
     * @param url   address to fetch
     * @param param form data, see {@link #getDocument(String, HashMap)}
     * @param id    id to look for
     * @return the element's text, or {@code null} when no element has that id
     * @throws Exception if the request fails
     */
    public static String getById_innerContent(String url, HashMap<String, String> param, String id)
            throws Exception {
        Element match = getDocument(url, param).getElementById(id);
        return match == null ? null : match.text();
    }

    /**
     * Returns every element with the given tag name (for example {@code a}).
     *
     * @param url   address to fetch
     * @param param form data, see {@link #getDocument(String, HashMap)}
     * @param tag   tag name to search for
     * @throws Exception if the request fails
     */
    public static Elements getByTag(String url, HashMap<String, String> param, String tag) throws Exception {
        return getDocument(url, param).getElementsByTag(tag);
    }

    /**
     * Prints the given attribute (for example {@code href}) of every element with
     * the given tag name, then returns the value jsoup reports for the whole
     * match set.
     *
     * <p>Note that {@code attr} comes in two forms: with one argument it reads the
     * attribute value, with two arguments it replaces the value.
     *
     * @param url   address to fetch
     * @param param form data, see {@link #getDocument(String, HashMap)}
     * @param tag   tag name to search for
     * @param attr  attribute name to read
     * @return the result of {@code Elements.attr(attr)} on the matched elements
     * @throws Exception if the request fails
     */
    public static String getByTag2(String url, HashMap<String, String> param, String tag, String attr)
            throws Exception {
        Elements matches = getDocument(url, param).getElementsByTag(tag);
        for (Element match : matches) {
            System.out.println(match.attr(attr));
        }
        // Reuse the elements that were already collected instead of searching the document again.
        return matches.attr(attr);
    }

    /**
     * Selects elements by tag name, optionally narrowed to those carrying all of
     * the given CSS classes (selector {@code tag.class1.class2...}).
     *
     * @param url     address to fetch
     * @param param   form data, see {@link #getDocument(String, HashMap)}
     * @param tagName tag name; when {@code null} nothing is selected
     * @param classes class names every match must carry; {@code null} selects by
     *                tag name only, and {@code null} or empty entries are skipped
     * @return the matching elements, or {@code null} when {@code tagName} is {@code null}
     * @throws Exception if the request fails
     */
    public static Elements getByTagAndClass(String url, HashMap<String, String> param, String tagName,
            String[] classes) throws Exception {
        Document doc = getDocument(url, param);
        if (tagName == null) {
            return null;
        }

        StringBuilder selector = new StringBuilder(tagName);
        if (classes != null) {
            for (String cssClass : classes) {
                // An empty or null entry would produce an invalid selector such as "div.".
                if (cssClass != null && cssClass.length() > 0) {
                    selector.append('.').append(cssClass);
                }
            }
        }
        return doc.select(selector.toString());
    }

    /**
     * Downloads every image referenced by an {@code img} element of the fetched
     * page into {@link #path}, printing each image address before downloading it.
     *
     * <p>Images whose resolved address is not an http(s) URL (missing {@code src},
     * {@code data:} URIs, ...) are skipped because they cannot be requested.
     *
     * @param url   address of the page to scan
     * @param param form data, see {@link #getDocument(String, HashMap)}
     * @throws Exception if the page or one of the images cannot be fetched or written
     */
    public static void getImg(String url, HashMap<String, String> param) throws Exception {
        Elements images = getDocument(url, param).select("img");
        for (Element image : images) {
            // "abs:src" resolves a relative src against the page address.
            String imgUrl = image.attr("abs:src");
            if (!isHttpUrl(imgUrl)) {
                continue;
            }
            System.out.println(imgUrl);
            downLoadImg(path, imgUrl);
        }
    }

    /**
     * Downloads a single image and stores it in the given directory under the
     * last path segment of its address.
     *
     * @param path   directory to write the file into
     * @param imgUrl absolute http(s) address of the image
     * @throws Exception if the download or the write fails
     */
    public static void downLoadImg(String path, String imgUrl) throws Exception {
        Response response = Jsoup.connect(imgUrl)
                .timeout(TIMEOUT_MILLIS)
                .ignoreContentType(true) // images are not HTML; accept any content type
                .maxBodySize(0)          // 0 = unlimited, so large images are not cut off at the default cap
                .execute();
        // File(parent, child) uses the platform's separator instead of a hard-coded "\\".
        File target = new File(path, fileNameFromUrl(imgUrl));
        FileUtils.writeByteArrayToFile(target, response.bodyAsBytes());
    }

    /** Tells whether the text starts with the {@code http://} or {@code https://} scheme (case-insensitive). */
    private static boolean isHttpUrl(String candidate) {
        return candidate != null
                && (candidate.regionMatches(true, 0, "http://", 0, 7)
                        || candidate.regionMatches(true, 0, "https://", 0, 8));
    }

    /**
     * Derives a file name from the last path segment of an address, ignoring any
     * query string or fragment; falls back to a hash-based name when that segment is empty.
     */
    private static String fileNameFromUrl(String imgUrl) {
        String withoutSuffix = imgUrl;
        int query = withoutSuffix.indexOf('?');
        if (query >= 0) {
            withoutSuffix = withoutSuffix.substring(0, query);
        }
        int fragment = withoutSuffix.indexOf('#');
        if (fragment >= 0) {
            withoutSuffix = withoutSuffix.substring(0, fragment);
        }

        String name = withoutSuffix.substring(withoutSuffix.lastIndexOf('/') + 1);
        if (name.length() == 0) {
            name = "image_" + Integer.toHexString(imgUrl.hashCode());
        }
        return name;
    }

    /**
     * Runs the image-download demo against {@link #url}.
     *
     * <p>Other demos can be tried the same way, for example
     * {@code getJsonObject()}, {@code getDocument(url, null)},
     * {@code getHead(url, null)}, {@code getTitle(url, null)} or
     * {@code getById(url, null, "container")}.
     */
    public static void main(String[] args) throws Exception {
        getImg(url, null);
    }
}