Jsoup解析网页html文档

什么是Jsoup?

Jsoup是一个用于解析HTML的Java库,其作用类似于XML解析器之于XML。 它能够解析真实世界中的HTML(包括不规范的标记),并生成合理的文档树。 它的选择器语法与jQuery选择器非常相似,灵活且易于使用,可以方便地获得所需的结果。

有哪些功能?

  • 查找和提取数据,使用DOM遍历或CSS选择器
  • 操纵HTML元素,属性和文本
  • 从URL,文件或字符串中抓取并解析HTML
  • 输出整洁的HTML
  • 根据安全的白名单清理用户提交的内容,以防止XSS攻击

准备工作

  1. 导入jar
    jar包下载地址
    或者导入maven依赖方式:

    <dependency>
        <!-- jsoup HTML parser library @ http://jsoup.org/ -->
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    
  2. 测试方法
    jsoup有三种获取文档的方式,下面的测试类中已经全部给出:

package com.zsx;

import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Shows the three ways jsoup can obtain a {@link Document}: from a URL,
 * from a file on disk, and from an in-memory string.
 *
 * Only the string variant is active; the other two are kept as
 * commented-out samples.
 */
public class JsoupTest {

	public static void main(String[] args) {

		// ---- Option 1: fetch the document from a URL ----
		/*
		try {
			Document fromUrl = Jsoup.connect("http://www.gzmssy.cn").get();
			System.out.println(fromUrl.title()); // document title
		} catch (IOException e) {
			e.printStackTrace();
		}
		*/

		// ---- Option 2: load the document from an HTML file ----
		/*
		try {
			Document fromFile = Jsoup.parse(new File("C:/Users/mssy/Desktop/index.html"), "utf-8");
			System.out.println(fromFile.title()); // document title
		} catch (IOException e) {
			e.printStackTrace();
		}
		*/

		// ---- Option 3: parse the document from a String ----
		final String markup = "<html><head><title>Jsoup 标题</title></head>"
				+ "<body><p>Parsed HTML into a doc.</p></body></html>";
		final Document fromString = Jsoup.parse(markup);

		// Print the document title.
		System.out.println(fromString.title());
	}
}

案例

  1. 获取HTML页面的fav图标
package com.zsx;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Prints the location of a page's favicon.
 *
 * Looks first for a {@code <link>} in the head whose href matches
 * {@code .*\.(ico|png)}; if none exists, falls back to a
 * {@code <meta itemprop="image">} element. Prints "Not found" when neither
 * is present or the page cannot be fetched.
 */
public class JsoupFav {

	public static void main(String[] args) {
		String iconLocation = "Not found";

		try {
			Element head = Jsoup.connect("http://www.baidu.com").get().head();

			Element iconLink = head.select("link[href~=.*\\.(ico|png)]").first();
			if (iconLink != null) {
				// Preferred source: an icon <link> element.
				iconLocation = iconLink.attr("href");
			} else {
				// Fallback: an image advertised through meta itemprop.
				Element imageMeta = head.select("meta[itemprop=image]").first();
				if (imageMeta != null) {
					iconLocation = imageMeta.attr("content");
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

		System.out.println(iconLocation);
	}
}

  2. 获得所有a链接
package com.zsx;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches a page and prints the href and visible text of every anchor
 * element that carries an href attribute.
 */
public class JsoupA {

	public static void main(String[] args) {
		final String pageUrl = "http://www.baidu.com";

		try {
			Elements anchors = Jsoup.connect(pageUrl).get().select("a[href]");
			for (int i = 0; i < anchors.size(); i++) {
				Element anchor = anchors.get(i);
				System.out.println("link : " + anchor.attr("href"));
				System.out.println("text : " + anchor.text());
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

  3. 获得所有图片信息
package com.zsx;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches a page and prints src, height, width and alt for every
 * {@code <img>} whose src matches a png/jpg/jpeg/gif extension
 * (case-insensitive), followed by the total number of images found.
 */
public class JsoupImage {

	public static void main(String[] args) {
		String url = "https://www.baidu.com";

		try {
			Document document = Jsoup.connect(url).get();
			Elements images = document.select("img[src~=(?i)\\.(png|jpe?g|gif)]");
			int count = 0;
			for (Element element : images) {
				// Read attributes from the current element. Calling attr() on the
				// Elements collection would return the first match's value on
				// every iteration.
				System.out.println("src : " + element.attr("src"));
				System.out.println("height : " + element.attr("height"));
				System.out.println("width : " + element.attr("width"));
				System.out.println("alt : " + element.attr("alt"));
				count++;
			}
			System.out.println("总张数  :" + count + " 张 ");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

  4. 获得表单内容
package com.zsx;

import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.w3c.dom.stylesheets.LinkStyle;

/**
 * Jsoup获取表单元素
 * 
 * @author mssy
 *
 */
/**
 * Parses a local HTML file and prints the name and value of every
 * {@code <input>} inside the form whose id is "loginForm".
 */
public class JsoupFormElement {

	public static void main(String[] args) {
		File html = new File("C:/Users/mssy/Desktop/jsoupFrom.html");
		try {
			Document document = Jsoup.parse(html, "utf-8");
			Element formElement = document.getElementById("loginForm");
			// getElementById returns null when no element has that id; bail out
			// with a message instead of hitting a NullPointerException below.
			if (formElement == null) {
				System.err.println("No element with id 'loginForm' found in " + html);
				return;
			}

			Elements inputElements = formElement.getElementsByTag("input");
			for (Element element : inputElements) {
				String key = element.attr("name");
				String value = element.attr("value");
				System.out.println("Param name: " + key + "\nParam value: " + value);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

jsoupFrom.html

<!DOCTYPE html>
<html lang="en">
 <head>
 <meta charset="utf-8">
 
 <!-- Always force latest IE rendering engine (even in intranet) & Chrome Frame
 Remove this if you use the .htaccess -->
 <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
 
 <title>jsoup Test</title>
 <meta name="description" content="">
 <meta name="author" content="Administrator">
 
 <!-- Viewport properties are comma-separated -->
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
 
 <!-- Replace favicon.ico & apple-touch-icon.png in the root of your domain and delete these references -->
 <link rel="shortcut icon" href="/favicon.ico">
 <link rel="apple-touch-icon" href="/apple-touch-icon.png">
 </head>
 
 <body>
 <center>
  <!-- Login form read by the JsoupFormElement example via id "loginForm" -->
  <form id="loginForm" action="" method="">
	用户名: <input type="text" name = "username" value="zhangsan"/>
	密&nbsp;码:<input type="password" name = "password" value="123456"/>
	<input name ="sub" type="submit" value="提交"/>
  </form>
 </center>
 </body>
</html>
  5. 消除不信任的HTML(以防止XSS)
package com.zsx;

import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;

/**
 * Sanitises an untrusted HTML fragment with jsoup's basic whitelist and
 * prints the cleaned markup.
 */
public class JsoupXSS {

	public static void main(String[] args) {
		// Untrusted input: an anchor carrying an inline onclick handler.
		final String untrusted = "<p><a href='http://www.baidu.com/' onclick='sendCookiesToMe()'>Link</a></p>";

		final Whitelist allowed = Whitelist.basic();
		System.out.println(Jsoup.clean(untrusted, allowed));
	}
}

Jsoup文档地址

猜你喜欢

转载自blog.csdn.net/weixin_43760328/article/details/84794039
今日推荐