基于jsoup的网页爬虫

前阵子做了个网页抓取工具，可扩展性较差。今天发现了一个开源的 HTML 解析工具 jsoup（由 Jonathan Hedley 开发的独立开源项目，并非 Google 出品），写了个测试，与大家分享一下。

package com.gump.net.html.test;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Demo of parsing a remote HTML page with jsoup.
 *
 * <p>Fetches a page over HTTP, prints its {@code <title>} element and every
 * {@code <a>} element found in the document, then reports the elapsed
 * wall-clock time in milliseconds.
 *
 * @author ganliang13
 * @see <a href="http://ganliang13.iteye.com/">author's blog</a>
 */
public class test {
	public static void main(String[] args) throws IOException {
		final long startMillis = System.currentTimeMillis();

		// Fetch and parse the full HTML document; 30-second connect/read timeout.
		Document document = Jsoup.connect("http://www.qzone.cc/Gexing/Qian/02/26263.html")
				.timeout(30000)
				.get();

		// Print the document's <title> element.
		System.out.println(document.getElementsByTag("title"));

		// Print every anchor (<a>) element in the document.
		Elements anchors = document.getElementsByTag("a");
		for (Element anchor : anchors) {
			System.out.println(anchor);
		}

		// Elapsed time in milliseconds since the fetch began.
		System.out.println(System.currentTimeMillis() - startMillis);
	}
}

猜你喜欢

转载自ganliang13.iteye.com/blog/1717146