依赖的jar包(在 Maven 的 pom.xml 中添加以下依赖)
<!-- Maven dependency declarations for the example below. -->
<!-- jsoup: the Java code in this listing imports org.jsoup.* to fetch and query HTML pages. -->
<dependency>
<!-- jsoup HTML parser library @ http://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!-- Apache HttpClient. -->
<!-- NOTE(review): none of the Java classes in this listing import org.apache.http; confirm this dependency is actually required. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.1.2</version>
</dependency>
package com.iteye.injavawetrust.jsoup;
/**
 * Constants shared by the Douban review scraper.
 *
 * <p>This class only holds static values and cannot be instantiated or extended.
 *
 * @author InJavaWeTrust
 */
public final class Constants {

    /** URL of Douban's "most popular reviews" listing. */
    public static final String URL = "http://movie.douban.com/review/best/";

    /** Number of records shown per page. */
    public static final int NUM = 10;

    /** Query-string prefix that is followed by the paging offset. */
    public static final String START = "?start=";

    /** Prevents instantiation; every member of this class is static. */
    private Constants() {
        throw new AssertionError("Constants is not instantiable");
    }
}
主要的代码
package com.iteye.injavawetrust.jsoup;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches the Douban "most popular reviews" listing pages with jsoup and
 * appends one line per matched entry to a local text file.
 *
 * <p>Singleton: obtain the shared instance through {@link #getInstance()}.
 *
 * @author InJavaWeTrust
 */
public class JsoupUtil {

    /** Opening bracket that marks where the movie title starts inside the link text. */
    private static final String TITLE_MARKER = "《";

    /** File that {@link #writeFile(String, String)} appends to. */
    private static final String OUTPUT_PATH = "e:\\a\\a.txt";

    /** Number of listing pages requested by {@link #getDoubanReview()}. */
    private static final int PAGE_COUNT = 5;

    private static final JsoupUtil instance = new JsoupUtil();

    private JsoupUtil() {
    }

    /**
     * Returns the shared instance.
     *
     * @return the single {@code JsoupUtil} instance
     */
    public static JsoupUtil getInstance() {
        return instance;
    }

    /**
     * Appends one record, formatted as {@code star + " " + name + "\r\n"}, to the output file.
     *
     * <p>I/O failures are printed to standard error and otherwise ignored, so a
     * failed write does not stop the caller.
     *
     * @param name movie title
     * @param star value written in front of the title (presumably the rating
     *             label; the caller passes the {@code title} attribute of the entry)
     */
    public void writeFile(String name, String star) {
        // Append mode keeps records written by earlier calls. FileWriter encodes
        // with the platform default charset. The writer is closed (and thereby
        // flushed) automatically, even when write() fails.
        try (Writer writer = new FileWriter(new File(OUTPUT_PATH), true)) {
            writer.write(star + " " + name + "\r\n");
        } catch (IOException e) {
            // IOException also covers FileNotFoundException (file cannot be opened).
            e.printStackTrace();
        }
    }

    /**
     * Downloads the first {@value #PAGE_COUNT} listing pages and, for every
     * {@code li.clst*} entry found inside a {@code ul}, prints the title and the
     * {@code title} attribute of its {@code span[title]} element and appends
     * them to the output file.
     *
     * <p>Entries whose link has no text are skipped. When the link text does not
     * contain the {@code 《} marker the whole link text is used as the title.
     * An {@link IOException} while fetching a page is printed and ends the run.
     */
    public void getDoubanReview() {
        try {
            // Only the first PAGE_COUNT pages are fetched.
            for (int page = 0; page < PAGE_COUNT; page++) {
                String url = Constants.URL + Constants.START + (page * Constants.NUM);
                System.out.println(url);
                Document document = Jsoup.connect(url).get();
                for (Element list : document.select("ul")) {
                    for (Element item : list.select("li.clst*")) {
                        Elements info = item.select("span.pl*");
                        String linkText = info.select("a[href]").text();
                        if (linkText.isEmpty()) {
                            // No link text means there is no title to record.
                            continue;
                        }
                        String name = extractTitle(linkText);
                        String star = info.select("span[title]").attr("title");
                        System.out.println(name + " " + star);
                        writeFile(name, star);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the part of {@code linkText} starting at the first {@code 《}, or the
     * whole text when the marker is absent (avoids {@code substring(-1)}).
     */
    private static String extractTitle(String linkText) {
        int start = linkText.indexOf(TITLE_MARKER);
        return start >= 0 ? linkText.substring(start) : linkText;
    }
}
main方法
package com.iteye.injavawetrust.jsoup;
/**
*
* @author InJavaWeTrust
*
*/
public class DouBanReview {
public static void main(String[] args) {
JsoupUtil ju = JsoupUtil.getInstance();
ju.getDoubanReview();
}
}