简单的爬虫：使用 jsoup 抓取豆瓣最受欢迎的影评

依赖的 jar 包（Maven 依赖；下文示例代码只直接用到 jsoup）

<!-- jsoup HTML parser library @ http://jsoup.org/ -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

        <!-- Apache HttpComponents HttpClient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.1.2</version>
        </dependency>
package com.iteye.injavawetrust.jsoup;  

/**
 * Constants used to build the page URLs of Douban's most popular movie reviews.
 *
 * <p>This class only holds constants, so it is final and cannot be instantiated.
 *
 * @author InJavaWeTrust
 */
public final class Constants {

    /**
     * URL of Douban's most popular movie reviews listing.
     */
    public static final String URL = "http://movie.douban.com/review/best/";

    /**
     * Number of records shown per page.
     */
    public static final int NUM = 10;

    /**
     * Query-string fragment appended to {@link #URL} for paging; the record
     * offset of the requested page follows it.
     */
    public static final String START = "?start=";

    /**
     * Not instantiable: every member is a static constant.
     */
    private Constants() {
    }

}

主要的代码

package com.iteye.injavawetrust.jsoup;  

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Iterator;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches Douban's most popular movie reviews with jsoup and appends each
 * movie title together with its rating text to a local text file.
 *
 * <p>Singleton: use {@link #getInstance()} to obtain the shared instance.
 *
 * @author InJavaWeTrust
 */
public class JsoupUtil {

    /** File that {@link #writeFile(String, String)} appends to. */
    private static final String OUTPUT_PATH = "e:\\a\\a.txt";

    /**
     * Encoding of the output file. Fixed explicitly so that Chinese titles are
     * written the same way regardless of the platform default charset.
     */
    private static final String OUTPUT_CHARSET = "UTF-8";

    /** Opening title mark; the movie name is taken from this character onwards. */
    private static final String TITLE_MARK = "《";

    /** Number of listing pages fetched by {@link #getDoubanReview()}. */
    private static final int PAGE_COUNT = 5;

    private static final JsoupUtil instance = new JsoupUtil();

    private JsoupUtil() {
    }

    /**
     * Returns the shared instance.
     *
     * @return the single {@code JsoupUtil} instance
     */
    public static JsoupUtil getInstance() {
        return instance;
    }

    /**
     * Appends one line of the form {@code star + "   " + name} to the output file.
     *
     * <p>I/O failures are reported with a stack trace and otherwise ignored, so
     * one failed write does not stop the caller.
     *
     * @param name movie name
     * @param star rating text of the review
     */
    public void writeFile(String name, String star) {
        File file = new File(OUTPUT_PATH);
        // Create the target directory on first use; without it every write fails.
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        Writer writer = null;
        try {
            // Append mode, explicit charset instead of FileWriter's platform default.
            writer = new OutputStreamWriter(new FileOutputStream(file, true), OUTPUT_CHARSET);
            writer.write(star + "   " + name + "\r\n");
            writer.flush();
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both.
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Fetches the first {@code PAGE_COUNT} pages of the review listing, prints
     * every movie name with its rating text and appends the pair to the output
     * file.
     *
     * <p>Entries without any link text are skipped. An {@link IOException} while
     * fetching a page is printed and ends the crawl; later pages are not fetched.
     */
    public void getDoubanReview() {
        try {
            // Only pages 1-5 are fetched.
            for (int page = 0; page < PAGE_COUNT; page++) {
                String url = Constants.URL + Constants.START + (page * Constants.NUM);
                System.out.println(url);
                Connection connection = Jsoup.connect(url);
                Document document = connection.get();
                for (Element list : document.select("ul")) {
                    for (Element item : list.select("li.clst*")) {
                        Elements info = item.select("span.pl*");
                        String name = extractTitle(info.select("a[href]").text());
                        if (name.isEmpty()) {
                            // No link text in this entry: nothing to record.
                            continue;
                        }
                        String star = info.select("span[title]").attr("title");
                        System.out.println(name + " " + star);
                        writeFile(name, star);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the part of {@code linkText} starting at the opening title mark,
     * or the whole text when the mark is absent (previously this case threw
     * {@link StringIndexOutOfBoundsException} and aborted the crawl).
     */
    private static String extractTitle(String linkText) {
        int start = linkText.indexOf(TITLE_MARK);
        return start >= 0 ? linkText.substring(start) : linkText;
    }

}

main方法

package com.iteye.injavawetrust.jsoup;  

/** 
 *  
 * @author InJavaWeTrust 
 * 
 */  
public class DouBanReview {  

    public static void main(String[] args) {  
        JsoupUtil ju = JsoupUtil.getInstance();  
        ju.getDoubanReview();  
    }  

}  

猜你喜欢

转载自blog.csdn.net/v2020877/article/details/82458679