Java实现MongoDB的操作,以Java爬虫爬取CSDN博客为例

一、下载驱动

  大家可以去MongoDB官网下载,我也整理了MongoDB的Java驱动包上传到了CSDN,点击下载

二、代码简介

  本爬虫是基于HttpClient+Jsoup框架编写,数据库采用MongoDB。功能是通过提供CSDN中博主的id名,将该博主的博文信息采集入库。大体过程是:先通过HttpClient访问网页,用Jsoup解析页面,将爬取到的博文地址(URL)存储在集合 blog 中并加上访问标志位;然后再从集合 blog 中获取没有被访问的URL,通过URL获取博文的详情并入库;最后将访问过的URL标记为已访问状态。

三、完整代码

1.MongoDBJDBC.java文件

package com.csdn.dao;

import java.util.ArrayList;
import java.util.List;

import org.bson.Document;

import com.mongodb.BasicDBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoClientOptions.Builder;
import com.mongodb.MongoException;
import com.mongodb.ServerAddress;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.result.UpdateResult;

/**
 * Thin convenience wrapper around a {@link MongoClient} offering simple
 * find / insert / update helpers addressed by database and collection name.
 *
 * <p>A shared instance bound to {@code 127.0.0.1:20001} is created when the
 * class is initialised and is available through {@link #getMongoDBJDBC()}.
 * The constructor is still public, so additional instances pointing at other
 * servers can be created as well.
 */
public class MongoDBJDBC {

    /** Underlying driver client; remains {@code null} only if construction failed. */
    private MongoClient mongoClient = null;

    /**
     * Creates a wrapper whose client targets the given MongoDB server.
     *
     * <p>A {@link MongoException} raised while creating the client is printed
     * and swallowed, in which case every later call on this instance fails
     * with a {@link NullPointerException}.
     *
     * @param ip   host name or IP address of the MongoDB server
     * @param port port of the MongoDB server
     */
    public MongoDBJDBC(String ip, int port) {
        ServerAddress serverAddress = new ServerAddress(ip, port);

        // Connection settings
        Builder builder = MongoClientOptions.builder()
                .connectTimeout(1000 * 60) // connect timeout: 60 seconds
                .maxWaitTime(1000 * 60 * 2) // max wait for a pooled connection: 120 seconds
                .connectionsPerHost(50); // at most 50 connections per host
        MongoClientOptions options = builder.build();

        try {
            mongoClient = new MongoClient(serverAddress, options);
        } catch (MongoException e) {
            e.printStackTrace();
        }
    }

    /** Shared instance, created eagerly during class initialisation. */
    private static final MongoDBJDBC mongoDBJDBC = new MongoDBJDBC(
            "127.0.0.1", 20001);

    /**
     * Returns the shared instance connected to {@code 127.0.0.1:20001}.
     *
     * @return the shared {@code MongoDBJDBC}
     */
    public static MongoDBJDBC getMongoDBJDBC() {
        return mongoDBJDBC;
    }

    /**
     * Finds documents whose fields equal the given key/value pairs.
     *
     * <p>{@code keys[i]} is matched against {@code values[i]}; all pairs must
     * match. Any exception raised while querying is printed and the documents
     * collected so far are returned.
     *
     * @param dbName         database name
     * @param collectionName collection name
     * @param keys           field names to match
     * @param values         expected values, parallel to {@code keys}
     * @param num            maximum number of documents to return, or
     *                       {@code -1} to return every match; {@code 0} and
     *                       other negative values yield an empty list
     * @return the matching documents; empty (never {@code null}) when nothing
     *         matches or when {@code keys}/{@code values} are {@code null} or
     *         differ in length
     */
    public List<Document> find(String dbName, String collectionName,
            String[] keys, Object[] values, int num) {

        List<Document> resultList = new ArrayList<Document>();

        // Malformed criteria: report "nothing found" rather than failing.
        if (keys == null || values == null || keys.length != values.length) {
            return resultList;
        }
        // Only -1 means "unlimited"; any other non-positive limit selects nothing.
        if (num != -1 && num <= 0) {
            return resultList;
        }

        MongoCursor<Document> mongoCursor = null;
        try {
            BasicDBObject queryObj = new BasicDBObject();
            for (int i = 0; i < keys.length; i++) {
                queryObj.put(keys[i], values[i]);
            }

            FindIterable<Document> matches =
                    getCollection(dbName, collectionName).find(queryObj);
            if (num != -1) {
                // Let the server cap the result instead of counting client-side.
                matches = matches.limit(num);
            }

            mongoCursor = matches.iterator();
            while (mongoCursor.hasNext()) {
                resultList.add(mongoCursor.next());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (mongoCursor != null) {
                mongoCursor.close();
            }
        }

        return resultList;
    }

    /**
     * Inserts the given documents into a collection.
     *
     * @param dbName         database name
     * @param collectionName collection name
     * @param documents      documents to insert
     * @return {@code false} when {@code documents} is {@code null} or empty
     *         (nothing is written), {@code true} after a successful insert
     */
    public Boolean inSert(String dbName, String collectionName, List<Document> documents) {
        if (documents == null || documents.isEmpty()) {
            return false;
        }
        getCollection(dbName, collectionName).insertMany(documents);
        return true;
    }

    /**
     * Applies a {@code $set} update to every document matching the filter.
     *
     * @param dbName         database name
     * @param collectionName collection name
     * @param whereDoc       filter selecting the documents to update
     * @param updateDoc      fields and values to set
     * @return {@code false} when either argument is {@code null} or empty
     *         (nothing is written), {@code true} once the update was issued
     */
    public Boolean update(String dbName, String collectionName,
            BasicDBObject whereDoc, BasicDBObject updateDoc) {

        // An empty filter would touch the whole collection and an empty $set
        // is meaningless, so both are refused.
        if (isBlank(whereDoc) || isBlank(updateDoc)) {
            return false;
        }

        UpdateResult updateManyResult = getCollection(dbName, collectionName)
                .updateMany(whereDoc, new Document("$set", updateDoc));

        System.out.println("更新成功:" + updateManyResult.getModifiedCount() + "个");

        return true;
    }

    /**
     * Applies a {@code $set} update to the first document matching the filter.
     *
     * @param dbName         database name
     * @param collectionName collection name
     * @param whereDoc       filter selecting the document to update
     * @param updateDoc      fields and values to set
     * @return {@code false} when either argument is {@code null} or empty
     *         (nothing is written), {@code true} once the update was issued
     */
    public Boolean updateOne(String dbName, String collectionName,
            BasicDBObject whereDoc, BasicDBObject updateDoc) {

        if (isBlank(whereDoc) || isBlank(updateDoc)) {
            return false;
        }

        UpdateResult updateOneResult = getCollection(dbName, collectionName)
                .updateOne(whereDoc, new Document("$set", updateDoc));

        System.out.println("更新成功:" + updateOneResult.getModifiedCount() + "个");

        return true;
    }

    /** Tells whether a filter/update document is missing or has no entries. */
    private static boolean isBlank(BasicDBObject doc) {
        return doc == null || doc.isEmpty();
    }

    /**
     * Returns the named collection of the named database.
     *
     * @param dbName         database name
     * @param collectionName collection name
     * @return the collection handle
     */
    public MongoCollection<Document> getCollection(String dbName,
            String collectionName) {
        return mongoClient.getDatabase(dbName).getCollection(collectionName);
    }

    /**
     * Returns the database with the given name.
     *
     * @param dbName database name
     * @return the database handle
     */
    public MongoDatabase getDatabase(String dbName) {
        return mongoClient.getDatabase(dbName);
    }

}

2.GetHttpResponse.java文件

package com.csdn.crawler;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;

/**
 * Issues HTTP GET requests with browser-like headers.
 *
 * <p>All requests go through one shared {@link HttpClient}, so connections
 * are pooled and reused instead of a new client (with its own connection
 * pool that was never closed) being built for every request.
 */
public class GetHttpResponse {

    /** Client shared by every request made through this class. */
    private static final HttpClient HTTP_CLIENT = HttpClients.createDefault();

    /** Request settings shared by every request made through this class. */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setConnectTimeout(5000)            // max time to establish the connection
            .setConnectionRequestTimeout(5000)  // max time to obtain a connection from the pool
            .setCookieSpec(CookieSpecs.IGNORE_COOKIES) // do not process cookies
            .build();

    /**
     * Executes a GET request for the given URL and returns the raw response.
     *
     * <p>The caller must consume the response entity (for example with
     * {@code EntityUtils.toString}) so that the underlying connection is
     * handed back to the shared client's pool.
     * NOTE(review): no socket read timeout is configured here, so a stalled
     * server can block the caller — confirm whether that is intended.
     *
     * @param url absolute URL to fetch
     * @return the HTTP response as returned by the client
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection problem or aborted request
     */
    public static HttpResponse getHttpClient(String url) throws ClientProtocolException, IOException {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(REQUEST_CONFIG);

        // Browser-like headers; the original author notes the page cannot be
        // fetched without them.
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");

        return HTTP_CLIENT.execute(httpGet);
    }
}

3.GetCSDNInfo.java文件

package com.csdn.crawler;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpResponse;
import org.apache.http.util.EntityUtils;
import org.bson.types.ObjectId;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.csdn.dao.MongoDBJDBC;
import com.mongodb.BasicDBObject;


/**
 * Crawls the blog of one CSDN user and stores the results in MongoDB.
 *
 * <p>List pages are read from
 * {@code https://blog.csdn.net/<userName>/article/list/<page>}. Post links go
 * into collection {@code blog} of database {@code crawler} with
 * {@code status = "0"}; {@link #getBlogInfo()} later visits every such link,
 * stores its details in {@code blogInfo} and flips the status to {@code "1"}.
 */
public class GetCSDNInfo {

    /** Consecutive failed list-page downloads tolerated before {@link #getUrl()} gives up. */
    private static final int MAX_PAGE_FAILURES = 3;

    public String userName = "qq_24598601";
    public static String url = "https://blog.csdn.net/";

    /**
     * Creates a crawler for the given blogger.
     *
     * @param url      base URL of the site; must equal the static {@link #url}
     *                 because the crawl always uses that value
     * @param userName CSDN user name of the blogger to crawl
     * @throws IllegalArgumentException if {@code userName} is {@code null} or
     *                                  empty, or {@code url} is {@code null}
     *                                  or differs from the base URL
     */
    public GetCSDNInfo(String url, String userName) {
        super();
        if (userName == null || "".equals(userName)) {
            throw new IllegalArgumentException("用户名不能为空");
        }
        if (url == null || !url.equals(GetCSDNInfo.url)) {
            throw new IllegalArgumentException("url不正确");
        }
        this.userName = userName;
    }


    /**
     * Walks the blogger's article-list pages and stores every post link found
     * in collection {@code blog} with {@code status = "0"}.
     *
     * <p>Paging stops at the first page without an {@code .article-list}
     * element, or after {@value #MAX_PAGE_FAILURES} consecutive download
     * failures of the same page.
     */
    public void getUrl() {

        System.out.println("**********开始获取" + userName + "发布的博文信息**********");

        int pageNum = 1;   // list page currently being fetched
        int count = 0;     // number of post links stored so far
        int failures = 0;  // consecutive download failures for the current page

        while (true) {
            String urlStr = url + userName + "/article/list/" + pageNum;

            try {
                HttpResponse httpResponse = GetHttpResponse.getHttpClient(urlStr);
                String html = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
                failures = 0;

                Document page = Jsoup.parse(html);

                // No article list on the page means we ran past the last page.
                if (page.select(".article-list").isEmpty()) {
                    System.out.println("已经没有博文了!");
                    break;
                }
                pageNum++;

                List<org.bson.Document> listD = new ArrayList<org.bson.Document>();
                for (Element link : page.select("h4  a")) {
                    listD.add(toBlogDocument(link));
                }

                Boolean flag = MongoDBJDBC.getMongoDBJDBC().inSert("crawler", "blog", listD);
                System.out.print("url=" + urlStr + "--");
                if (flag) {
                    count += listD.size();
                    System.out.println(listD.size() + "条存入数据库");
                } else {
                    System.out.println("存入数据库失败!");
                }

            } catch (IOException e) {
                e.printStackTrace();
                System.out.println("发生未知错误!");

                // The page number is unchanged here, so the same page is
                // retried; bound the retries so an outage cannot spin forever.
                failures++;
                if (failures >= MAX_PAGE_FAILURES) {
                    System.out.println("连续" + MAX_PAGE_FAILURES + "次获取页面失败,停止获取!");
                    break;
                }
            }
        }

        System.out.println("成功找到并存入数据库:" + count + "条博文");
        System.out.println("***********结束获取" + userName + "发布的博文信息*********");
    }

    /**
     * Builds the {@code blog} document for one post link.
     *
     * <p>The link text is split at its first space: the leading token is
     * stored under {@code "is"} (presumably the article-type marker shown
     * before the title — verify against the page markup) and the remainder
     * under {@code "title"}. Text without a space is stored entirely as the
     * title with an empty {@code "is"}.
     */
    private static org.bson.Document toBlogDocument(Element link) {
        String text = link.text();
        // Limit 2 keeps titles that themselves contain spaces intact.
        String[] parts = text.split(" ", 2);

        org.bson.Document doc = new org.bson.Document();
        if (parts.length == 2) {
            doc.put("is", parts[0]);
            doc.put("title", parts[1]);
        } else {
            doc.put("is", "");
            doc.put("title", text);
        }
        doc.put("url", link.attr("href"));
        doc.put("status", "0");
        return doc;
    }

    /**
     * Visits every stored post link that still has {@code status = "0"},
     * stores its details in collection {@code blogInfo} and marks the link as
     * visited.
     *
     * <p>Links are processed in batches. Processing stops when no unvisited
     * link is left, or when a whole batch could not be processed, since
     * fetching the same failing links again would never end.
     */
    public void getBlogInfo() {
        System.out.println("*******************开始获取博文信息******************");
        // status "0" marks links that have not been visited yet
        String[] key = {"status"};
        Object[] value = {"0"};

        List<org.bson.Document> documents = MongoDBJDBC.getMongoDBJDBC()
                    .find("crawler", "blog", key, value, 10);

        boolean stalled = false;
        while (documents.size() > 0) {
            int processed = 0;
            for (org.bson.Document document : documents) {
                if (fetchAndStoreBlogInfo(document)) {
                    processed++;
                }
            }

            // Failed links keep status "0" and come back in the next query;
            // without any progress the loop would repeat the same batch forever.
            if (processed == 0) {
                stalled = true;
                break;
            }

            documents = MongoDBJDBC.getMongoDBJDBC()
                    .find("crawler", "blog", key, value, 10);
        }

        if (stalled) {
            System.out.println("仍有节点未能成功访问,停止重试!");
        } else {
            System.out.println("所有节点都已经被访问!");
        }
        System.out.println("*******************结束获取博文信息******************");
    }

    /**
     * Downloads one post, stores its publish time and read count in
     * {@code blogInfo} and marks the source link as visited.
     *
     * @param document a {@code blog} document carrying {@code url} and {@code _id}
     * @return {@code true} only when the details were stored and the link was
     *         marked visited
     */
    private boolean fetchAndStoreBlogInfo(org.bson.Document document) {
        String postUrl = document.getString("url");
        ObjectId id = document.getObjectId("_id");

        if (postUrl == null || "".equals(postUrl)) {
            System.out.println("存入数据库失败!");
            return false;
        }

        try {
            HttpResponse httpResponse = GetHttpResponse.getHttpClient(postUrl);
            String html = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");

            Document docs = Jsoup.parse(html);
            Elements creatTime = docs.select(".time");       // publish time
            Elements readNum = docs.select(".read-count");   // read count

            // A page without the expected elements cannot be stored.
            if (creatTime.isEmpty() || readNum.isEmpty()) {
                System.out.println("存入数据库失败!");
                return false;
            }

            // The original code takes the part after the full-width colon;
            // fall back to the raw text when the separator is absent.
            String[] readParts = readNum.get(0).text().split(":");
            String readCount = readParts.length > 1 ? readParts[1] : readParts[0];

            org.bson.Document info = new org.bson.Document();
            info.put("pkid", id);
            info.put("creatTime", creatTime.get(0).text());
            info.put("readNum", readCount);

            List<org.bson.Document> listD = new ArrayList<org.bson.Document>();
            listD.add(info);

            Boolean flag = MongoDBJDBC.getMongoDBJDBC().inSert("crawler", "blogInfo", listD);
            if (!flag) {
                System.out.println("存入数据库失败!");
                return false;
            }

            // Mark the source link as visited.
            BasicDBObject whereDoc = new BasicDBObject();
            whereDoc.put("_id", id);

            BasicDBObject updateDoc = new BasicDBObject();
            updateDoc.put("status", "1");

            return MongoDBJDBC.getMongoDBJDBC().updateOne("crawler", "blog", whereDoc, updateDoc);

        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("发生未知错误!");
            return false;
        }
    }
}

4.测试文件GetInfoTest.java

package com.csdn.crawler.test;

import com.csdn.crawler.GetCSDNInfo;

/**
 * Manual driver that runs the CSDN crawler for the blogger
 * {@code qq_24598601}.
 */
public class GetInfoTest {

    /**
     * Runs ten rounds, each collecting the post links and then fetching the
     * details of the collected posts.
     *
     * NOTE(review): every round calls {@code getUrl()} again, which looks
     * like it stores the same links once per round — confirm that repeating
     * the link collection is intended.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        GetCSDNInfo crawler = new GetCSDNInfo("https://blog.csdn.net/", "qq_24598601");

        int round = 0;
        while (round < 10) {
            crawler.getUrl();
            crawler.getBlogInfo();
            round++;
        }
    }

}

四、结束语

  Java操作MongoDB数据库还有许多方法:MongoDB API Documentation for Java,这是MongoDB-Java的API。

猜你喜欢

转载自blog.csdn.net/qq_24598601/article/details/81508282