【Elasticsearch学习之三】Elasticsearch 搜索引擎案例

环境
　　虚拟机：VMware 10
　　Linux版本：CentOS-6.5-x86_64
　　客户端：Xshell4
　　FTP：Xftp4
　　jdk8
　　elasticsearch-2.2.1

第一步：获取数据
主流搜索引擎，会使用爬虫，来获取网站的html数据，常用的工具有nutch，Python（主流），wget（c语言）
这里使用wget模拟
#安装wget
yum install wget
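#使用wget爬取网站数据，爬取日志写入/tmp/wget.log
#参数：
#-o 将日志信息写入指定的日志文件
#-P 指定下载文件保存的根目录（目录前缀）
#--no-parent 不向上追溯父目录
#--no-verbose 精简输出，只保留错误信息和基本信息
#-m 镜像模式，相当于 -r -N -l inf --no-remove-listing
#-D 指定允许爬取的域名列表，多个域名用逗号分隔
#-N 启用时间戳比较：只有远程文件比本地文件新时才重新下载
#--convert-links 下载完成后将文档中的链接转换为适合本地浏览的链接
#--random-wait 在请求之间随机等待一段时间，以防止被网站禁止访问
#-A 指定允许下载的文件后缀列表，多个后缀用逗号分隔
#最后指定爬取网站地址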

[cluster@PCS101 /] wget -o /tmp/wget.log -P /root/data --no-parent --no-verbose -m -D news.cctv.com -N --convert-links --random-wait -A html,HTML,shtml,SHTML http://news.cctv.com

第二步：ES集群安装分词器IK
注意：必须选择与ES版本对应的IK版本 https://github.com/medcl/elasticsearch-analysis-ik

#将ik目录直接ftp上传到目录下/opt/cluster/es/elasticsearch-2.2.1/plugins
#更改目录所有者
[cluster@PCS101 plugins]$ chown -R cluster:cluster ik
修改配置 plugin-descriptor.properties
elasticsearch.version=2.2.1

分发至102、103：
[cluster@PCS101 plugins]$ scp -r ik/ cluster@PCS102:`pwd`
[cluster@PCS101 plugins]$ scp -r ik/ cluster@PCS103:`pwd`

第三步：数据抽取：从网页中抽取数据
HtmlTool.java

package com.sxt.util;

import java.io.File;

import com.sxt.es.HtmlBean;
import com.sxt.es.IndexService;

import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;

public class HtmlTool {

    /**
     * Parses a crawled HTML file and converts it into an {@link HtmlBean}.
     *
     * <p>The bean's title is the decoded, whitespace-collapsed content of the
     * first {@code <title>} element; its content is the text produced by the
     * Jericho text extractor (with attribute inclusion enabled); its url is the
     * file path relative to {@link IndexService#DATA_DIR}, using {@code '/'} as
     * separator so that it can be turned into a web address later.
     *
     * @param path path of the HTML file on disk
     * @return the populated bean, or {@code null} when the document has no
     *         {@code <title>} element
     * @throws Throwable if the file cannot be read or parsed
     */
    public static HtmlBean parserHtml(String path)throws Throwable{
        Source source = new Source(new File(path));
        source.fullSequentialParse();

        Element titleElement = source.getFirstElement(HTMLElementName.TITLE);
        if (titleElement == null) {
            // Documents without a title are skipped by the caller.
            return null;
        }

        HtmlBean bean = new HtmlBean();
        bean.setTitle(CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent()));
        bean.setContent(source.getTextExtractor().setIncludeAttributes(true).toString());
        bean.setUrl(toRelativeUrl(path));
        return bean;
    }

    /**
     * Strips the data-directory prefix from a file path and normalises the
     * platform separator to {@code '/'}.
     *
     * <p>A path that does not start with {@link IndexService#DATA_DIR} is kept
     * whole instead of being cut at an arbitrary offset.
     */
    private static String toRelativeUrl(String path) {
        String relative = path.startsWith(IndexService.DATA_DIR)
                ? path.substring(IndexService.DATA_DIR.length())
                : path;
        // On Windows the path contains '\', which is not valid in a URL path.
        return relative.replace(File.separatorChar, '/');
    }

    /**
     * Manual smoke test: parses one sample file and prints its extracted text.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            HtmlBean bean = parserHtml("e:\\data\\news.cctv.com\\2017\\05\\01\\ARTI0k5MFLx2cvzQZffwQcUp170501.shtml");
            if (bean == null) {
                // parserHtml returns null for documents without a <title>.
                System.out.println("No <title> element found; nothing to print.");
            } else {
                System.out.println(bean.getContent());
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}

HtmlBean.java

package com.sxt.es;

/**
 * Simple mutable value holder describing one crawled HTML page.
 */
public class HtmlBean {

    /** Numeric identifier of the page. */
    private int id;

    /** Page title. */
    private String title;

    /** Text content of the page. */
    private String content;

    /** Address of the page. */
    private String url;

    /** @return the numeric identifier */
    public int getId() {
        return this.id;
    }

    /** @return the page title */
    public String getTitle() {
        return this.title;
    }

    /** @return the text content */
    public String getContent() {
        return this.content;
    }

    /** @return the page address */
    public String getUrl() {
        return this.url;
    }

    /** @param id the numeric identifier to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @param title the page title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param content the text content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @param url the page address to store */
    public void setUrl(String url) {
        this.url = url;
    }
}

第四步：把抽取出来的数据同ES建立索引
#创建索引库
IndexService.java::createIndex
#数据同ES建立索引
IndexService.java::addHtmlToES

package com.sxt.es;

import java.io.File;
import java.net.InetAddress;
import java.util.HashMap;
import java.util.Map;

import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsResponse;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.Requests;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryParser;
import org.elasticsearch.index.query.RangeQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.junit.Test;
import org.springframework.stereotype.Service;

import com.sxt.util.HtmlTool;

/**
 * Builds and queries the Elasticsearch index that backs the demo search page.
 *
 * <p>The class creates the index and its mapping ({@link #createIndex()}),
 * walks the crawled HTML files and indexes them ({@link #addHtmlToES()}), and
 * runs highlighted, paginated full-text queries ({@link #search}).
 */
@Service
public class IndexService {

    // Root directory that holds the crawled HTML files.
    public static String DATA_DIR="e:\\data\\";
    // Transport client shared by every method of this class.
    public static Client client;

    // Name of the index and of the document type stored in it.
    private static final String INDEX_NAME = "testes";
    private static final String TYPE_NAME = "htmlbean";
    // IK analyzer used both at index time and at search time.
    private static final String ANALYZER = "ik_max_word";

    static {
        // Name of the cluster to connect to.
        Settings settings = Settings.settingsBuilder().put("cluster.name", "wjy-es").build();
        try {
            // Create the client and register the three cluster nodes.
            client = TransportClient
                    .builder()
                    .settings(settings)
                    .build()
                    .addTransportAddress(
                            new InetSocketTransportAddress(InetAddress
                                    .getByName("134.32.123.101"), 9300))
                    .addTransportAddress(
                            new InetSocketTransportAddress(InetAddress
                                    .getByName("134.32.123.102"), 9300))
                    .addTransportAddress(
                            new InetSocketTransportAddress(InetAddress
                                    .getByName("134.32.123.103"), 9300));
        } catch (Exception e) {
            // NOTE(review): on failure 'client' stays null and every later call
            // fails with a NullPointerException; kept as best-effort on purpose.
            e.printStackTrace();
        }
    }

    /**
     * Creates the index from scratch and installs the mapping of its type.
     *
     * <p>An existing index with the same name is deleted first. The fields
     * {@code title}, {@code content} and {@code url} are all mapped as stored
     * strings analysed with the IK analyzer.
     *
     * @throws Exception if building the mapping or talking to the cluster fails
     */
    @Test
    public void createIndex() throws Exception {
        IndicesExistsResponse resp = client.admin().indices().prepareExists(INDEX_NAME).execute().actionGet();
        // Drop the old index, if any, so the mapping can be recreated.
        if (resp.isExists()) {
            client.admin().indices().prepareDelete(INDEX_NAME).execute().actionGet();
        }
        client.admin().indices().prepareCreate(INDEX_NAME).execute().actionGet();

        XContentBuilder builder = XContentFactory.jsonBuilder()
                .startObject()
                .startObject(TYPE_NAME)
                .startObject("properties");
        for (String field : new String[]{"title", "content", "url"}) {
            addAnalyzedStringField(builder, field);
        }
        builder.endObject()
                .endObject()
                .endObject();

        // Install the mapping for the type.
        PutMappingRequest mapping = Requests.putMappingRequest(INDEX_NAME).type(TYPE_NAME).source(builder);
        client.admin().indices().putMapping(mapping).actionGet();
    }

    /** Appends the mapping of one stored, IK-analysed string field to the builder. */
    private static XContentBuilder addAnalyzedStringField(XContentBuilder builder, String name) throws Exception {
        return builder.startObject(name)
                .field("type", "string")
                .field("store", "yes")
                .field("analyzer", ANALYZER)
                .field("search_analyzer", ANALYZER)
                .endObject();
    }

    /**
     * Indexes every HTML file found under {@link #DATA_DIR}.
     */
    @Test
    public void addHtmlToES(){
        readHtml(new File(DATA_DIR));
    }

    /**
     * Recursively walks {@code file}; every regular file is parsed with
     * {@link HtmlTool#parserHtml(String)} and, when a bean is returned, indexed.
     *
     * <p>A failure on one file is reported and does not stop the walk.
     *
     * @param file directory to descend into, or file to index
     */
    public void readHtml(File file){
        if (file.isDirectory()) {
            File[] children = file.listFiles();
            if (children == null) {
                // listFiles() returns null when the directory cannot be read.
                System.err.println("Cannot list directory: " + file.getPath());
                return;
            }
            for (File child : children) {
                readHtml(child);
            }
            return;
        }

        try {
            HtmlBean bean = HtmlTool.parserHtml(file.getPath());
            if (bean != null) {
                Map<String, String> dataMap = new HashMap<String, String>();
                dataMap.put("title", bean.getTitle());
                dataMap.put("content", bean.getContent());
                dataMap.put("url", bean.getUrl());
                // Write the document to the index.
                client.prepareIndex(INDEX_NAME, TYPE_NAME).setSource(dataMap).execute().actionGet();
            }
        } catch (Throwable e) {
            // Say which file failed; the stack trace alone does not.
            System.err.println("Failed to index " + file.getPath());
            e.printStackTrace();
        }
    }

    /**
     * Runs a full-text query against {@code title} and {@code content} and
     * returns one page of highlighted results.
     *
     * @param kw    keywords to search for
     * @param num   1-based page number; non-positive values fall back to page 1
     * @param count total hit count supplied by the caller for pages after the
     *              first; it is replaced by the count of this query's response
     * @return the page of results together with pagination data
     */
    public PageBean<HtmlBean> search(String kw,int num,int count){
        PageBean<HtmlBean> wr = new PageBean<HtmlBean>();
        wr.setIndex(num);
        if (wr.getIndex() != 1) {
            // Seed the pagination state with the caller's count, as before.
            wr.setTotalCount(count);
        }

        MultiMatchQueryBuilder q = new MultiMatchQueryBuilder(kw, new String[]{"title","content"});
        // getStartRow() is 0 on the first page, so one request covers all pages.
        SearchResponse resp = client.prepareSearch(INDEX_NAME)
                .setTypes(TYPE_NAME)
                .setQuery(q)
                .addHighlightedField("title")
                .addHighlightedField("content")
                .setHighlighterPreTags("<font color=\"red\">")
                .setHighlighterPostTags("</font>")
                .setHighlighterFragmentSize(40) // length of one highlighted fragment
                .setHighlighterNumOfFragments(5) // maximum fragments per hit
                .setFrom(wr.getStartRow())
                .setSize(wr.getSize())
                .execute()
                .actionGet();

        SearchHits hits = resp.getHits();
        wr.setTotalCount((int) hits.getTotalHits());

        for (SearchHit hit : hits.getHits()) {
            HtmlBean bean = new HtmlBean();
            bean.setTitle(titleOf(hit));
            bean.setContent(contentOf(hit));
            bean.setUrl("http://" + storedText(hit, "url"));
            wr.setBean(bean);
        }
        return wr;
    }

    /** Returns the first highlighted title fragment, or the stored title when the title did not match. */
    private static String titleOf(SearchHit hit) {
        if (hit.getHighlightFields().get("title") == null) {
            return storedText(hit, "title");
        }
        return hit.getHighlightFields().get("title").getFragments()[0].toString();
    }

    /** Returns the highlighted content fragments, each followed by "...", or the stored content when the content did not match. */
    private static String contentOf(SearchHit hit) {
        if (hit.getHighlightFields().get("content") == null) {
            return storedText(hit, "content");
        }
        StringBuilder sb = new StringBuilder();
        for (Text text : hit.getHighlightFields().get("content").getFragments()) {
            sb.append(text.toString()).append("...");
        }
        return sb.toString();
    }

    /** Reads a field from the hit's source, returning an empty string when the field is absent. */
    private static String storedText(SearchHit hit, String field) {
        Object value = hit.getSource().get(field);
        return value == null ? "" : value.toString();
    }
}

第五步：搜索数据

IndexService.java::search 见上面代码
PageBean.java

package com.sxt.es;

import java.util.ArrayList;
import java.util.List;


/**
 * Pagination state plus the items of the current page.
 *
 * <p>Page numbers are 1-based. Setting the total record count recomputes the
 * total page count and the window of page numbers to display.
 *
 * @param <T> type of the items shown on a page
 */
public class PageBean<T> {
    /** Maximum number of page links shown at once. */
    private static final int MAX_VISIBLE_PAGES = 10;

    private int size = 10;           // records per page
    private int index = 1;           // current page number
    private int totalCount = 0;      // total number of records

    private int totalPageCount = 1;  // total number of pages

    private int[] numbers;           // page numbers to display
    protected List<T> list;          // items of the current page

    /**
     * @return zero-based offset of the first record of the current page
     */
    public int getStartRow() {
        return (index - 1) * size;
    }

    /**
     * @return zero-based offset just past the last record of the current page
     */
    public int getEndRow() {
        return index * size;
    }

    /**
     * @return the number of records per page
     */
    public int getSize() {
        return size;
    }

    /**
     * @param size records per page; non-positive values are ignored
     */
    public void setSize(int size) {
        if (size > 0) {
            this.size = size;
        }
    }

    /**
     * @return the current page number, or 0 when the total page count is 0
     */
    public int getIndex() {
        if (totalPageCount == 0) {
            return 0;
        }
        return index;
    }

    /**
     * @param index current page number; non-positive values are ignored
     */
    public void setIndex(int index) {
        if (index > 0) {
            this.index = index;
        }
    }

    /**
     * @return the total number of records
     */
    public int getTotalCount() {
        return totalCount;
    }

    /**
     * Stores the total record count and recomputes the page count and the
     * displayed page numbers.
     *
     * @param totalCount total number of records; negative values are ignored
     */
    public void setTotalCount(int totalCount) {
        if (totalCount >= 0) {
            this.totalCount = totalCount;
            setTotalPageCountByRs();
        }
    }

    /**
     * @return the total number of pages
     */
    public int getTotalPageCount() {
        return this.totalPageCount;
    }

    /**
     * Derives the page count from the record count (rounding up) and refreshes
     * the displayed page numbers.
     */
    private void setTotalPageCountByRs() {
        if (this.size > 0 && this.totalCount > 0) {
            int fullPages = this.totalCount / this.size;
            boolean hasPartialPage = this.totalCount % this.size != 0;
            this.totalPageCount = hasPartialPage ? fullPages + 1 : fullPages;
        } else {
            this.totalPageCount = 0;
        }
        setNumbers(totalPageCount);
    }

    /**
     * @return the page numbers to display
     */
    public int[] getNumbers() {
        return numbers;
    }

    /**
     * Computes the window of page numbers to display.
     *
     * <p>At most ten consecutive pages are shown. With more than ten pages the
     * window starts five pages before the current one, clamped so that it never
     * starts before page 1 nor runs past the last page. When there are no pages
     * the window is emptied, so numbers from an earlier result do not linger.
     *
     * @param totalPageCount total number of pages
     */
    public void setNumbers(int totalPageCount) {
        if (totalPageCount <= 0) {
            this.numbers = new int[0];
            return;
        }
        int visible = Math.min(totalPageCount, MAX_VISIBLE_PAGES);
        // Zero-based index of the first page when the current page is centred.
        int centredStart = index - (visible / 2 + 1);
        // Zero-based index of the first page of the last possible window.
        int lastStart = totalPageCount - visible;
        int first = Math.max(0, Math.min(centredStart, lastStart));

        int[] window = new int[visible];
        for (int k = 0; k < visible; k++) {
            window[k] = first + k + 1;
        }
        this.numbers = window;
    }

    /**
     * @param numbers page numbers to display
     */
    public void setNumbers(int[] numbers) {
        this.numbers = numbers;
    }

    /**
     * @return the items of the current page; {@code null} until items are set or added
     */
    public List<T> getList() {
        return list;
    }

    /**
     * @param list items of the current page
     */
    public void setList(List<T> list) {
        this.list = list;
    }

    /**
     * Appends one item to the current page, creating the list on first use.
     *
     * @param bean item to add
     */
    public void setBean(T bean){
        if (this.list == null) {
            this.list = new ArrayList<T>();
        }
        this.list.add(bean);
    }
}

启动ES_SEARCH web,访问 http://localhost:8080/ES_SEARCH

输入关键词搜索：

项目代码下载链接：

【Elasticsearch学习之三】Elasticsearch 搜索引擎案例

猜你喜欢