jieba分词的应用（java）

在上一篇说的猜你喜欢功能中，又加了新的需求，需要对关键词进行分词，扩大推荐文章的范围，这样能够拓展用户的喜欢范围，这时候我就想到可以用jieba分词对中文进行分词，同样的需要去官网下载源码，这样方便自己对源码的修改以达到自己的目的。这里，我需要判断切分出来的词是否是无意义的词，就需要对切出来的词进行筛选，这时候，jieba分词的一个属性就体现出它的强大之处了，jieba分词会将切分出来的词进行词性的定义，我可以通过对jieba分词后词的词性进行判断，筛选出名词，去掉无用的连接词、形容词等其他词性的词来达到我的分词目的。下面是对源码进行修改的部分。（大家也可以根据自己的需要，暴露原来隐藏的属性来实现自己的功能。）

/**
*在jieba分词的SegToken.java中对SegToken类增加一个成员变量properties来存储单词的词性
**/
package com.huaban.analysis.jieba;

/**
 * One token produced by the segmenter: the token text, the two offsets
 * supplied by the caller, and the part-of-speech tag associated with the text.
 */
public class SegToken {
    /** Text of the token. */
    public String word;

    /** Offset at which the token starts, as supplied to the constructor. */
    public int startOffset;

    /** Offset at which the token ends, as supplied to the constructor. */
    public int endOffset;

    /** Part-of-speech tag of {@link #word}. */
    public String properties;


    /**
     * Creates a token.
     *
     * @param word        token text
     * @param startOffset start offset of the token
     * @param endOffset   end offset of the token
     * @param properties  part-of-speech tag of the token
     */
    public SegToken(String word, int startOffset, int endOffset, String properties) {
        this.properties = properties;
        this.endOffset = endOffset;
        this.startOffset = startOffset;
        this.word = word;
    }


    /**
     * Renders the token as {@code [word, startOffset, endOffset, properties]}.
     *
     * @return the bracketed, comma-separated representation of all four fields
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("[");
        text.append(word).append(", ");
        text.append(startOffset).append(", ");
        text.append(endOffset).append(", ");
        text.append(properties).append("]");
        return text.toString();
    }

}

将从字典文件dict.txt中读取出来的单词的词性存储到properties 的字段中

/**
*在WordDictionary.java中增加property的Map存储word与词性的关系。建立索引关系，增加获取词性的公共方法
**/
package com.huaban.analysis.jieba;

import java.io.BufferedReader;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;


/**
 * Singleton dictionary holding, for every known word, its log-normalized
 * frequency and its part-of-speech tag, plus the trie used for matching.
 *
 * <p>The main dictionary is read from the classpath resource {@code /dict.txt}
 * when the singleton is created; user dictionaries can be added afterwards with
 * {@link #init(Path)} or {@link #loadUserDict(Path)}. Dictionary lines have the
 * form {@code word freq [pos]}, separated by tabs or spaces.
 */
public class WordDictionary {
    /** Must be volatile: it is read outside the lock in {@link #getInstance()}. */
    private static volatile WordDictionary singleton;
    private static final String MAIN_DICT = "/dict.txt";
    private static final String USER_DICT_SUFFIX = ".dict";

    /** Word -> log(freq / total). */
    public final Map<String, Double> freqs = new HashMap<String, Double>();
    /** Word -> part-of-speech tag ("" when the dictionary line carried none). */
    public final Map<String, String> property = new HashMap<String, String>();
    /** Absolute paths of the user-dictionary directories already loaded by {@link #init(Path)}. */
    public final Set<String> loadedPath = new HashSet<String>();
    private double minFreq = Double.MAX_VALUE;
    private double total = 0.0;
    private DictSegment _dict;


    private WordDictionary() {
        this.loadDict();
    }


    /**
     * Returns the shared instance, creating it (and loading the main dictionary)
     * on first use.
     *
     * @return the singleton dictionary
     */
    public static WordDictionary getInstance() {
        WordDictionary instance = singleton;
        if (instance == null) {
            synchronized (WordDictionary.class) {
                instance = singleton;
                if (instance == null) {
                    instance = new WordDictionary();
                    singleton = instance;
                }
            }
        }
        return instance;
    }


    /**
     * for ES to initialize the user dictionary.
     *
     * <p>Loads every file ending in {@code .dict} found directly inside
     * {@code configFile}. A directory that was already loaded is skipped.
     *
     * @param configFile directory containing the user dictionaries
     */
    public void init(Path configFile) {
        String abspath = configFile.toAbsolutePath().toString();
        System.out.println("initialize user dictionary:" + abspath);
        synchronized (WordDictionary.class) {
            if (loadedPath.contains(abspath)) {
                return;
            }

            String glob = String.format(Locale.getDefault(), "*%s", USER_DICT_SUFFIX);
            // try-with-resources: a DirectoryStream holds an open directory handle.
            try (DirectoryStream<Path> stream = Files.newDirectoryStream(configFile, glob)) {
                for (Path path : stream) {
                    System.err.println(String.format(Locale.getDefault(), "loading dict %s", path.toString()));
                    loadUserDict(path);
                }
                loadedPath.add(abspath);
            } catch (IOException e) {
                System.err.println(String.format(Locale.getDefault(), "%s: load user dict failure!", configFile.toString()));
            }
        }
    }


    /**
     * let user just use their own dict instead of the default dict
     *
     * <p>Drops the trie, the frequencies and the part-of-speech tags.
     */
    public void resetDict() {
        _dict = new DictSegment((char) 0);
        freqs.clear();
        // Keep the tag map in step with freqs so stale tags cannot outlive their words.
        property.clear();
    }


    /**
     * Loads the main dictionary from the classpath and converts the raw
     * frequencies to log-probabilities. Failures are reported on stderr.
     *
     * @throws NumberFormatException if a frequency column is not a number
     */
    public void loadDict() {
        _dict = new DictSegment((char) 0);
        InputStream is = this.getClass().getResourceAsStream(MAIN_DICT);
        if (is == null) {
            // Resource missing from the classpath; report it like any other load failure.
            System.err.println(String.format(Locale.getDefault(), "%s load failure!", MAIN_DICT));
            return;
        }
        try (BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
            long s = System.currentTimeMillis();
            String line;
            // readLine() == null is the reliable EOF signal; ready() only says "won't block".
            while ((line = br.readLine()) != null) {
                String[] tokens = line.split("[\t ]+");

                if (tokens.length < 2) {
                    continue;
                }

                String word = addWord(tokens[0]);
                if (word == null) {
                    // Blank word column: nothing to index.
                    continue;
                }
                double freq = Double.parseDouble(tokens[1]);
                String properties = tokens.length >= 3 ? tokens[2] : "";
                total += freq;
                freqs.put(word, freq);
                property.put(word, properties); // word -> part-of-speech tag
            }
            // normalize
            for (Entry<String, Double> entry : freqs.entrySet()) {
                entry.setValue((Math.log(entry.getValue() / total)));
                minFreq = Math.min(entry.getValue(), minFreq);
            }
            System.out.println(String.format(Locale.getDefault(), "main dict load finished, time elapsed %d ms",
                System.currentTimeMillis() - s));
        }
        catch (IOException e) {
            System.err.println(String.format(Locale.getDefault(), "%s load failure!", MAIN_DICT));
        }
    }


    /**
     * Inserts a word into the trie.
     *
     * @param word raw word; may be null or blank
     * @return the trimmed, lower-cased key that was inserted, or null when
     *         {@code word} is null or blank
     */
    private String addWord(String word) {
        if (word == null) {
            return null;
        }
        String trimmed = word.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        String key = trimmed.toLowerCase(Locale.getDefault());
        _dict.fillSegment(key.toCharArray());
        return key;
    }


    /**
     * Loads a UTF-8 user dictionary.
     *
     * @param userDict dictionary file
     */
    public void loadUserDict(Path userDict) {
        loadUserDict(userDict, StandardCharsets.UTF_8);
    }


    /**
     * Loads a user dictionary. Each line is {@code word [freq [pos]]}; the
     * frequency defaults to 3.0 and the part-of-speech tag to "". Failures to
     * read the file are reported on stderr.
     *
     * @param userDict dictionary file
     * @param charset  encoding of the file
     * @throws NumberFormatException if a frequency column is not a number
     */
    public void loadUserDict(Path userDict, Charset charset) {
        try (BufferedReader br = Files.newBufferedReader(userDict, charset)) {
            long s = System.currentTimeMillis();
            int count = 0;
            String line;
            while ((line = br.readLine()) != null) {
                String[] tokens = line.split("[\t ]+");

                if (tokens.length < 1) {
                    // Ignore empty line
                    continue;
                }

                String word = addWord(tokens[0]);
                if (word == null) {
                    // Blank word column (e.g. an empty line splits to [""]).
                    continue;
                }

                double freq = 3.0d;
                String properties = "";
                // ">=" so that a three-column line keeps its frequency as well as its tag.
                if (tokens.length >= 2) {
                    freq = Double.parseDouble(tokens[1]);
                }
                if (tokens.length >= 3) {
                    properties = tokens[2];
                }
                freqs.put(word, Math.log(freq / total));
                property.put(word, properties);
                count++;
            }
            System.out.println(String.format(Locale.getDefault(), "user dict %s load finished, tot words:%d, time elapsed:%dms", userDict.toString(), count, System.currentTimeMillis() - s));
        }
        catch (IOException e) {
            System.err.println(String.format(Locale.getDefault(), "%s: load user dict failure!", userDict.toString()));
        }
    }


    /**
     * @return the trie built from all loaded words
     */
    public DictSegment getTrie() {
        return this._dict;
    }


    /**
     * @param word key to look up
     * @return true when a frequency is stored for {@code word}
     */
    public boolean containsWord(String word) {
        return freqs.containsKey(word);
    }

    /**
     * Looks up the part-of-speech tag of a word.
     *
     * @param word key to look up
     * @return the stored tag, or "" when the word is not in the dictionary
     */
    public String getProperties(String word) {
        if (containsWord(word)) {
            return property.get(word);
        }
        return "";
    }

    /**
     * Looks up the log-frequency of a word.
     *
     * @param key key to look up
     * @return the stored value, or the smallest main-dictionary value when the
     *         word is not in the dictionary
     */
    public Double getFreq(String key) {
        if (containsWord(key)) {
            return freqs.get(key);
        }
        return minFreq;
    }
}

将词性存储到SegToken的成员变量中，方便生成和调取。

/**
*在JiebaSegmenter.java中生成每个切分词的SegToken对象进行存储，方便使用
**/
 /**
  * Splits a paragraph into tokens, each carrying its offsets and the
  * part-of-speech tag looked up in the word dictionary.
  *
  * <p>Characters accepted by {@code CharacterUtil.ccFind} (after
  * {@code CharacterUtil.regularize}) are buffered and segmented as a run;
  * every other character becomes a one-character token of its own.
  * In {@code SegMode.SEARCH} only the segmented words are emitted; in any
  * other mode the dictionary-known 2- and 3-character substrings of each
  * word are emitted before the word itself.
  *
  * @param paragraph text to segment
  * @param mode      segmentation mode
  * @return the tokens in emission order
  */
 public List<SegToken> process(String paragraph, SegMode mode) {
     List<SegToken> tokens = new ArrayList<SegToken>();
     StringBuilder sb = new StringBuilder();
     int offset = 0;
     for (int i = 0; i < paragraph.length(); ++i) {
         char ch = CharacterUtil.regularize(paragraph.charAt(i));
         if (CharacterUtil.ccFind(ch)) {
             sb.append(ch);
             continue;
         }
         if (sb.length() > 0) {
             appendSentenceTokens(sb.toString(), offset, mode, tokens);
             sb = new StringBuilder();
             // The buffered run ended right before position i.
             offset = i;
         }
         // The single-character token uses the original (un-regularized) character.
         String single = paragraph.substring(i, i + 1);
         tokens.add(new SegToken(single, offset, ++offset, wordDict.getProperties(single)));
     }
     if (sb.length() > 0) {
         appendSentenceTokens(sb.toString(), offset, mode, tokens);
     }

     return tokens;
 }

 /** Segments one buffered run starting at {@code offset} and appends its tokens. */
 private void appendSentenceTokens(String sentence, int offset, SegMode mode, List<SegToken> tokens) {
     for (String word : sentenceProcess(sentence)) {
         if (mode != SegMode.SEARCH) {
             appendDictGrams(word, 2, offset, tokens);
             appendDictGrams(word, 3, offset, tokens);
         }
         int end = offset + word.length();
         tokens.add(new SegToken(word, offset, end, wordDict.getProperties(word)));
         offset = end;
     }
 }

 /** Appends every dictionary-known substring of length {@code size} of a word longer than {@code size}. */
 private void appendDictGrams(String word, int size, int offset, List<SegToken> tokens) {
     if (word.length() <= size) {
         return;
     }
     for (int j = 0; j + size <= word.length(); ++j) {
         String gram = word.substring(j, j + size);
         if (wordDict.containsWord(gram)) {
             tokens.add(new SegToken(gram, offset + j, offset + j + size, wordDict.getProperties(gram)));
         }
     }
 }

然后在关键词切分的方法中进行判断，选择所需要词性的word即可

// Segment every keyword with jieba and append the resulting words to the keyword string.
        for (String sentence : keyword_list) {
            for (SegToken token : segmenter.process(sentence, SegMode.SEARCH)) {
                if (token.word.length() <= 1) {
                    // Single-character tokens are not used as keywords.
                    continue;
                }
                keyword += " " + token.word;
            }
        }
        keyword_list = keyword.split("[,;\\s'\\*\\+|\\^]+");
        // A Set is used so that duplicate entries are removed.
        Set<String> keywordList = new LinkedHashSet<String>(Arrays.asList(keyword_list));

到此完成新需求的实现。与大家共勉~
最后附上jieba分词的词性类别及表示方法：

形语素

形容词性语素。形容词代码为 a，语素代码ｇ前面置以A。

形容词

取英语形容词 adjective的第1个字母。

副形词

直接作状语的形容词。形容词代码 a和副词代码d并在一起。

名形词
具有名词功能的形容词。形容词代码 a和名词代码n并在一起。

区别词
取汉字“别”的声母。

连词
取英语连词 conjunction的第1个字母。

副语素
副词性语素。副词代码为 d，语素代码ｇ前面置以D。

副词
取 adverb的第2个字母，因其第1个字母已用于形容词。

叹词
取英语叹词 exclamation的第1个字母。

方位词
取汉字“方”的声母。

语素
绝大多数语素都能作为合成词的“词根”，取汉字“根”的声母。

前接成分
取英语 head的第1个字母。

成语
取英语成语 idiom的第1个字母。

简称略语
取汉字“简”的声母。

后接成分

习用语
习用语尚未成为成语，有点“临时性”，取“临”的声母。

数词
取英语 numeral的第3个字母，n，u已有他用。

名语素
名词性语素。名词代码为 n，语素代码ｇ前面置以N。

名词
取英语名词 noun的第1个字母。

人名
名词代码 n和“人(ren)”的声母并在一起。

地名
名词代码 n和处所词代码s并在一起。

机构团体
“团”的声母为 t，名词代码n和t并在一起。

其他专名
“专”的声母的第 1个字母为z，名词代码n和z并在一起。

拟声词
取英语拟声词 onomatopoeia的第1个字母。

介词
取英语介词 prepositional的第1个字母。

量词
取英语 quantity的第1个字母。

代词
取英语代词 pronoun的第2个字母,因p已用于介词。

处所词
取英语 space的第1个字母。

时语素
时间词性语素。时间词代码为 t,在语素的代码g前面置以T。

时间词
取英语 time的第1个字母。

助词
取英语助词 auxiliary的第2个字母，因其第1个字母已用于形容词。

动语素
动词性语素。动词代码为 v。在语素的代码g前面置以V。

动词
取英语动词 verb的第一个字母。

副动词
直接作状语的动词。动词和副词的代码并在一起。

名动词
指具有名词功能的动词。动词和名词的代码并在一起。

标点符号

非语素字
非语素字只是一个符号，字母 x通常用于代表未知数、符号。

语气词
取汉字“语”的声母。

状态词
取汉字“状”的声母的前一个字母。

未知词
不可识别词及用户自定义词组。取英文Unknown首两个字母。(非北大标准，CSW分词中定义)