中文分词系列(一)

关于中文分词的资料网上有很多，大家可以自行了解，今天这里只关注代码怎么写。

中文分词主要可以归纳为“规则分词”、“统计分词”以及“规则+统计”三个主要派别，今天主要了解“规则分词”中常见的正向、逆向和双向最大化匹配。这三种方法都是基于现有词典做的，所以得先准备一个中文词典，一行一个词。

一.正向最大化匹配

　　描述：

　　　　1.找到词典中最长的词，记下长度L

　　　　2.从“左向右”取长度为L的字符串，查找词典进行匹配。若匹配成功，则将这个词切分出来；若匹配失败，则将这个字符串的最后一个字符去掉，将剩下的串作为新的匹配串继续匹配。如此重复下去，直到切完。

二.逆向最大化匹配

　　描述：

　　　　1.找到词典中最长的词，记下长度L

　　　　2.从“右向左”取长度为L的字符串，查找词典进行匹配。若匹配成功，则将这个词切分出来；若匹配失败，则将这个字符串的最前面一个字符去掉，将剩下的串作为新的匹配串继续匹配。如此重复下去，直到切完。

三.双向最大化匹配

　　描述：

　　　　1.将正向和逆向的切分结果进行比较，优先取切分词数较少的作为结果；词数相同且结果不同时，取单字较少的那个，单字数也相同时取逆向结果。

四.代码采用python

　　　　1.load 词典

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 #这是词典路径
 5 dictPath = '../resource/dict.txt'
 6 
 7 def loadDict():
 8     print('load dict...')
 9     dictionary = dict()
10     maximum = 0
11     # read resource
12     with open(dictPath, 'r', encoding='utf8') as f:
13         for line in f:
14             line = line.strip()
15             if not line:
16                 continue
17             str = line.split(' ')
18             dictionary[str[0]] = str[2]
19             wordLength = len(line)
20             if wordLength > maximum:
21                 maximum = wordLength #词典中最长的词的长度
22     return dictionary, maximum

View Code

　　　　2.核心方法

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
 5 from word_segmentation.regulation.MaximumMatchMethod import MM
 6 from word_segmentation.regulation.BiDirectctionMatchMethod import BDMM
 7 from word_segmentation.util.LoadDict import loadDict
 8 
 9 class RegulationMatch(object):
10     def __init__(self):
11         self.dictionary, self.maximum = loadDict()
12 
13     def cut(self, text, method):
14         #逆向
15         if method == 'RMM':
16             return RMM.cut(text, self.dictionary, self.maximum)
17         #正向
18         if method == 'MM':
19             return MM.cut(text, self.dictionary, self.maximum)
20         #双向
21         if method == 'BDMM':
22             return BDMM.cut(text, self.dictionary, self.maximum)

View Code

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 '''
 5 词和词性
 6 '''
 7 class Word(object) :
 8     def __init__(self, token, property):
 9         self.__token = token
10         self.__property = property
11     #单词
12     def getToken(self):
13         return self.__token
14     #词性
15     def getProperty(self):
16         return self.__property

View Code

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 from word_segmentation.regulation.Word import Word
 5 
 6 '''
 7 正向最大化匹配
 8 MaximumMatchMethod
 9 '''
class MM(object):
    """Forward maximum matching segmenter."""

    def __init__(self):
        pass

    @staticmethod
    def cut(text, dictionary, maximum):
        """Segment ``text`` left to right, preferring the longest match.

        Args:
            text: The string to segment.
            dictionary: Mapping from word to its property; membership
                decides whether a candidate is a word.
            maximum: Longest candidate length to try at each position.

        Returns:
            A list of ``Word`` objects in text order. Characters at which no
            dictionary word starts are skipped and do not appear in the
            result.
        """
        result = []
        textLength = len(text)
        start = 0
        while start < textLength:
            # Never try a window that runs past the end of the text.
            longest = min(maximum, textLength - start)
            for size in range(longest, 0, -1):
                piece = text[start:start + size]
                if piece in dictionary:
                    result.append(Word(piece, dictionary.get(piece)))
                    start += size
                    break
            else:
                # No word starts here: step over this character so the scan
                # keeps moving instead of retrying the same position.
                start += 1
        return result

View Code

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 from word_segmentation.regulation.Word import Word
 5 
 6 '''
 7 逆向最大化匹配
 8 ReverseMaximumMatchMethod
 9 '''
class RMM(object):
    """Backward (reverse) maximum matching segmenter."""

    def __init__(self):
        pass

    @staticmethod
    def cut(text, dictionary, maximum):
        """Segment ``text`` right to left, preferring the longest match.

        Returns the matched ``Word`` objects in text order. A character at
        which no dictionary word ends is skipped and not emitted.
        """
        end = len(text)
        collected = []
        while end > 0:
            matched = False
            # Candidates longer than the unread prefix are never tried.
            for width in range(min(maximum, end), 0, -1):
                candidate = text[end - width:end]
                if candidate in dictionary:
                    collected.append(Word(candidate, dictionary.get(candidate)))
                    end -= width
                    matched = True
                    break
            if not matched:
                end -= 1
        # Words were gathered from the tail, so flip them into text order.
        collected.reverse()
        return collected

View Code

 1 # -*- coding:utf-8 -*-
 2 
 3 from word_segmentation.regulation.MaximumMatchMethod import MM
 4 from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
 5 
 6 '''
 7     比较正向最大匹配和逆向最大匹配结果:
 8     1.如果分词数量结果不同，那么取分词数量较少的那个
 9     2.如果分词数量结果相同
10         a.分词结果相同，可以返回任何一个
11         b.分词结果不同，返回单字数比较少的那个
12         c.分词结果不同，单字数相同，返回谁呢（可以返回逆向分词结果）
13 '''
class BDMM(object):
    """Bidirectional maximum matching: pick the better of MM and RMM."""

    def __init__(self):
        pass

    @staticmethod
    def cut(text, dictionary, maximum):
        """Segment ``text`` both ways and return the preferred result.

        Selection rules, applied in order:
          1. Different word counts: the result with fewer words.
          2. Same word count and identical token sequences: the forward
             result.
          3. Same word count, different tokens: the result with fewer
             single-character words; on a tie, the backward result.

        Args:
            text: The string to segment.
            dictionary: Mapping from word to its property.
            maximum: Longest candidate length to try.

        Returns:
            The list returned by either ``MM.cut`` or ``RMM.cut``.
        """
        mmResult = MM.cut(text, dictionary, maximum)
        rmmResult = RMM.cut(text, dictionary, maximum)
        mmSegment = [word.getToken() for word in mmResult]
        rmmSegment = [word.getToken() for word in rmmResult]

        if len(mmSegment) != len(rmmSegment):
            return mmResult if len(mmSegment) < len(rmmSegment) else rmmResult

        # Compare the sequences themselves; checking only that each forward
        # token occurs somewhere in the backward list can report two
        # different segmentations as identical.
        if mmSegment == rmmSegment:
            return mmResult

        mmSingleWords = sum(1 for token in mmSegment if len(token) == 1)
        rmmSingleWords = sum(1 for token in rmmSegment if len(token) == 1)
        return mmResult if mmSingleWords < rmmSingleWords else rmmResult

View Code

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
 5 import word_segmentation.regulation.MaximumMatchMethod
 6 import word_segmentation.regulation.BiDirectctionMatchMethod
 7 from word_segmentation.regulation.RegulationMatchMthod import RegulationMatch
 8 
 9 def test():
10     pass
if __name__ == '__main__':
    # Demo: segment one sample sentence with each of the three strategies
    # and print the resulting token lists.
    text = '各国有各国的困难…'
    print('分词：')
    print('各国有各国的困难…')
    regulation = RegulationMatch()
    # Run every segmentation first, then extract tokens, then print.
    results = [regulation.cut(text, method) for method in ('MM', 'RMM', 'BDMM')]
    tokenLists = [[word.getToken() for word in words] for words in results]
    templates = ('正向匹配: %s', '逆向匹配: %s', '双向匹配: %s')
    for template, tokens in zip(templates, tokenLists):
        print(template % tokens)

View Code

中文分词系列(一)

猜你喜欢