1. Computing mutual information (PMI) and left/right entropy for candidate words:
import re
import math
from collections import Counter


def gen_words(self, doc):
    """Extract candidate words from *doc* and score each one.

    Strips everything that is not a CJK character (plus a few connector
    symbols), enumerates every substring up to ``self.max_word_len``,
    records each candidate's count and left/right neighbor characters,
    then computes frequency, left/right entropy and PMI per candidate.

    NOTE(review): written as a method — assumes the enclosing class
    provides ``self.max_word_len``; confirm against the original class.

    :param doc: the whole corpus concatenated into one string
    :return: list of ``wordinfo`` objects sorted by word length (ascending)
    """
    # Keep only CJK characters and a few connector symbols; drop
    # punctuation, whitespace and ASCII letters/digits.
    pattern = re.compile('[^\u4e00-\u9fa5-_——/\\\]+')
    doc = pattern.sub(r'', doc)
    word_index = extract_cadicateword(doc, self.max_word_len)
    word_cad = {}  # candidate word -> wordinfo
    for start, end in word_index:
        word = doc[start:end]
        if word not in word_cad:
            word_cad[word] = wordinfo(word)
        # Record one occurrence plus its left/right neighbor characters
        # (empty string at the corpus boundary -> neighbor is skipped).
        word_cad[word].update_data(doc[start - 1:start], doc[end:end + 1])
    length = len(doc)
    # Normalize counts to frequencies and compute left/right entropies.
    for info in word_cad.values():
        info.compute_indexes(length)
    # PMI only makes sense for words of length >= 2.
    values = sorted(word_cad.values(), key=lambda x: len(x.text))
    for v in values:
        if len(v.text) == 1:
            continue
        v.compute_pmi(word_cad)
    return sorted(values, key=lambda v: len(v.text), reverse=False)


class wordinfo(object):
    '''
    Record the information of one candidate word: its left/right
    neighbor characters, frequency, left/right entropy and PMI.
    '''

    def __init__(self, text):
        super(wordinfo, self).__init__()
        self.text = text
        self.freq = 0.0  # raw count; becomes a relative frequency later
        self.left = []   # left-neighbor chars; becomes an entropy later
        self.right = []  # right-neighbor chars; becomes an entropy later
        self.pmi = 0

    def update_data(self, left, right):
        """Record one occurrence and its left/right neighbor characters."""
        self.freq += 1.0
        if left:
            self.left.append(left)
        if right:
            self.right.append(right)

    def compute_indexes(self, length):
        """Turn the raw count into a frequency and the neighbor lists
        into left/right entropies. *length* is the corpus length."""
        self.freq /= length
        self.left = compute_entropy(self.left)
        self.right = compute_entropy(self.right)

    def compute_pmi(self, words_dict):
        """PMI = min over every binary split (l, r) of the word of
        log(P(word) / (P(l) * P(r))). Frequencies must already have
        been normalized by compute_indexes()."""
        sub_part = gen_bigram(self.text)
        if len(sub_part) > 0:
            # Take the weakest split: the minimum PMI over all pairs.
            # (The original also printed this same min(), computing the
            # whole expression twice — the duplicate is removed.)
            self.pmi = min(
                math.log(self.freq / words_dict[l].freq / words_dict[r].freq)
                for l, r in sub_part
            )


def extract_cadicateword(_doc, _max_word_len):
    """Enumerate every substring of *_doc* of length 1.._max_word_len.

    :param _doc: the whole corpus concatenated into one string
    :param _max_word_len: maximum candidate-word length
    :return: list of (start, end) index pairs, sorted by end index
    """
    indexes = []
    doc_length = len(_doc)
    for i in range(doc_length):
        for j in range(i + 1, min(i + 1 + _max_word_len, doc_length + 1)):
            indexes.append((i, j))
    return sorted(indexes, key=lambda x: x[-1])


def gen_bigram(_word_str):
    '''
    Split a word into every possible two-part combination.
    For instance, 'abb' yields ('a', 'bb') and ('ab', 'b').

    :param _word_str: the word to split
    :return: list of (left, right) pairs; empty for 1-char words
    '''
    return [(_word_str[0:_i], _word_str[_i:]) for _i in range(1, len(_word_str))]


def compute_pmi(words_dict, pmi_word, total=1):
    """Compute PMI for *pmi_word* from raw frequency counts.

    :param words_dict: word -> raw count (e.g. a collections.Counter)
    :param pmi_word: the candidate word to score
    :param total: corpus length used to normalize the counts, so that
        PMI = min over splits (l, r) of log(total * f(word) / (f(l) * f(r)))
    :return: the PMI, or -10000000000 for single-character words

    Fixes vs. the original: it referenced an undefined global ``doc``,
    dropped the ``math.log`` its own commented-out line shows is
    intended, and returned nothing.
    """
    word_freq = words_dict[pmi_word]
    sub_part = gen_bigram(pmi_word)
    pmi = -10000000000
    if len(sub_part) > 0:
        pmi = min(
            math.log(total * word_freq / words_dict[l] / words_dict[r])
            for l, r in sub_part
        )
    return pmi


if __name__ == '__main__':
    # In practice *doc* should be a large (e.g. 30M) corpus from varied
    # domains concatenated into one long string with punctuation removed;
    # this short demo string stands in for it.
    doc = ('this competition were set up individual awards, chasing light '
           'unit Year and Award and other awards. Where the individual '
           'awards, respectively, with Best Editing Award, Best Creative '
           'Award, Best Story, Best Male and Best Actress Award six awards.')
    indexes = extract_cadicateword(doc, 6)
    print(indexes)
    word_dict = Counter()
    for start, end in indexes:
        word_dict.update([doc[start:end]])
    print('word_dict=', word_dict)
    # PMI for a candidate that actually occurs in the demo corpus.
    print('pmi==', compute_pmi(word_dict, 'award', len(doc)))
2. Computing left/right entropy: for the candidate new words found in step 1, first compute mutual information (PMI) and filter by a PMI threshold; then compute left/right entropy on the surviving candidates, and keep as words those that also meet the entropy threshold:
Here are two functions for this; the first is as follows:
def compute_entropy(_list):
    """Shannon entropy of a list of neighbor characters.

    :param _list: the left (or right) neighbor characters of a word
    :return: 0 for an empty list, otherwise -sum(p * log(p)) over the
        distinct characters, with p the character's relative frequency
    """
    total = float(len(_list))
    if total == 0:
        return 0
    counts = {}
    for ch in _list:
        counts[ch] = counts.get(ch, 0) + 1
    entropy = 0.0
    for count in counts.values():
        p = count / total
        entropy -= p * math.log(p)
    return entropy
The second is easier to use — you pass in the candidate word and the text to count over directly:
def _get_entropy(self,candidate,text): matchd = re.finditer(candidate,text) left_char_dic=Counter() right_char_dic=Counter() for item in matchd: start,end=item.span() print(start,text[start],end,text[end-1]) if start!=0: left_char_dic.update([text[start-1]]) if end!=len(text): right_char_dic.update([text[end]]) print(left_char_dic,right_char_dic) length = float(sum(left_char_dic.values())) left_entropy = sum(map(lambda x: - x/length * math.log(x/length) , left_char_dic.values())) if length!=0 else 0 length = float(sum(right_char_dic.values())) right_entropy = sum(map(lambda x: - x/length * math.log(x/length) , right_char_dic.values())) if length!=0 else 0 print(left_entropy,right_entropy) return min(left_entropy,right_entropy)