New word discovery

1. Compute mutual information (PMI):

import re
import math

def gen_words(self, doc):
    # keep only CJK characters plus a few connector symbols ("-", "_", "——", "/", "\"); strip everything else
    pattern = re.compile('[^\u4e00-\u9fa5-_——/\\\]+')
    doc = pattern.sub(r'', doc)
    word_index = extract_cadicateword(doc, self.max_word_len)
    word_cad = {}  # dictionary of candidate words
    for suffix in word_index:
        word = doc[suffix[0]:suffix[1]]
        if word not in word_cad:
            word_cad[word] = wordinfo(word)
        # record the word's count and its left/right neighbor characters
        word_cad[word].update_data(doc[suffix[0] - 1:suffix[0]], doc[suffix[1]:suffix[1] + 1])
    length = len(doc)
    # compute the frequency of each candidate word and the entropy of its left/right neighbors
    for word in word_cad:
        word_cad[word].compute_indexes(length)
    # ranking by length of word
    values = sorted(word_cad.values(), key=lambda x: len(x.text))
    for v in values:
        if len(v.text) == 1:
            continue
        v.compute_pmi(word_cad)
    # return candidates ordered by word length (shortest first)
    return sorted(values, key=lambda v: len(v.text), reverse=False)
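
Since gen_words is written as a method, here is a minimal usage sketch of my own; the thin wrapper class NewWordFinder, the toy corpus, and the thresholds are illustrative assumptions, not part of the original code, and the helper definitions further down (wordinfo, extract_cadicateword, compute_entropy, gen_bigram) must already be in scope:

class NewWordFinder:
    # hypothetical thin wrapper so the method above can actually be called
    def __init__(self, max_word_len=6):
        self.max_word_len = max_word_len
    gen_words = gen_words  # reuse the function above as a method

finder = NewWordFinder(max_word_len=6)
corpus_text = '十四是十四四十是四十十四不是四十四十不是十四'  # toy Chinese corpus string
for w in finder.gen_words(corpus_text):
    # after compute_indexes, w.left / w.right hold the entropy values;
    # the thresholds 3.0 and 0.5 are illustrative, not tuned
    if len(w.text) > 1 and w.pmi > 3.0 and min(w.left, w.right) > 0.5:
        print(w.text, w.freq, w.pmi, w.left, w.right)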


class wordinfo(object):
    '''
    Records the information for one candidate word: left neighbors, right neighbors, frequency, and PMI.
    '''
    def __init__(self, text):
        super(wordinfo, self).__init__()
        self.text = text
        self.freq = 0.0
        self.left = []   # record left neighbor characters
        self.right = []  # record right neighbor characters
        self.pmi = 0

    def update_data(self, left, right):
        self.freq += 1.0
        if left:
            self.left.append(left)
        if right:
            self.right.append(right)

    def compute_indexes(self, length):
        # normalize the raw count into a frequency; the neighbor lists are
        # replaced by their entropy values
        self.freq /= length
        self.left = compute_entropy(self.left)
        self.right = compute_entropy(self.right)

    def compute_pmi(self, words_dict):
        # enumerate every two-way split of the word and take the weakest one:
        # the minimum PMI over all splits
        sub_part = gen_bigram(self.text)
        if len(sub_part) > 0:
            self.pmi = min(map(lambda x: math.log(self.freq / words_dict[x[0]].freq / words_dict[x[-1]].freq), sub_part))

def extract_cadicateword(_doc, _max_word_len):  # _doc is the whole corpus concatenated into one string
    indexes = []
    doc_length = len(_doc)
    # enumerate every substring of length 1.._max_word_len as a candidate
    for i in range(doc_length):
        for j in range(i + 1, min(i + 1 + _max_word_len, doc_length + 1)):
            indexes.append((i, j))
    # sort the candidate spans by their end position
    return sorted(indexes, key=lambda x: x[-1])
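
A quick illustration of what this produces (toy string of my own, not from the original post):

for start, end in extract_cadicateword('abcd', 3):
    print((start, end), 'abcd'[start:end])
# (0, 1) a, (0, 2) ab, (1, 2) b, (0, 3) abc, (1, 3) bc, (2, 3) c, (1, 4) bcd, (2, 4) cd, (3, 4) d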



#####################################
def gen_bigram(_word_str):
    '''
    Split a word into two parts at every possible position.
    For instance, 'abb' can be split into (a, bb) and (ab, b).
    :param _word_str: the word to split
    :return: a list of (prefix, suffix) tuples
    '''
    return [(_word_str[0:_i],_word_str[_i:]) for _i in range(1,len(_word_str))]
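
For example (my own quick check):

print(gen_bigram('abc'))  # [('a', 'bc'), ('ab', 'c')]
print(gen_bigram('a'))    # [] -- single characters have no split, so no PMI is computed for them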

def compute_pmi(words_dict, pmi_word, doc_len):
    # words_dict holds raw substring counts; doc_len (the corpus length) is
    # needed to turn those counts into probabilities
    text = pmi_word
    word_freq = words_dict[text]
    print('word_freq==', word_freq)
    sub_part = gen_bigram(text)
    print('sub_part==', sub_part)
    pmi = float('-inf')
    if len(sub_part) > 0:
        # PMI = log( p(word) / (p(left) * p(right)) )
        #     = log( doc_len * count(word) / (count(left) * count(right)) )
        pmi = min(map(lambda x: math.log(doc_len * word_freq / (words_dict[x[0]] * words_dict[x[-1]])), sub_part))
        print('pmi===', pmi)
    print('word, pmi==', text, ' ', pmi)
    return pmi



if __name__ == '__main__':

    # DOC is the corpus you want statistics over: e.g. a 30M corpus from various
    # fields concatenated into one long string (punctuation such as periods
    # should be stripped first).
    DOC = 'this competition were set up individual awards, chasing light unit Year and Award and other awards. Where the individual awards, respectively, with Best Editing Award, Best Creative Award, Best Story, Best Male and Best Actress Award six awards. The awards ceremony, the famous director Lu Chuan, general manager of Tencent news operations Huangchen Xia, general manager of Tencent Media Marketing, general manager of Yi Haiyan regional clusters, Wang, general manager of Beijing-Tianjin Tencent news products, well-known TV planner Shishu Si and other guests were prizes to the winners.'
    # DOC = 'this competition were set up individual awards, chasing light unit Year and Award and other awards.'

    indexes = extract_cadicateword(DOC, 6)  # 6 is the length of the candidate word 'Changyang two store branches'
    print(indexes)

    from collections import Counter
    word_dict = Counter()
    for start, end in indexes:
        word_dict.update([DOC[start:end]])

    print('word_dict=', word_dict)

    # compute the PMI of the candidate 'Changyang two store branches'
    compute_pmi(word_dict, 'Changyang two store branches', len(DOC))
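
To sanity-check the formula, here is a tiny worked example of my own (not from the original post); it reuses the Counter import from the block above:

toy_doc = 'abab'
# substring counts over 'abab' with max length 2: {'a': 2, 'ab': 2, 'b': 2, 'ba': 1}
toy_dict = Counter(toy_doc[s:e] for s, e in extract_cadicateword(toy_doc, 2))
# pmi('ab') = log(len(doc) * count('ab') / (count('a') * count('b')))
#           = log(4 * 2 / (2 * 2)) = log(2) ≈ 0.693
compute_pmi(toy_dict, 'ab', len(toy_doc))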

2. Compute left/right entropy. Step 1 found new-word candidates using mutual information; after filtering by a mutual-information threshold, compute the left/right entropy of the remaining candidates, and keep as words the ones that also meet the entropy threshold (a combined filtering sketch follows the two functions below):

There are two functions for this. The first is as follows:

def compute_entropy(_list):
    # _list holds the neighbor characters observed next to a candidate word
    length = float(len(_list))
    frequency = {}
    if length == 0:
        return 0
    else:
        for i in _list:
            frequency[i] = frequency.get(i, 0) + 1
        # Shannon entropy of the neighbor-character distribution
        return sum(map(lambda x: -x / length * math.log(x / length), frequency.values()))
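
A quick worked example of my own: three observed neighbor characters, two of them identical.

# p('a') = 2/3, p('b') = 1/3
# entropy = -(2/3)*log(2/3) - (1/3)*log(1/3) ≈ 0.6365
print(compute_entropy(['a', 'a', 'b']))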

The other is easier to use: you pass it the candidate word and the text to scan directly (it relies on the re, math, and collections.Counter imports from above):

def _get_entropy(self, candidate, text):
    # find every occurrence of the candidate; re.escape keeps any regex
    # metacharacters in the candidate from being misinterpreted
    matched = re.finditer(re.escape(candidate), text)
    left_char_dic = Counter()
    right_char_dic = Counter()
    for item in matched:
        start, end = item.span()
        if start != 0:
            left_char_dic.update([text[start - 1]])
        if end != len(text):
            right_char_dic.update([text[end]])
    length = float(sum(left_char_dic.values()))
    left_entropy = sum(map(lambda x: -x / length * math.log(x / length), left_char_dic.values())) if length != 0 else 0
    length = float(sum(right_char_dic.values()))
    right_entropy = sum(map(lambda x: -x / length * math.log(x / length), right_char_dic.values())) if length != 0 else 0
    return min(left_entropy, right_entropy)
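
A usage sketch of my own (the toy text and thresholds are illustrative; self is unused, so None is passed for it):

# 'ab' occurs three times in the toy text below: left neighbors {x, z, v},
# right neighbors {y, w} (the last occurrence ends the text, so it has no right neighbor).
# left entropy = log(3) ≈ 1.099, right entropy = log(2) ≈ 0.693 -> returns ≈ 0.693
print(_get_entropy(None, 'ab', 'xaby zabw vab'))

# The two-stage filter described in step 2, sketched with illustrative (untuned)
# thresholds; word_dict and DOC are the ones built in the __main__ block earlier,
# and candidate_words is a hypothetical list of candidate strings to test.
PMI_THRESHOLD, ENTROPY_THRESHOLD = 1.0, 0.5
kept = [cand for cand in candidate_words
        if compute_pmi(word_dict, cand, len(DOC)) > PMI_THRESHOLD
        and _get_entropy(None, cand, DOC) > ENTROPY_THRESHOLD]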

 
