#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/7/26 9:40
# @verion : python3.6
# @File : generate_datas.py.py
# @Software: PyCharm
'''
Generate the three HMM parameter text files (transition probabilities,
initial probability vector, emission probabilities) from raw count files.
'''
__author__ = 'xiaohu'

# Hidden-state labels. The count columns of the transition file are mapped
# onto this list by position (column i -> hidden_states[i]).
hidden_states = [
    "A", "B", "C", "D", "F", "G", "I", "J",
    "K", "L", "M", "P", "S", "W", "X", "Z",
]


def _parse_count(text):
    '''
    Convert a count read from a data file into a number without eval().
    :param text: the textual (or already numeric) count
    :return: an int when the text is an integer literal, otherwise a float
    '''
    text = str(text).strip()
    try:
        return int(text)
    except ValueError:
        return float(text)


def generate_transition_probability(src_path='./data/nt.tr.txt',
                                    dst_path='./data/transition_probability.txt'):
    '''
    Build the state transition probability matrix text A. Each output line
    has the format: first state,second state,probability [the probability
    that the first state is followed by that second state].
    :param src_path: comma-separated count file; its first line is skipped
        (presumably a column header row - confirm against the data file)
    :param dst_path: file the probabilities are written to
    :return: None
    '''
    rows = []
    with open(src_path, mode='r') as src_file:
        next(src_file, None)  # skip the first line, as the original did
        for line in src_file:
            fields = line.strip().split(',')
            first_state = fields[0]
            counts = [int(count) for count in fields[1:]]
            total = sum(counts)
            for index, count in enumerate(counts):
                second_state = hidden_states[index]
                # A row with no observed transitions would divide by zero;
                # report probability 0.0 for every target state instead.
                probability = float(count) / total if total else 0.0
                rows.append((first_state, second_state, probability))

    # Write the result to the text file.
    with open(dst_path, mode='w') as out_file:
        out_file.writelines('%s,%s,%s\n' % row for row in rows)
    print('generate transition_probability.txt')


def generate_initial_vector(src_path='./data/nt.txt',
                            dst_path='./data/initial_vector.txt'):
    '''
    Build the initial probability vector π. Each output line has the
    format: state,occurrence count,probability.
    :param src_path: space-separated file whose lines look like
        "<observation> <state> <count> <state> <count> ..."
    :param dst_path: file the vector is written to
    :return: None
    :raises KeyError: if the file names a state that is not in hidden_states
    '''
    state_counts = {state: 0 for state in hidden_states}
    grand_total = 0
    with open(src_path, mode='r') as src_file:
        for line in src_file:
            # Token 0 is the observation; the rest are (state, count) pairs.
            pairs = line.strip().split(' ')[1:]
            for index in range(0, len(pairs), 2):
                state = pairs[index]
                # Counts come from a data file, so parse them as numbers
                # rather than handing them to eval().
                count = _parse_count(pairs[index + 1])
                state_counts[state] += count
                grand_total += count

    # Save to the text file.
    with open(dst_path, mode='w') as initial_file:
        for state, count in state_counts.items():
            # An empty input file gives a zero total; emit 0.0 instead of
            # raising ZeroDivisionError.
            probability = float(count) / grand_total if grand_total else 0.0
            initial_file.write('%s,%s,%s\n' % (state, count, probability))
    print('generate initial_vector.txt')


# Lists are convenient for storing things; dicts are convenient for handling
# the expressions used in the algorithm.
def generate_emit_probability(src_path='./data/nt.txt',
                              dst_path='./data/emit_probability.txt',
                              initial_vector_path='./data/initial_vector.txt'):
    '''
    Generate the observation (emission) probability matrix. Each output
    line has the format: hidden state,observed state,probability [count of
    the hidden state under that observation / total count of the hidden
    state].
    :param src_path: same space-separated file generate_initial_vector reads
    :param dst_path: file the probabilities are written to
    :param initial_vector_path: file written by generate_initial_vector,
        used to look up the total count of each hidden state
    :return: None
    :raises KeyError: if the file names a state with no known frequency
    :raises ZeroDivisionError: if a state that occurs in src_path has a
        total frequency of 0 in the initial vector file
    '''
    # Convert the stored frequencies to numbers once, outside the loops.
    frequencies = {
        state: _parse_count(raw)
        for state, raw in get_initial_freq(initial_vector_path).items()
    }

    rows = []
    with open(src_path, mode='r') as src_file:
        for line in src_file:
            tokens = line.strip().split(' ')
            observation = tokens[0]
            pairs = tokens[1:]
            for index in range(0, len(pairs), 2):
                state = pairs[index]
                count = float(pairs[index + 1])
                rows.append((state, observation, count / frequencies[state]))

    with open(dst_path, mode='w') as emit_file:
        emit_file.writelines('%s,%s,%s\n' % row for row in rows)
    print('generate: emit_probability.txt')


def get_initial_freq(path='./data/initial_vector.txt'):
    '''
    Read the occurrence count of every label from the initial vector file.
    :param path: file whose lines look like "state,count,probability"
    :return: dict; key is the label, value is the count exactly as stored
        in the file (a string). Labels from hidden_states that are absent
        from the file keep the integer 0.
    '''
    frequencies = {state: 0 for state in hidden_states}
    with open(path, mode='r') as src_file:
        for line in src_file:
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines instead of raising IndexError
            fields = stripped.split(',')
            frequencies[fields[0]] = fields[1]
    return frequencies


if __name__ == '__main__':
    generate_transition_probability()
    generate_initial_vector()
    generate_emit_probability()
# nlp之命名实体识别HMM方法(1)
# 猜你喜欢
# 转载自blog.csdn.net/qq_18617299/article/details/81236272
# 今日推荐
# 周排行