"""Greedily cluster text lines by normalized edit-distance similarity.

When run as a script, every line of the input file is read, each line is
assigned an integer cluster id, and the list of ids is written as JSON.
"""
import codecs
import json

import numpy as np

# Defaults reproduce the paths and threshold the script was written with.
DEFAULT_INPUT_PATH = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write_sub.txt'
DEFAULT_OUTPUT_PATH = 'leibie05.json'
DEFAULT_THRESHOLD = 0.5


def edit_distance(word1, word2):
    """Return the Levenshtein edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1.

    Args:
        word1: First sequence (anything supporting ``len`` and indexing).
        word2: Second sequence.

    Returns:
        The distance as a NumPy float (the DP table is a float array), e.g.
        ``3.0`` for ``("kitten", "sitting")``.
    """
    len1 = len(word1)
    len2 = len(word2)
    dp = np.zeros((len1 + 1, len2 + 1))
    # Turning a prefix of length k into the empty sequence costs k edits.
    for i in range(len1 + 1):
        dp[i][0] = i
    for j in range(len2 + 1):
        dp[0][j] = j
    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            substitution = 0 if word1[i - 1] == word2[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j - 1] + substitution,  # match / substitute
                dp[i - 1][j] + 1,                 # delete from word1
                dp[i][j - 1] + 1,                 # insert into word1
            )
    return dp[len1][len2]


def simility(word1, word2):
    """Return the edit-distance similarity of two sequences.

    The similarity is ``1 - edit_distance / max(len(word1), len(word2))``:
    1.0 for identical inputs, 0.0 when every position must be edited.

    Args:
        word1: First sequence.
        word2: Second sequence.

    Returns:
        The similarity as a float. Two empty inputs are identical, so 1.0 is
        returned for them instead of dividing by zero.
    """
    max_len = max(len(word1), len(word2))
    if max_len == 0:
        return 1.0
    res = edit_distance(word1, word2)
    return 1 - res * 1.0 / max_len


def load_sentences(path):
    """Read ``path`` as UTF-8 and return its lines as a list.

    Lines are returned exactly as the reader yields them, i.e. including any
    line terminator.
    NOTE(review): the terminator therefore takes part in the similarity
    computation; confirm whether lines should be stripped first.
    """
    with codecs.open(path, 'r', 'utf8') as f:
        return list(f)


def cluster_sentences(sentences, threshold=DEFAULT_THRESHOLD, verbose=False):
    """Assign a cluster id to every sentence with a greedy single pass.

    The first unlabelled sentence seeds a new cluster; every later unlabelled
    sentence whose similarity to that seed is at least ``threshold`` joins
    it. Cluster ids are consecutive integers starting at 0.

    Args:
        sentences: Sequence of strings to cluster.
        threshold: Minimum ``simility`` to the seed required to join.
        verbose: When true, print the index of each seed as it is processed.

    Returns:
        A list of cluster ids, parallel to ``sentences``.
    """
    count = len(sentences)
    leibie = [-1] * count
    cla = 0
    for i, seed in enumerate(sentences):
        if leibie[i] != -1:
            continue
        leibie[i] = cla
        # Every index <= i is already labelled at this point, so only later
        # sentences can still join this cluster.
        for j in range(i + 1, count):
            if leibie[j] != -1:
                continue
            if simility(seed, sentences[j]) >= threshold:
                leibie[j] = cla
        cla += 1
        if verbose:
            print(i)
    return leibie


def main(input_path=DEFAULT_INPUT_PATH, output_path=DEFAULT_OUTPUT_PATH,
         threshold=DEFAULT_THRESHOLD):
    """Cluster the lines of ``input_path`` and dump the ids to ``output_path``.

    Prints the number of lines, the index of each cluster seed, and the final
    list of cluster ids.

    Returns:
        The list of cluster ids that was written.
    """
    sub_sens = load_sentences(input_path)
    print(len(sub_sens))
    leibie = cluster_sentences(sub_sens, threshold, verbose=True)
    print(leibie)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(leibie, f)
    return leibie


if __name__ == "__main__":
    main()