# Python edit distance

import numpy as np
import json
import codecs

# 计算编辑距离
def edit_distance(word1, word2):
    """Return the Levenshtein edit distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``word1`` into ``word2``.

    Args:
        word1: First sequence (typically a string); must support ``len()``
            and integer indexing.
        word2: Second sequence, same requirements as ``word1``.

    Returns:
        The edit distance as a non-negative ``int``.
    """
    len1 = len(word1)
    len2 = len(word2)

    # Only two rows of the DP table are ever needed: ``prev`` is row i-1 and
    # ``curr`` is row i.  Row 0 is the cost of building word2[:j] from an
    # empty prefix, i.e. j insertions.
    prev = list(range(len2 + 1))
    for i in range(1, len1 + 1):
        # Column 0 is the cost of deleting the first i elements of word1.
        curr = [i] + [0] * len2
        for j in range(1, len2 + 1):
            # Substitution is free when the current elements already match.
            temp = 0 if word1[i - 1] == word2[j - 1] else 1
            curr[j] = min(
                prev[j - 1] + temp,  # substitute / match
                prev[j] + 1,         # delete from word1
                curr[j - 1] + 1,     # insert into word1
            )
        prev = curr
    return prev[len2]


# 190 801 
# Compute the similarity derived from the edit distance
def simility(word1, word2):
    """Return a similarity score derived from the edit distance.

    The score is ``1 - edit_distance(word1, word2) / max(len(word1),
    len(word2))``: 1.0 for identical inputs, 0.0 when every position has to
    be edited.

    Args:
        word1: First sequence (typically a string).
        word2: Second sequence (typically a string).

    Returns:
        The similarity as a float; 1.0 when both inputs are empty.
    """
    maxLen = max(len(word1), len(word2))
    if maxLen == 0:
        # Two empty inputs are identical; returning early also avoids a
        # division by zero in the normalisation below.
        return 1.0
    res = edit_distance(word1, word2)
    return 1 - res * 1.0 / maxLen

bianhaos = []
with codecs.open(r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write_sub.txt','r','utf8') as f:
    # Every raw line (line terminator included) is one item to cluster.
    # Disabled alternative parsing, kept for reference:
    # bianhao,sub_sen = line.split('<->')
    # sub_sen = sub_sen.strip().strip('<b>').strip('<e>')
    # bianhaos.append(bianhao)
    sub_sens = list(f)

count = len(sub_sens)
# leibie[k] is the cluster id assigned to sub_sens[k]; -1 means "not yet
# assigned".
leibie = [-1] * count
cla = 0
print(count)

# Greedy single pass: each still-unassigned item seeds a new cluster and
# pulls in every other unassigned item whose similarity to the seed is at
# least 0.5.
for i, sub1 in enumerate(sub_sens):
    if leibie[i] != -1:
        continue
    leibie[i] = cla
    for j, sub2 in enumerate(sub_sens):
        if leibie[j] == -1 and simility(sub1, sub2) >= 0.5:
            leibie[j] = cla
    cla += 1
    # Progress output: index of the item that seeded the cluster just closed.
    print(i)

print(leibie)
# Persist the per-item cluster ids as a JSON list.
with open('leibie05.json','w') as f:
    json.dump(leibie,f)

 

# Origin: www.cnblogs.com/yiwoqu/p/11542074.html