Text similarity detection with cosine similarity

# Note: this is plain text matching, not semantic matching

# References: https://zhuanlan.zhihu.com/p/43396514
#         https://www.geeksforgeeks.org/python-measure-similarity-between-two-sentences-using-cosine-similarity/
#
#         https://baike.baidu.com/item/%E4%BD%99%E5%BC%A6%E7%9B%B8%E4%BC%BC%E5%BA%A6/17509249
#         https://github.com/nltk/nltk
#         http://www.nltk.org/
#         https://github.com/fxsjy/jieba
# similarity = cosine = (A.B) / (||A||.||B||) where A and B are vectors. (Euclidean dot product formula)
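#
# A quick worked example of the formula (the two small vectors are made up for illustration):
#   A = [1, 1, 0], B = [1, 0, 1]
#   A.B = 1*1 + 1*0 + 0*1 = 1
#   ||A|| = sqrt(1^2 + 1^2 + 0^2) = sqrt(2),  ||B|| = sqrt(2)
#   cosine = 1 / (sqrt(2) * sqrt(2)) = 0.5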

# pip install nltk
# import nltk
# nltk.download('all') ['punkt'|'stopwords'|...]  -- 'all' seems to download 300M+ of data
# nltk.tokenize: used for tokenization. Tokenization is the process of splitting a large body of
#                text into smaller parts called tokens. word_tokenize(X) splits the given sentence X
#                into words and returns a list.
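#                e.g. word_tokenize('I love you.') should return ['I', 'love', 'you', '.'];
#                punctuation marks usually come back as separate tokens.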

# nltk.corpus: In this program, it is used to get a list of stopwords.
#              A stop word is a commonly used word (such as “the”, “a”, “an”, “in”).
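#              e.g. 'this', 'is' and 'a' all appear in stopwords.words('english'), so after the
#              lower-cased filter used below, 'This is a test' should reduce to just ['test']
#              (plus any punctuation not covered by the extra stopword list).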

import jieba
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
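
# A minimal sketch of the formula above as a standalone helper (the name cosine_of is
# only illustrative, not part of the original code); the functions below inline the same
# computation over their own term-frequency vectors.
def cosine_of(vec_a, vec_b):
    # numerator: element-wise dot product of the two vectors
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    # denominator: product of the Euclidean norms; guard against all-zero vectors
    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))
    return round(dot / (norm_a * norm_b), 6) if norm_a and norm_b else 0.0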


def check_zh_similarity(s1, s2):
    # Tokenize with jieba, drop stopwords, and merge the words; each word becomes one dimension
    sw = ['。', ',']
    s1_cut = [i for i in jieba.cut(s1, cut_all=True) if i not in sw]
    s2_cut = [i for i in jieba.cut(s2, cut_all=True) if i not in sw]
    word_set = set(s1_cut).union(set(s2_cut))
    print('cut s1: %s ' % s1_cut)
    print('cut s2: %s ' % s2_cut)
    print('word set: %s \n' % word_set)

    # Map each word to its position in the set (i.e. the dimension index that encodes each word)
    word_dict = {word: i for i, word in enumerate(word_set)}
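    # e.g. with a three-word set the mapping might look like {'皮靴': 0, '号码': 1, '合适': 2};
    # the exact numbering is arbitrary since set iteration order is not guaranteed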
    print('word dict: %s \n' % word_dict)

    # Record the dimension indices hit by s1 and count the term frequency in each dimension
    s1_cut_word_code = [word_dict[word] for word in s1_cut]
    print('s1 appear code: %s ' % s1_cut_word_code)
    s1_cut_word_fq = [0] * len(word_dict)
    for word in s1_cut:
        s1_cut_word_fq[word_dict[word]] += 1
    print('s1 word appear on each code: %s ' % s1_cut_word_fq)
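    # e.g. if '号码' were assigned dimension 1 and appeared twice in s1_cut, then
    # s1_cut_word_fq[1] would be 2 (a plain bag-of-words term-frequency count)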

    # Record the dimension indices hit by s2 and count the term frequency in each dimension
    s2_cut_word_code = [word_dict[word] for word in s2_cut]
    print('s2 appear code: %s ' % s2_cut_word_code)
    s2_cut_word_fq = [0] * len(word_dict)
    for word in s2_cut:
        s2_cut_word_fq[word_dict[word]] += 1
    print('s2 word appear on each code: %s \n' % s2_cut_word_fq)

    # Compute cosine similarity
    sigma_sum, sq1, sq2 = 0, 0, 0
    for i in range(len(word_dict)):
        sigma_sum += s1_cut_word_fq[i] * s2_cut_word_fq[i]
        sq1 += pow(s1_cut_word_fq[i], 2)
        sq2 += pow(s2_cut_word_fq[i], 2)
    try:
        cosine = round(float(sigma_sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print('cosine similarity: %s \n\n' % cosine)
    return cosine


def check_en_similarity1(s1, s2):
    # Tokenize with nltk, drop stopwords, and merge the words; each word becomes one dimension
    sw = stopwords.words('english')
    others = ['!', ',', '.', '?', '-s', '-ly', '</s>', 's']
    sw.extend(others)
    s1_cut = word_tokenize(s1)
    s2_cut = word_tokenize(s2)
    s1_cut = [w for w in s1_cut if w.lower() not in sw]
    s2_cut = [w for w in s2_cut if w.lower() not in sw]
    word_set = set(s1_cut).union(set(s2_cut))
    print('cut s1: %s ' % s1_cut)
    print('cut s2: %s ' % s2_cut)
    print('word set: %s \n' % word_set)

    # Map each word to its position in the set (i.e. the dimension index that encodes each word)
    word_dict = {word: i for i, word in enumerate(word_set)}
    print('word dict: %s \n' % word_dict)

    # Record the dimension indices hit by s1 and count s1's term frequency in each dimension
    s1_cut_word_code = [word_dict[word] for word in s1_cut]
    print('s1 appear code: %s ' % s1_cut_word_code)
    s1_cut_word_fq = [0] * len(word_dict)
    for word in s1_cut:
        s1_cut_word_fq[word_dict[word]] += 1
    print('s1 word appear on each code: %s ' % s1_cut_word_fq)

    # Record the dimension indices hit by s2 and count s2's term frequency in each dimension
    s2_cut_word_code = [word_dict[word] for word in s2_cut]
    print('s2 appear code: %s ' % s2_cut_word_code)
    s2_cut_word_fq = [0] * len(word_dict)
    for word in s2_cut:
        s2_cut_word_fq[word_dict[word]] += 1
    print('s2 word appear on each code: %s \n' % s2_cut_word_fq)

    # Compute cosine similarity
    sigma_sum, sq1, sq2 = 0, 0, 0
    for i in range(len(word_dict)):
        sigma_sum += s1_cut_word_fq[i] * s2_cut_word_fq[i]
        sq1 += pow(s1_cut_word_fq[i], 2)
        sq2 += pow(s2_cut_word_fq[i], 2)
    try:
        cosine = round(float(sigma_sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print('cosine similarity: %s \n\n' % cosine)
    return cosine


def check_en_similarity2(s1, s2):
    # tokenization
    x_list = word_tokenize(s1)
    y_list = word_tokenize(s2)

    # sw contains the list of stopwords
    sw = stopwords.words('english')
    others = ['!', ',', '.', '?', '-s', '-ly', '</s>', 's']
    sw.extend(others)
    l1 = []
    l2 = []

    # remove stop words from the string
    x_set = {w for w in x_list if not w.lower() in sw}
    y_set = {w for w in y_list if not w.lower() in sw}

    # form a set containing keywords of both strings
    rvector = x_set.union(y_set)
    for w in rvector:
        if w in x_set:
            l1.append(1)  # create a vector
        else:
            l1.append(0)
        if w in y_set:
            l2.append(1)
        else:
            l2.append(0)
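    # e.g. for the sample sentences in __main__, x_set should be {'know', 'love'} and
    # y_set {'love', 'much'}; walking rvector in the order ['know', 'love', 'much'] would
    # then give l1 = [1, 1, 0] and l2 = [0, 1, 1]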

    # cosine formula: for 0/1 vectors, sum(l) equals the squared vector norm,
    # so sqrt(sum(l1) * sum(l2)) is ||l1|| * ||l2||
    c = 0
    for i in range(len(rvector)):
        c += l1[i] * l2[i]
    try:
        cosine = round(c / float((sum(l1) * sum(l2)) ** 0.5), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print("similarity: ", cosine)
    return cosine


if __name__ == '__main__':
    # Chinese sentence test
    # sentence1 = '这只皮靴号码大了。那只号码合适'
    # sentence2 = '这只皮靴号码不小,那只更合适'
    # check_zh_similarity(sentence1, sentence2)

    # English sentence test
    # sentence3 = 'How to create ticket in ticket system?'
    # sentence4 = 'How can I create ticket?'
    sentence3 = 'Do you know I love you?'
    sentence4 = 'I love you so much?'
    check_en_similarity1(sentence3, sentence4)
    check_en_similarity2(sentence3, sentence4)
    print('end!')
 

Reprinted from blog.csdn.net/baidu_30809315/article/details/108622053