NLP Keyword Weighting Algorithms: A Summary with Python Implementations (comprehensive, continuously updated)

Term Frequency

The frequency with which a word is used in the language material.
In scenarios such as new-word discovery and hot-word discovery, the higher a word's frequency, the higher its weight.
Term frequency can be counted at the sentence level, the paragraph level, or the document level:
Document level: a word counts zero times if it does not appear in the article, and once if it appears at all.
Sentence level: a word counts zero times if it does not appear in the sentence, and once if it appears at all.
Finest-grained level: every occurrence is counted.
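A minimal sketch of the three counting granularities; the helper names and the character-level tokenizer are placeholders of mine, not from the original post:

from collections import Counter

def doc_level(docs, tokenize):
    """Document level: a word counts at most once per document."""
    return Counter(w for d in docs for w in set(tokenize(d)))

def sentence_level(doc, tokenize, sep='。'):
    """Sentence level: a word counts at most once per sentence."""
    return Counter(w for s in doc.split(sep) for w in set(tokenize(s)))

def occurrence_level(doc, tokenize):
    """Finest-grained level: every occurrence is counted."""
    return Counter(tokenize(doc))

print(occurrence_level('清水水里', list))  # Counter({'水': 2, '清': 1, '里': 1})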

TFIDF

TF (Term Frequency): term frequency
$TF = \frac{\text{occurrences of the term in the document}}{\text{total number of terms in the document}}$

IDF (Inverse Document Frequency): inverse document frequency
$IDF = \log\left(\frac{\text{total number of documents}}{\text{number of documents containing the term} + 1}\right)$
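A quick worked example of the two formulas on a toy corpus of three pre-segmented documents (the corpus is illustrative only; the log10 base and the +1 smoothing match the implementation below):

from math import log10

docs = [['奶茶', '好喝'], ['巧克力', '奶茶', '好吃'], ['巧克力', '好吃']]
word = '奶茶'

tf = docs[1].count(word) / len(docs[1])                      # 1/3 ≈ 0.333
idf = log10(len(docs) / (sum(word in d for d in docs) + 1))  # log10(3/3) = 0.0
print(tf, idf, tf * idf)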

Drawbacks of TFIDF

  1. When the corpus contains only one type of document, IDF loses its meaning. For example, if every document is about cars, the car-related terms that should receive higher weights end up with low IDF.
  2. In time-sensitive text, internet buzzwords appear frequently for a period of time; these hot words should receive higher weights, but IDF assigns them lower weights instead.
from collections import Counter
from math import log10
from re import split
from jieba.posseg import dt
FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())

def cut(text):
    for sentence in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip()):
        for w in dt.cut(sentence):
            if len(w.word) > 1 and w.flag in FLAGS:
                yield w.word

class TFIDF:
    def __init__(self):
        self.idf = None
        self.idf_max = None

    def fit(self, texts):
        # Reduce each document to its set of distinct words
        texts = [set(cut(text)) for text in texts]
        lent = len(texts)
        words = set(w for t in texts for w in t)
        # IDF with +1 smoothing on the document frequency
        self.idf = {w: log10(lent / (sum((w in t) for t in texts) + 1)) for w in words}
        # Fallback IDF for out-of-vocabulary words
        self.idf_max = log10(lent)
        return self

    def get_idf(self, word):
        return self.idf.get(word, self.idf_max)

    def extract(self, text, top_n=10):
        # Each occurrence adds the word's IDF, so a word's score is its count × IDF
        counter = Counter()
        for w in cut(text):
            counter[w] += self.get_idf(w)
        return [i[0] for i in counter.most_common(top_n)]

tfidf = TFIDF().fit(['奶茶', '巧克力奶茶', '巧克力酸奶', '巧克力', '巧克力']*2)
print(tfidf.extract('酸奶巧克力奶茶'))

Word Position in the Text

Two kinds of position-based weights are provided below.

Coarse-grained

Word position | Weight
Title | 10
Last word of the title | 20
First sentence | 4
Last sentence | 3
First paragraph | 3
Last paragraph | 2
Other | 1
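A minimal sketch of how the coarse-grained table might be applied; the weight values come from the table above, while the document splitting and the helper name position_weight are assumptions of mine:

def position_weight(word, title, paragraphs):
    """Return the coarse-grained position weight of a word (sketch)."""
    sentences = [s for p in paragraphs for s in p.split('。') if s]
    if word in title:
        return 20 if title.endswith(word) else 10
    if word in sentences[0]:
        return 4
    if word in sentences[-1]:
        return 3
    if word in paragraphs[0]:
        return 3
    if word in paragraphs[-1]:
        return 2
    return 1

print(position_weight('奶茶', '巧克力奶茶测评', ['第一段提到奶茶。', '第二段只有巧克力。']))  # 10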

Fine-grained

"""https://blog.csdn.net/Yellow_python/article/details/104580509"""
from sklearn.gaussian_process import GaussianProcessRegressor
from jieba import lcut

# Relative positions in [0, 1] and their target weights (head and tail of the text weighted highest)
X = [[0], [.1], [.2], [.3], [.4], [.5], [.6], [.7], [.8], [.9], [1]]
Y = [[1], [.2], [.04], [.02], [.01], [0], [0], [.01], [.03], [.1], [.5]]

class GPR:
    """高斯过程回归"""
    def __init__(self):
        self.model = GaussianProcessRegressor()
        self.model.fit(X, Y)

    def predict(self, position):
        return self.model.predict([[position]])[0]

    def extract(self, text, judge):
        words = lcut(text)
        le = len(words) - 1
        entities = [(self.predict(i/le)[0], words[i]) for i in range(le+1) if judge(words[i])]
        return entities

def visualization():
    from matplotlib import pyplot as mp
    w = [[i / 500] for i in range(501)]
    z = GPR().model.predict(w)
    mp.scatter(X, Y, s=66, color='g')
    mp.scatter(w, z, s=6, color='r')
    mp.show()

"""实体抽取并返回权重"""
print(GPR().extract('剑圣联合守望者斩杀大法师', lambda x: x in {
    
    '剑圣', '大法师', '守望者'}))
"""权重分布可视化"""
visualization()

Example sentence: 剑圣联合守望者斩杀大法师

Word | Position | Relative position | Weight
剑圣 | 0 | 0% | 1.00
守望者 | 2 | 50% | 0.00
大法师 | 4 | 100% | 0.50

Word Length

Two kinds of word-length weights are provided below.

Word length | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10
$weight = 1 - \frac{1}{length}$ | 0.00 | 0.50 | 0.67 | 0.75 | 0.80 | 0.83 | 0.86 | 0.88 | 0.89 | 0.90
$weight = \sqrt{length}$ | 1 | 1.414 | 1.732 | 2 | 2.236 | 2.449 | 2.646 | 2.828 | 3 | 3.162
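A two-line check of both formulas (my own snippet), reproducing the table values:

lengths = range(1, 11)
print([round(1 - 1 / n, 2) for n in lengths])  # 0.0, 0.5, 0.67, 0.75, ...
print([round(n ** 0.5, 3) for n in lengths])   # 1.0, 1.414, 1.732, 2.0, ...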

Word Span

When a word's frequency is 1, its weight is 1.
When the frequency is greater than 1, two kinds of span-based weights are provided below.

Simple version

$weight = 1 + \frac{\text{position of the last occurrence} - \text{position of the first occurrence}}{\text{total length}}$

Complex version

$weight = \prod_{i=1}^{n-1}\left(1 + \frac{\text{position}_{i+1} - \text{position}_i}{\text{total length}}\right)$

def span1(text):
    """Word span + word frequency (simple version)"""
    words = list(text)  # should be a word segmenter returning a list; list() is a placeholder here
    reversed_words = words[::-1]
    le = len(words)
    return {w: 2 - ((words.index(w) + reversed_words.index(w) + 1) / le) for w in set(words)}

def span2(text):
    """Word span + word frequency (complex version)"""
    c, position = dict(), dict()
    words = list(text)  # should be a word segmenter returning a list; list() is a placeholder here
    length = len(words)
    for i in range(length):
        word = words[i]
        if word not in c:
            c[word] = 1
        else:
            c[word] *= 1 + ((i - position[word]) / length)
        position[word] = i
    return c

_text = '清水水水水水水水里清'
print(span1(_text))
print(span2(_text))

Example sentence: 清水水水水水水水里清

Word | Simple-version weight | Complex-version weight
清 | 1.9 | 1.9
水 | 1.6 | 1.771561
里 | 1 | 1

Part of Speech

In general, content words outrank function words, and nouns get relatively high weights.
As for how to set the actual weight values, we can gather some annotated data and build a POS-weight model, as sketched below.
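A minimal sketch of a hand-set POS-weight table applied with jieba's POS tagger; the weight values and the helper name pos_weights are illustrative assumptions, stand-ins for what a model trained on annotated data would learn:

from jieba.posseg import lcut

# Illustrative POS weights: nouns highest, verbs and adjectives lower, everything else lowest
POS_WEIGHT = {'n': 1.0, 'nr': 1.0, 'ns': 1.0, 'vn': 0.8, 'v': 0.6, 'a': 0.5}

def pos_weights(text, default=0.1):
    return [(w.word, POS_WEIGHT.get(w.flag, default)) for w in lcut(text)]

print(pos_weights('车主打开车盖'))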

Relation Between Words and the Topic

Give words related to the topic higher weights.

For example, the sentence 【车主打开车盖,用苹果照了一下水箱】 ("the car owner opened the hood and used an iPhone to light up the radiator") belongs to the automotive topic, so the noun 水箱 (radiator) should receive a higher weight than the noun 苹果 (Apple/iPhone).

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from re import split
from jieba import cut
from collections import Counter
from numpy import argmax
from pandas import DataFrame

def segment(text):
    for phrase in split('[^a-zA-Z\u4e00-\u9fa5]+', text.strip()):
        for word in cut(phrase):
            yield word

def clf_word(texts, labels, clf=MultinomialNB()):
    """Classify words with the given classifier and save the results to Excel"""
    # Vectorize
    vectorizer = TfidfVectorizer(tokenizer=segment)
    x = vectorizer.fit_transform(texts)
    # Fit the classifier
    clf.fit(x, labels)
    classes = clf.classes_
    print(clf.__class__.__name__, clf.score(x, labels), *classes)
    # Classify each word
    c = Counter(w for t in texts for w in segment(t)).most_common()
    ls = []
    for word, freq in c:
        predict_proba = clf.predict_proba(vectorizer.transform([word]))[0]  # class probabilities
        label = classes[argmax(predict_proba)]  # predicted class
        ls.append([freq, word, label, *predict_proba])
    df = DataFrame(ls, columns=['freq', 'word', 'label', *classes])
    df.to_excel('a.xlsx', index=False)  # save to Excel

if __name__ == '__main__':
    from data9 import X, Y  # import the 9-class news corpus
    clf_word(X, Y)

Negated Sentences

In specific negation contexts, a keyword's weight should drop to zero (or even turn negative, as in the example below).

Example sentence: 大力发展智能冰箱、智能空调、智能热水器等高新技术(不含智能手机和智能电视)
Extracted entities and their weights: 【智能冰箱: 2】【智能空调: 2】【智能热水器: 2】【智能手机: -1】【智能电视: -1】

{'不包含': -1, '不包括': -1, '不含': -1, '除外': -1, '包含': 1, '包括': 1}
re.compile(r'(?<![无没])不[^a-zA-Z\W\d_限无没]|除外')
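A minimal sketch of how negation cue words might be used to flip entity weights; the split-on-parentheses logic, the base weight of 2, and the helper name entity_weights are assumptions of mine that reproduce the example above rather than a general implementation:

import re

NEGATION = {'不包含', '不包括', '不含', '除外'}

def entity_weights(text, entities, base=2):
    weights = {}
    # Treat the parenthesized part as a separate clause (assumption for this example)
    for clause in re.split('[(())]', text):
        negated = any(cue in clause for cue in NEGATION)
        for e in entities:
            if e in clause:
                weights[e] = -1 if negated else base
    return weights

print(entity_weights('大力发展智能冰箱、智能空调、智能热水器等高新技术(不含智能手机和智能电视)',
                     ['智能冰箱', '智能空调', '智能热水器', '智能手机', '智能电视']))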

Naturally Decaying Weights

When certain trigger words appear, the weights of the words that follow them are boosted and then decay naturally.

from matplotlib import pyplot as mp
# Trigger words and their boost weights
x2y = {'b': 20, 'c': 10, 'd': 30}
# Build toy data: 100 positions with triggers at 15, 30 and 75
length = 100
x = ['a'] * length
x[15], x[30], x[75] = 'b', 'c', 'd'
# Compute weights: each trigger boosts the following positions, decaying linearly with distance
y = [0] * length
for i in range(length):
    if x[i] in x2y:
        weight = x2y[x[i]]
        for j in range(i + 1, length):
            y[j] += max(0, weight - abs(i - j))
mp.plot(y)
mp.show()

TextRank

from collections import defaultdict
from jieba.posseg import dt

ALLOW_POS = frozenset(('ns', 'n', 'vn', 'v'))


class WeightedUndigraph:
    d = 0.85

    def __init__(self):
        self.graph = defaultdict(list)

    def add_edge(self, start, end, weight):
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        ws = defaultdict(float)
        outSum = defaultdict(float)

        wsdef = 1.0 / (len(self.graph) or 1.0)
        for n, out in self.graph.items():
            ws[n] = wsdef
            outSum[n] = sum((e[2] for e in out), 0.0)

        for x in range(10):  # 10 iterations
            for n in sorted(self.graph.keys()):  # sort for a stable iteration order
                s = 0
                for e in self.graph[n]:
                    s += e[2] / outSum[e[1]] * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s

        min_rank, max_rank = min(ws.values()), max(ws.values())
        for n, w in ws.items():
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)  # normalize weights; no need to multiply by 100
        return ws


class TextRank:
    def __init__(self):
        self.tokenizer = dt
        self.allow_pos = ALLOW_POS
        self.span = 5

    def flag_filter(self, wp):
        return (wp.flag in self.allow_pos) and (len(wp.word.strip()) >= 2)

    def text_rank(self, sentence, n=20, with_weight=False, allow_pos=ALLOW_POS, with_flag=False):
        """
        Parameters:
            - n: number of keywords to return
            - with_weight: whether to return weights along with the keywords
            - allow_pos: allowed POS tags
            - with_flag: whether to return POS tags
        """
        self.allow_pos = frozenset(allow_pos)
        g = WeightedUndigraph()
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.flag_filter(wp):
                for j in range(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.flag_filter(words[j]):
                        continue
                    if allow_pos and with_flag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1
        for terms, w in cm.items():
            g.add_edge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if with_weight:
            tags = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
        return tags[:n] if n else tags


_t = '越来越多的国产汽车出现在大众的眼中,国产汽车的整体性能也在不断优化,但和老牌的欧美汽车相比还存在着一定的差距'
print(TextRank().text_rank(_t, with_weight=True, with_flag=True))

Contextual Feature Vectors

To be developed.

To close, a few failed experiments

Failure 1:
I tried using the variance of a word's embedding values to compute its weight, but this failed; the results looked like this:
High-frequency word vector [5 5 5 5 5 5 0 -5 -5 -5]: high variance
Medium-frequency word vector [4 3 3 3 3 3 0 -3 -3 -3]: medium variance
Low-frequency word vector [3 1 1 1 1 1 0 -1 -1 -1]: low variance

from re import split, fullmatch
from gensim.models import Word2Vec
from jieba import cut
from numpy import var
from pandas import DataFrame

def lcut(text):
    return [w for s in split('[\n。…;;!!??]+', text) for w in cut(s) if fullmatch('[a-zA-Z\u4e00-\u9fa5]+', w)]

def word2vector(texts):
    """词向量建模"""
    sentences = [lcut(t) for t in texts]
    wv = Word2Vec(sentences, size=75, window=10, sg=1).wv
    DataFrame([(w, var(wv[w]), *wv[w]) for w in wv.index2word], columns=[
        'word', 'weight', *(str(i) for i in range(75))]).to_excel('b.xlsx', index=False)

from data9 import X  # 9-class news corpus; the labels are not needed here
word2vector(X)

Failure 2:
Dispersion of a word's distribution over document types: a word whose distribution has higher variance (i.e., is concentrated in a few types of documents rather than spread evenly) should receive a higher weight.
For example, given two words with the same IDF, if the first is concentrated in one class of documents while the second is scattered across documents of different types, the first word should receive the higher weight.

I tried using a topic model over long documents cut into short pieces to measure how concentrated each word's topic distribution is, but this also failed: most words' topic distributions came out fairly even, while some stop words actually had the most concentrated topic distributions.

from gensim import corpora, models
import re, jieba, numpy as np, pandas as pd

def word_lda(texts, num_topics=50):
    # Tokenize: split each text into sentences, then into words
    words_ls = [[w for w in jieba.cut(s) if re.fullmatch('[a-zA-Z\u4e00-\u9fa5]+', w)]
                for t in texts for s in re.split('[\n。…;;!!??]+', t)]
    # Build the dictionary
    dictionary = corpora.Dictionary(words_ls)
    # Use the dictionary to turn each token list into a sparse bag-of-words vector
    corpus = [dictionary.doc2bow(words) for words in words_ls]
    # LDA model; num_topics sets the number of topics
    lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    # Topic-word probability matrix, normalized so each word's column sums to 1
    matrix = lda.state.get_lambda()
    matrix = matrix / np.sum(matrix, axis=0)
    # Group words by their most probable topic and save to Excel
    pd.DataFrame({
        'word': [dictionary[i] for i in range(len(dictionary))],
        'topic': np.argmax(matrix, axis=0),
        'probability': np.max(matrix, axis=0),
    }).sort_values(by=['topic', 'probability'], ascending=False).to_excel('word_lda.xlsx', index=False)

from data9 import X  # 9-class news corpus; the labels are not needed here
word_lda(X)


Reposted from blog.csdn.net/Yellow_python/article/details/108275976