# Understanding CountVectorizer and TfidfVectorizer from sklearn.feature_extraction.text

"""
理解sklearn中的CountVectorizer和TfidfVectorizer
"""
from collections import Counter

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

sentences = ["there is a dog dog", "here is a cat"]

# Bag-of-words: each row is a document, each column the count of one term.
count_vec = CountVectorizer()
a = count_vec.fit_transform(sentences)
print(a.toarray())
print(count_vec.vocabulary_)
# Example vocabulary_ output:
#   {'dog': 1, 'there': 4, 'here': 2, 'cat': 0, 'is': 3}
# i.e. the column index assigned to each term.

print("=" * 10)
tf_vec = TfidfVectorizer()
b = tf_vec.fit_transform(sentences)
print(b.toarray())
print(tf_vec.vocabulary_)
print(tf_vec.idf_)  # inverse document frequency per term
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; get_feature_names_out() is the supported replacement.
print(tf_vec.get_feature_names_out())


def mytf_idf(s, words=None):
    """Hand-rolled re-implementation of sklearn's default tf-idf transform.

    Computes smoothed tf-idf (matching TfidfVectorizer's defaults:
    smooth_idf=True, norm='l2') for the documents in ``s``, prints the
    resulting matrix, and returns it.

    :param s: list of whitespace-tokenizable document strings
    :param words: vocabulary defining the column order; defaults to the
        feature names of the fitted module-level ``tf_vec`` so the output
        can be compared against sklearn's
    :return: the L2-row-normalized tf-idf matrix, shape (len(s), len(words))
    """
    if words is None:
        # get_feature_names() was removed in scikit-learn 1.2;
        # get_feature_names_out() is the supported replacement.
        words = list(tf_vec.get_feature_names_out())
    tf_matrix = np.zeros((len(s), len(words)), dtype=np.float32)
    smooth = 1
    # Document frequency, pre-seeded with the smoothing term so that
    # idf = log((n_docs + smooth) / (df + smooth)) + 1, as sklearn does.
    df_matrix = np.ones(len(words), dtype=np.float32) * smooth
    for i, doc in enumerate(s):
        # Build the term counts once per document (the original rebuilt
        # the Counter inside the vocabulary loop — O(|doc|) per term).
        counts = Counter(doc.split())
        for j, word in enumerate(words):
            cnt = counts.get(word, 0)
            tf_matrix[i][j] = cnt
            if cnt > 0:
                df_matrix[j] += 1
    # With smoothing the ratio is >= 1, so every idf value is >= 1.
    idf_matrix = np.log((len(s) + smooth) / df_matrix) + 1
    matrix = tf_matrix * idf_matrix
    # L2-normalize each row (sklearn's norm='l2' default).
    matrix = matrix / np.linalg.norm(matrix, 2, axis=1).reshape(matrix.shape[0], 1)
    print(matrix)
    return matrix


# Run the hand-rolled tf-idf on the same corpus for comparison.
print("=" * 10)
mytf_idf(sentences)
# TODO:
# * IDF could be learned (e.g. via backpropagation in a neural network)
#   instead of being computed directly from document frequencies.
# * CountVectorizer sometimes only needs term presence, not term counts.

# Reposted from: www.cnblogs.com/weiyinfu/p/9558755.html