Python Text Features and Classification

1. Sentiment Analysis

# A simple example
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier

text1 = 'I like the movie so much!'
text2 = 'That is a good movie.'
text3 = 'This is a great one.'
text4 = 'That is a really bad movie.'
text5 = 'This is a terrible movie.'
def proc_text(text):
    """
        Preprocess the text
    """
    # Tokenize
    raw_words = nltk.word_tokenize(text)
    # Lemmatize
    wordnet_lematizer = WordNetLemmatizer()
    words = [wordnet_lematizer.lemmatize(raw_word) for raw_word in raw_words]
    # Remove stop words
    filtered_words = [word for word in words if word not in stopwords.words('english')]

    # True means the word appears in the text; this dict format is what NLTK's classifiers expect
    return {word: True for word in filtered_words}
# Build the training samples
train_data = [[proc_text(text1), 1],
              [proc_text(text2), 1],
              [proc_text(text3), 1],
              [proc_text(text4), 0],
              [proc_text(text5), 0]]
print(train_data)
# Train the model
nb_model = NaiveBayesClassifier.train(train_data)

# Test the model
text6 = 'That is a not bad one.'
print(nb_model.classify(proc_text(text6)))
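
Beyond the hard 0/1 label, the trained NLTK classifier can also report class probabilities and its most informative word features, which is useful for sanity-checking such a tiny model. A small sketch reusing nb_model and proc_text from above:

# Class probabilities for the test sentence
prob_dist = nb_model.prob_classify(proc_text(text6))
print(prob_dist.prob(1), prob_dist.prob(0))

# Which word features weigh most in the decision
nb_model.show_most_informative_features(5)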

2. Text Similarity

import nltk
from nltk import FreqDist
text1 = 'I like the movie so much '
text2 = 'That is a good movie '
text3 = 'This is a great one '
text4 = 'That is a really bad movie '
text5 = 'This is a terrible movie'
text = text1 + text2 + text3 + text4 + text5
words = nltk.word_tokenize(text)
freq_dist = FreqDist(words)
print(freq_dist['That'])
print(freq_dist)
# Take the n = 5 most common words
n = 5
# Build the "common words" list
most_common_words = freq_dist.most_common(n)
print(most_common_words)
def lookup_pos(most_common_words):
    """
        查找常用单词的位置
    """
    result = {}
    pos = 0
    for word in most_common_words:
        result[word[0]] = pos
        pos += 1
    return result
# Record the positions
std_pos_dict = lookup_pos(most_common_words)
print(std_pos_dict)
# A new text
new_text = 'That one is a good movie. This is so good!'
# Initialize the frequency vector
freq_vec = [0] * n
# Tokenize
new_words = nltk.word_tokenize(new_text)
# Count word frequencies against the common-word list
for new_word in new_words:
    if new_word in std_pos_dict:
        freq_vec[std_pos_dict[new_word]] += 1
print(freq_vec)
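
The frequency vector by itself is only a representation; to turn it into an actual similarity score, a common choice is the cosine of the angle between two such vectors. A small sketch building on the code above; vectorize and cosine_similarity are helper functions introduced here for illustration, not part of the original post:

import math

def vectorize(text, std_pos_dict, n):
    """Build a frequency vector over the common-word positions (helper for illustration)."""
    vec = [0] * n
    for w in nltk.word_tokenize(text):
        if w in std_pos_dict:
            vec[std_pos_dict[w]] += 1
    return vec

def cosine_similarity(v1, v2):
    """Cosine similarity between two equal-length vectors."""
    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = math.sqrt(sum(a * a for a in v1))
    norm2 = math.sqrt(sum(b * b for b in v2))
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

vec1 = vectorize(text1, std_pos_dict, n)
vec2 = vectorize(new_text, std_pos_dict, n)
print(cosine_similarity(vec1, vec2))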

3. Text Classification and TF-IDF

# 3.1 TF-IDF in NLTK
from nltk.text import TextCollection

text1 = 'I like the movie so much '
text2 = 'That is a good movie '
text3 = 'This is a great one '
text4 = 'That is a really bad movie '
text5 = 'This is a terrible movie'

# Build a TextCollection object
tc = TextCollection([text1, text2, text3, 
                        text4, text5])
new_text = 'That one is a good movie. This is so good!'
word = 'That'
tf_idf_val = tc.tf_idf(word, new_text)
print('The TF-IDF value of {} is: {}'.format(word, tf_idf_val))
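
One detail worth noting: TextCollection.tf simply divides text.count(term) by len(text), so when new_text is passed as a raw string the count is substring- and character-based. If word-level term frequencies are wanted, the new text can be tokenized first; a small sketch reusing tc from above:

import nltk

# Word-level term frequencies: score every distinct token of the new text
new_tokens = nltk.word_tokenize(new_text)
for w in set(new_tokens):
    print('TF-IDF of {}: {}'.format(w, tc.tf_idf(w, new_tokens)))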

# 3.2 TF-IDF in sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
feat = vectorizer.fit_transform([text1, text2, text3, text4, text5])
print(feat.toarray())
print(vectorizer.get_feature_names())
feat_array = feat.toarray()
print(feat_array.shape)
print(feat_array[0:2, :])
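
The fitted vectorizer can also map unseen text into the same feature space via transform (words outside the fitted vocabulary are simply ignored). A small sketch reusing new_text from the NLTK example above:

# Vectorize a new document with the already-fitted vocabulary
new_feat = vectorizer.transform([new_text])
print(new_feat.toarray())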

# 3.3 Chinese TF-IDF
import os
import re
import jieba.posseg as pseg

ch_text1 = ' 非常失望,剧本完全敷衍了事,主线剧情没突破大家可以理解,可所有的人物都缺乏动机,正邪之间、妇联内部都没什么火花。团结-分裂-团结的三段式虽然老套但其实也可以利用积攒下来的形象魅力搞出意思,但剧本写得非常肤浅、平面。场面上调度混乱呆板,满屏的铁甲审美疲劳。只有笑点算得上差强人意。'
ch_text2 = ' 2015年度最失望作品。以为面面俱到,实则画蛇添足;以为主题深刻,实则老调重弹;以为推陈出新,实则俗不可耐;以为场面很high,实则high劲不足。气!上一集的趣味全无,这集的笑点明显刻意到心虚。全片没有任何片段给我有紧张激动的时候,太弱了,跟奥创一样。'
ch_text3 = ' 《铁人2》中勾引钢铁侠,《妇联1》中勾引鹰眼,《美队2》中勾引美国队长,在《妇联2》中终于……跟绿巨人表白了,黑寡妇用实际行动告诉了我们什么叫忠贞不二;而且为了治疗不孕不育连作战武器都变成了两支验孕棒(坚决相信快银没有死,后面还得回来)'
ch_text4 = ' 虽然从头打到尾,但是真的很无聊啊。'
ch_text5 = ' 剧情不如第一集好玩了,全靠密集笑点在提神。僧多粥少的直接后果就是每部寡姐都要换着队友谈恋爱,这特么比打斗还辛苦啊,真心求放过~~~(结尾彩蛋还以为是洛基呢,结果我呸!)'
ch_texts = [ch_text1, ch_text2, ch_text3, ch_text4, ch_text5]
# Load the stop word lists
stop_words_path = './stop_words/'
stopwords1 = [line.rstrip() for line in open(os.path.join(stop_words_path, '中文停用词库.txt'), 'r', encoding='utf-8')]
stopwords2 = [line.rstrip() for line in open(os.path.join(stop_words_path, '哈工大停用词表.txt'), 'r', encoding='utf-8')]
stopwords3 = [line.rstrip() for line in open(os.path.join(stop_words_path, '四川大学机器智能实验室停用词库.txt'), 'r', encoding='utf-8')]
stopwords = stopwords1 + stopwords2 + stopwords3
print(len(stopwords))

def proc_text(raw_line):
    """
        Preprocess a line of text
        Returns the segmentation result as a space-joined string
    """
    # 1. Use a regular expression to strip non-Chinese characters
    filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
    chinese_only = filter_pattern.sub('', raw_line)
    # 2. jieba segmentation + POS tagging
    word_list = pseg.cut(chinese_only)
    # 3. Remove stop words and keep only meaningful POS tags
    # verbs, adjectives, adverbs
    used_flags = ['v', 'a', 'ad']
    meaningful_words = []
    for word, flag in word_list:
        if (word not in stopwords) and (flag in used_flags):
            meaningful_words.append(word)
    return ' '.join(meaningful_words)
corpus = [proc_text(ch_text) for ch_text in ch_texts]
print(corpus)
ch_vectorizer = TfidfVectorizer()
ch_feats = ch_vectorizer.fit_transform(corpus)
print(ch_vectorizer.get_feature_names())
print(ch_feats.toarray()[0, :])
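
With the TF-IDF matrix in hand, a classifier can be trained on top of it, which is what the section title points toward. A minimal sketch using sklearn's MultinomialNB; the sentiment labels and the extra review below are made up purely to illustrate the API and are not part of the original data:

from sklearn.naive_bayes import MultinomialNB

# Hypothetical sentiment labels for the five reviews (0 = negative, 1 = positive), for illustration only
labels = [0, 0, 1, 0, 0]
clf = MultinomialNB()
clf.fit(ch_feats, labels)

# A made-up new review, pushed through the same preprocessing and vectorizer
new_review = '特效不错,但是剧情实在太无聊了。'
new_review_feat = ch_vectorizer.transform([proc_text(new_review)])
print(clf.predict(new_review_feat))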


Reposted from blog.csdn.net/happy5205205/article/details/80930387