# 词袋模型把文本(段落或者文档)看作无序的词汇集合,忽略语法甚至单词的顺序,对每一个单词进行统计并计算其出现的次数,常被用于文本分类,如贝叶斯算法、LDA 和 LSA 等。
import jieba
# 首先,引入 jieba 分词器、语料和停用词。
# Punctuation marks treated as stop tokens; they are removed after segmentation.
punctuation = [",", "。", ":", ";", "?"]

# Toy corpus: three short sentences that share most of their vocabulary.
content = [
    "机器学习带动人工智能飞速的发展。",
    "深度学习带动人工智能飞速的发展。",
    "机器学习和深度学习带动人工智能飞速的发展。",
]

# Segment every sentence into a list of words with jieba.lcut().
segs_1 = [jieba.lcut(con) for con in content]
print(segs_1)

# Drop punctuation tokens from each segmented sentence.
tokenized = [
    [word for word in sentence if word not in punctuation]
    for sentence in segs_1
]
print(tokenized)

# Build the vocabulary: the union of all remaining tokens, duplicates removed.
# dict.fromkeys() keeps first-seen order, so the vocabulary (and therefore the
# column order of the vectors below) is identical on every run. list(set(...))
# would reorder the words from run to run because of string hash randomization.
bag_of_words = list(dict.fromkeys(x for item in tokenized for x in item))
print(bag_of_words)

# Encode each sentence as a 0/1 vector over the vocabulary:
# 1 if the vocabulary word occurs in the sentence, 0 otherwise.
bag_of_word2vec = []
for sentence in tokenized:
    present = set(sentence)  # O(1) membership test per vocabulary word
    bag_of_word2vec.append(
        [1 if token in present else 0 for token in bag_of_words]
    )
print(bag_of_word2vec)
# 原文:
# https://soyoger.blog.csdn.net/article/details/108729409