from collections import Counter
from jieba import cut
def statistics(X, Y):
    """Print a '|'-separated table of per-word label statistics.

    For every word appearing in the corpus, report the most frequent
    label among the texts containing that word, how often that label
    occurs (max_freq), the total number of texts containing the word
    (amount), and max_freq/amount as a percentage.

    X: list of raw text strings (segmented here with jieba.cut).
    Y: list of labels, one per text, aligned with X.
    """
    length = len(Y)
    # Header row followed by a markdown-style separator ('-|-|-|-|-').
    print('|'.join(['word', 'label', 'max_freq', 'amount', 'probability']),
          '|'.join('-' * 5), sep='\n')
    # set(cut(x)) counts each word at most once per document, so `total`
    # is the document frequency of the word.
    for w, total in Counter(w for x in X for w in set(cut(x))).most_common():
        # NOTE(review): membership is tested on the raw string (`w in X[i]`),
        # i.e. substring match, not on the tokenized text — confirm intended.
        c = Counter(Y[i] for i in range(length) if w in X[i])
        label, frequency = c.most_common()[0]
        print(w, label, frequency, total,
              '%.1f%%' % (frequency / total * 100), sep='|')
# Tiny single-label corpus: one label per document.
texts = ['小米小米', '苹果和橙', '小米和苹果']
labels = ['phone', 'fruit', 'phone']
# Print the per-word label-frequency table for the corpus.
statistics(texts, labels)
word|label|max_freq|amount|probability
-|-|-|-|-
小米|phone|2|2|100.0%
苹果|fruit|1|2|50.0%
和|fruit|1|2|50.0%
橙|fruit|1|1|100.0%
多标签多分类
from collections import Counter
import jieba
def statistics(X, Y):
    """Print a '|'-separated per-word table for a multi-label corpus.

    For every word, report its document frequency and, for each label,
    the percentage of word-containing documents that carry that label.

    X: list of pre-tokenized documents (each a set/iterable of words).
    Y: list of label lists, one per document, aligned with X.
    """
    length = len(Y)
    # Column order is the sorted set of all labels seen in Y.
    labels = sorted(set(i for y in Y for i in y))
    # Header row, then a markdown-style separator ('-|-|-|-|-').
    print('|'.join(['word', 'frequency', *('【%s】' % i for i in labels)]),
          '|'.join('-' * 5), sep='\n')
    # X is already tokenized, so this counts document frequency directly.
    for w, total in Counter(w for x in X for w in x).most_common():
        # Count every label of every document that contains the word.
        c = Counter(y for i in range(length) if w in X[i] for y in Y[i])
        print(w, total,
              *('%d%%' % (c[i] / total * 100) for i in labels), sep='|')
# Teach jieba the celebrity name so it is kept as a single token.
jieba.add_word('刘诗诗')
texts = ['刘诗诗吃苹果', '苹果手机', '刘诗诗代言小米']
# Pre-tokenize each text into a set of distinct words.
X = [set(jieba.cut(text)) for text in texts]
# Multiple labels per document.
Y = [['entertain', 'fruit'], ['phone'], ['entertain', 'phone']]
statistics(X, Y)
word|frequency|【entertain】|【fruit】|【phone】
-|-|-|-|-
刘诗诗|2|100.0%|50.0%|50.0%
苹果|2|50.0%|50.0%|50.0%
吃|1|100.0%|100.0%|0.0%
手机|1|0.0%|0.0%|100.0%
代言|1|100.0%|0.0%|100.0%
小米|1|100.0%|0.0%|100.0%
机器学习
from jieba import cut
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# Alternative classifiers, kept for reference:
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the toy corpus (one label per document).
X = ['小米小米', '苹果和橙', '小米和苹果']
Y = ['phone', 'fruit', 'phone']
# Vectorize: TF-IDF over jieba-segmented tokens.
vectorizer = TfidfVectorizer(tokenizer=cut)
x = vectorizer.fit_transform(X)
# Classification model (supervised).
clf = MultinomialNB()
# clf = LogisticRegression()
# clf = DecisionTreeClassifier()
# clf = RandomForestClassifier()
clf.fit(x, Y)
# Word-frequency table over the whole corpus.
words = Counter(word for text in X for word in cut(text))
print('word', 'freq', 'label', 'probability', sep=' | ')
print('-|-|-|-')
# Predict a label (and its confidence) for each word in isolation.
for word, freq in words.most_common():
    pred = clf.predict(vectorizer.transform([word]))[0]
    probability = max(clf.predict_proba(vectorizer.transform([word]))[0])
    print(word, freq, pred, probability, sep=' | ')
from jieba import cut
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
# Load the toy corpus (one label per document).
X = ['小米小米', '苹果和橙', '小米和苹果']
Y = ['phone', 'fruit', 'phone']
# Vectorize texts: TF-IDF over jieba-segmented tokens.
vectorizer = TfidfVectorizer(tokenizer=cut)
x = vectorizer.fit_transform(X)
# Fit four classifiers on the same features for side-by-side comparison.
models = [
    MultinomialNB().fit(x, Y),
    # On sklearn 0.22 pass solver='liblinear' to LogisticRegression.
    LogisticRegression().fit(x, Y),
    DecisionTreeClassifier().fit(x, Y),
    RandomForestClassifier().fit(x, Y),
]
# Word-frequency table over the whole corpus.
words = Counter(word for text in X for word in cut(text))
# Print the sklearn version used for the run.
import sklearn
print(sklearn.__version__)
print('word', *[m.__class__.__name__ for m in models], sep=' | ')
print('-|-|-|-|-')
# Predict each word with every model; the trailing ('', 0) row shows
# what each model outputs for empty input.
for w, _ in words.most_common() + [('', 0)]:
    print(w, *[
        m.predict(vectorizer.transform([w]))[0]
        + ' {:.2f}%'.format(max(m.predict_proba(vectorizer.transform([w]))[0]) * 100)
        for m in models
    ], sep=' | ')