import pandas as pd
import random
import jieba
import pandas as pd
# Load stopwords; the txt file contents can vary per project.
stopwords = pd.read_csv(
    'stopword.txt', index_col=False, quoting=3, sep='\t',
    names=['stopwords'], encoding='utf-8',
)
# Use a set for O(1) membership tests in the stopword filter below.
stopwords = set(stopwords['stopwords'].values)

# Load the corpus.
# BUG FIX: the pandas keyword is `sep`, not `seq` (original raised TypeError).
data = pd.read_csv('data.csv', encoding='utf-8', sep=',')
data.dropna(inplace=True)
# NOTE(review): assumes data.csv has a `segment` text column and a `label`
# target column -- confirm against the actual file. The original discarded
# the labels entirely, leaving y_train/y_test undefined further down.
labels = data.label.values.tolist()
data = data.segment.values.tolist()  # DataFrame column -> list of raw sentences


def preprocess(data):
    """Tokenize each line with jieba, then drop digits, whitespace-only
    tokens, single-character tokens and stopwords.

    Returns a list of space-joined token strings (one per surviving input
    line) -- the format CountVectorizer(analyzer='word') expects.
    """
    # BUG FIX: `sentences` was referenced without ever being defined.
    sentences = []
    for line in data:
        try:
            segs = jieba.lcut(line)                                   # tokenize
            segs = [v for v in segs if not str(v).isdigit()]          # remove pure digits
            segs = list(filter(lambda x: x.strip(), segs))            # remove whitespace-only tokens
            segs = list(filter(lambda x: len(x) > 1, segs))           # remove 1-char tokens
            segs = list(filter(lambda x: x not in stopwords, segs))   # remove stopwords
            # BUG FIX: join with spaces -- "".join glued the tokens back into
            # one unsegmentable string, defeating the word-level vectorizer.
            sentences.append(" ".join(segs))
        except Exception:
            # Best effort: report the offending line and keep going.
            print(line)
            continue
    return sentences


sentences = preprocess(data)

# Feature extraction: bag-of-words model (could be swapped for word2vec).
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(analyzer='word', max_features=4000)

# Split the corpus into train and test sets.
# BUG FIX: `for sklearn...` was a SyntaxError -- the keyword is `from`.
# BUG FIX: split the *preprocessed* sentences together with their labels;
# the original split the raw data and never defined y_train/y_test.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    sentences, labels, random_state=42,
)

# Fit the vocabulary on the training split only (avoids test-set leakage).
vec.fit(x_train)

# Build the model; any sklearn classifier could be dropped in here.
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)

# Mean accuracy on the test split. (The original comment said "AUC", but
# classifier.score() returns accuracy, not AUC.)
print(classifier.score(vec.transform(x_test), y_test))

# Predict labels for the test split.
pre = classifier.predict(vec.transform(x_test))
# 【NLP】NO4: text classification
# Source: adapted from blog.csdn.net/MARY197011111/article/details/99944162