from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score
import os
import pandas as pd
# Location of the training CSV, relative to the current working directory.
# The path components are passed separately so os.path.join inserts the
# platform's own separator instead of a hard-coded Windows backslash.
data_set = os.path.join(os.getcwd(), "数据集", "train_set.csv", "train_set.csv")
# The file is tab-separated; only the first 15000 rows are loaded.
train_df = pd.read_csv(data_set, sep='\t', nrows=15000)
def Count_Vector(train_size=10000, max_features=3000):
    """Train a ridge classifier on bag-of-words counts and report macro F1.

    The first ``train_size`` rows of the module-level ``train_df`` are used
    for training and all remaining rows for validation.  Text is read from
    the ``'text'`` column and targets from the ``'label'`` column.

    Args:
        train_size: Number of leading rows of ``train_df`` used for training.
        max_features: Vocabulary size cap passed to ``CountVectorizer``.

    Returns:
        The macro-averaged F1 score on the validation rows.  The score is
        also printed.
    """
    texts = train_df['text']
    labels = train_df['label'].values

    vectorizer = CountVectorizer(max_features=max_features)
    # Fit the vocabulary on the training rows only; fitting on every row
    # would let validation text influence which features are kept.
    train_features = vectorizer.fit_transform(texts.iloc[:train_size])
    val_features = vectorizer.transform(texts.iloc[train_size:])

    clf = RidgeClassifier()
    clf.fit(train_features, labels[:train_size])
    val_pred = clf.predict(val_features)

    score = f1_score(labels[train_size:], val_pred, average='macro')
    print(score)
    return score
def TF_IDF(train_size=10000, max_features=3000):
    """Train a ridge classifier on TF-IDF n-gram features and report macro F1.

    The first ``train_size`` rows of the module-level ``train_df`` are used
    for training and all remaining rows for validation.  Text is read from
    the ``'text'`` column and targets from the ``'label'`` column.  Features
    are TF-IDF weights over 1- to 3-grams.

    Args:
        train_size: Number of leading rows of ``train_df`` used for training.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.

    Returns:
        The macro-averaged F1 score on the validation rows.  The score is
        also printed.
    """
    texts = train_df['text']
    labels = train_df['label'].values

    tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=max_features)
    # Fit the vocabulary and IDF weights on the training rows only; fitting
    # on every row would leak validation statistics into the features.
    train_features = tfidf.fit_transform(texts.iloc[:train_size])
    val_features = tfidf.transform(texts.iloc[train_size:])

    clf = RidgeClassifier()
    clf.fit(train_features, labels[:train_size])
    val_pred = clf.predict(val_features)

    score = f1_score(labels[train_size:], val_pred, average='macro')
    print(score)
    return score
if __name__ == '__main__':
    # Run both baselines in turn; each prints its validation macro F1.
    Count_Vector()
    TF_IDF()
# Datawhale_day3
# 猜你喜欢
# 转载自 blog.csdn.net/qq_38890412/article/details/107580464
# 今日推荐
# 周排行