# Datawhale_day3

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score
import os
import pandas as pd

# Location of the training CSV, resolved relative to the current working
# directory. Each path component is passed to os.path.join separately so the
# platform's own separator is used; a hard-coded "\\" inside a single string
# only forms a valid path on Windows.
data_set = os.path.join(os.getcwd(), "数据集", "train_set.csv", "train_set.csv")
# The file is read as tab-separated and only the first 15000 rows are loaded.
train_df = pd.read_csv(data_set, sep='\t', nrows=15000)


def Count_Vector():
    """Train and score a bag-of-words + ridge classifier baseline.

    Reads the 'text' and 'label' columns of the module-level ``train_df``.
    The first 10000 rows are used for fitting the classifier and the
    remaining rows for validation. The macro-averaged F1 score on the
    validation rows is printed; nothing is returned.
    """
    split_at = 10000

    # NOTE: the vocabulary is learned from every loaded row, including the
    # rows that are later used for validation.
    bag_of_words = CountVectorizer(max_features=3000)
    features = bag_of_words.fit_transform(train_df['text'])

    model = RidgeClassifier()
    labels = train_df['label'].values
    model.fit(features[:split_at], labels[:split_at])

    predicted = model.predict(features[split_at:])
    score = f1_score(labels[split_at:], predicted, average='macro')
    print(score)


def TF_IDF():
    """Train and score a TF-IDF + ridge classifier baseline.

    Reads the 'text' and 'label' columns of the module-level ``train_df``.
    Features are TF-IDF weights over 1- to 3-grams, capped at 3000 features.
    The first 10000 rows are used for fitting the classifier and the
    remaining rows for validation. The macro-averaged F1 score on the
    validation rows is printed; nothing is returned.
    """
    split_at = 10000

    # NOTE: the vectorizer is fitted on every loaded row, including the rows
    # that are later used for validation.
    weighting = TfidfVectorizer(ngram_range=(1, 3), max_features=3000)
    features = weighting.fit_transform(train_df['text'])

    model = RidgeClassifier()
    labels = train_df['label'].values
    model.fit(features[:split_at], labels[:split_at])

    predicted = model.predict(features[split_at:])
    score = f1_score(labels[split_at:], predicted, average='macro')
    print(score)


if __name__ == '__main__':
    # Run both baselines in turn; each prints its own validation F1 score.
    for run_baseline in (Count_Vector, TF_IDF):
        run_baseline()

# (Blog page footer: "You may also like")

# Reposted from blog.csdn.net/qq_38890412/article/details/107580464