Implementing text classification with TensorFlow 2.0

Dataset

A dataset of English comments (airline tweets) labeled with one of three sentiment classes: positive, negative, or neutral.

Import libraries

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re

Read in data

data=pd.read_csv('dataset/Tweets.csv')
data.head()

The dataset has many columns, but only two are used: text (the comment content) and airline_sentiment (the label).

data=data[['airline_sentiment','text']]
data.head()

Process the labels: convert the string labels to integers

data=data.sample(len(data))  # shuffle the rows
def trans_label(label):
    if label=='negative':
        return 0
    elif label=='positive':
        return 1
    else:
        return 2

data['label']=data['airline_sentiment'].apply(trans_label)
del data['airline_sentiment']
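
A quick optional sanity check of the resulting class distribution, using standard pandas:

data['label'].value_counts()  # number of comments per class: 0=negative, 1=positive, 2=neutral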

Process comment data

token=re.compile(r'[A-Za-z]+|[!?,.()]')  # keep only upper- and lower-case letters plus the punctuation !?,.(); all other special characters are filtered out

def reg_text(text):
    new_text=token.findall(text)
    new_text=[word.lower() for word in new_text]  # lowercase every word
    return new_text

# Note: for a Chinese dataset, segment the text with the jieba library instead; the subsequent steps are the same
data['text']=data.text.apply(reg_text)
data

After tokenization, each row of data['text'] is a list containing that comment's words.
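
As a quick illustration of the tokenizer (the sample sentence below is made up):

print(reg_text('I loved the flight, but the food was awful!'))
# ['i', 'loved', 'the', 'flight', ',', 'but', 'the', 'food', 'was', 'awful', '!']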

Build the vocabulary and convert comments into numeric features

datax=data['text']
y=data['label']
x_train,x_test,y_train,y_test = train_test_split(datax,y,test_size=0.2,random_state=0)

word_set=set()  # vocabulary of all words, built from the training set only so the test results stay trustworthy
for text in x_train:
    for word in text:
        word_set.add(word)
max_word=len(word_set)+2
# the vocabulary size is the number of distinct words plus 2: one index for the padding token and one for the unknown token.
# Sentences have different lengths and are padded to a common length; test-set words missing from the vocabulary map to the unknown token

word_list=list(word_set)
word_index=dict((word,index+2) for index,word in enumerate(word_list))
# map each word to an index starting at 2: index 0 is reserved for padding and index 1 for unknown words
# (enumerate avoids the quadratic cost of calling word_list.index for every word)

x_train_data=x_train.apply(lambda x:[word_index.get(word,1) for word in x])
x_test_data=x_test.apply(lambda x:[word_index.get(word,1) for word in x])
# words that do not appear in the vocabulary are mapped to 1 (the unknown token)

max_len=max(len(x) for x in x_train_data)  # length of the longest training comment; all sentences are padded to this length (it need not be the maximum, any reasonable length works)

x_train_data=tf.keras.preprocessing.sequence.pad_sequences(x_train_data.values,maxlen=max_len)
x_test_data=tf.keras.preprocessing.sequence.pad_sequences(x_test_data.values,maxlen=max_len)
# pad the sequences; 0 is used as the padding value by default
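
A tiny illustration of what pad_sequences does (padding='pre' is the default, so zeros are added at the front):

tf.keras.preprocessing.sequence.pad_sequences([[5,6],[7]],maxlen=4)
# array([[0, 0, 5, 6],
#        [0, 0, 0, 7]])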

Model construction and training

model=keras.Sequential([
    layers.Embedding(max_word,50,input_length=max_len),
    layers.LSTM(64),
    layers.Dense(3,activation='softmax')
])
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['acc'])
model.fit(x_train_data,y_train,epochs=10,batch_size=128,validation_data=(x_test_data,y_test))
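
After training, a new comment can be classified by running it through the same preprocessing pipeline. A minimal sketch (the helper name and the sample comment are my own, not from the original post):

def predict_sentiment(text):
    words=reg_text(text)  # same tokenization as training
    seq=[[word_index.get(word,1) for word in words]]  # unknown words map to index 1
    seq=tf.keras.preprocessing.sequence.pad_sequences(seq,maxlen=max_len)
    probs=model.predict(seq)[0]
    return ['negative','positive','neutral'][np.argmax(probs)]  # same order as trans_label

predict_sentiment('The crew was friendly and the flight was on time!')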

Origin blog.csdn.net/weixin_44599230/article/details/121363057