Text analysis with NLP: processing text data (tokenization and stopword removal)

import nltk
import spacy
import numpy as np
import pandas as pd
from gensim.models import word2vec
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag

# Requires the NLTK data packages 'punkt' and 'stopwords'
# (nltk.download('punkt'), nltk.download('stopwords')).
tokenizer = ToktokTokenizer()

# Start from NLTK's English stopword list, then extend it with punctuation
# and symbols that should also be filtered out.
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.extend([
    '.', '*', '(', ')', '-', '--', ',', '...', '......', '[', ']', '/', ':',
    '?', '}', '{', '#', '@', '&', '=', '+', '>', '<', ';', "'", '``', '*/'
])


# Load the small English spaCy model (tagger, parser and NER are enabled by default).
nlp = spacy.load('en_core_web_sm')


def fenci(text):
    # "fenci" = tokenize: split raw text into a list of tokens with NLTK
    tokens = nltk.word_tokenize(text)
    return tokens
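# A quick sanity check of fenci (assumes the NLTK 'punkt' tokenizer data has been downloaded):
print(fenci("Text mining is fun."))   # expected: ['Text', 'mining', 'is', 'fun', '.']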
    
def stem_text(text):
    # Reduce each token to its Porter stem (e.g. "running" -> "run"),
    # then join the stems back into a single string.
    ps = nltk.porter.PorterStemmer()
    tokenizer = nltk.tokenize.toktok.ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    stem_tokens = [ps.stem(token.strip()) for token in tokens]
    text = ' '.join(stem_tokens)
    return text
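# Illustrative call; Porter stemming is rule-based, so some stems are not dictionary words:
print(stem_text("the running dogs jumped easily"))   # roughly: "the run dog jump easili"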

def lemmatize_text(text):
    # Use spaCy to lemmatize each token. In spaCy v2 pronouns are lemmatized
    # to the placeholder '-PRON-', so the original token is kept in that case.
    text = nlp(text)
    lemma_word = [word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text]
    text = ' '.join(lemma_word)
    return text
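# Illustrative call; the exact lemmas depend on the spaCy model version:
print(lemmatize_text("The mice were running"))   # typically: "the mouse be run"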

def remove_stopwords(text, is_lower_case=False):
    # Drop stopwords (and the punctuation added to stopword_list above).
    # If the text is already lower-cased, compare tokens directly;
    # otherwise lower-case each token only for the comparison.
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_tokens = np.array(filtered_tokens)
    return filtered_tokens
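# Illustrative call; stopwords and the listed punctuation are filtered out:
print(remove_stopwords("This is a simple test , with punctuation ."))
# expected: something like ['simple' 'test' 'punctuation'] (a NumPy array of tokens)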

def normalize_corpus(corpus, doc_tokenize=True, text_stemming=False,
                     text_lemmatization=False,
                     stopword_removal=True, text_lower_case=True):
    # Apply the selected preprocessing steps to each document in the corpus.
    # Note: fenci() returns a list of tokens while the other steps expect a
    # string, so doc_tokenize is normally used on its own (the calls below
    # pass doc_tokenize=False).
    normalized_corpus = []
    for doc in corpus:
        if doc_tokenize:
            doc = fenci(doc)
        if text_stemming:
            doc = stem_text(doc)
        if text_lemmatization:
            doc = lemmatize_text(doc)
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
        normalized_corpus.append(doc)
    return normalized_corpus
# new3 and new5 hold the raw text data
t5 = normalize_corpus(new5, doc_tokenize=False, text_stemming=False,
                      text_lemmatization=False,
                      stopword_removal=True, text_lower_case=False)
data5 = list(t5)
t3 = normalize_corpus(new3, doc_tokenize=False, text_stemming=False,
                      text_lemmatization=False,
                      stopword_removal=True, text_lower_case=False)
data3 = list(t3)
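# new3 and new5 are built elsewhere in the original post. As a purely
# hypothetical sketch, they could be lists of raw strings taken from a
# pandas DataFrame (the file and column names below are illustrative only):
# df = pd.read_csv('text_data.csv')
# new3 = df['text3'].astype(str).tolist()
# new5 = df['text5'].astype(str).tolist()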


Reposted from blog.csdn.net/m0_53112875/article/details/125089875