[NLP]预处理--使用re正则表达式进行文本清理

文本清理:在自然语言处理中,尽管文本清理的具体做法受所做任务的影响比较大,但仍有一些通用的清理流程,比如是否有必要替换单位、货币、数学符号、数字。可以使用正则表达式工具将相应内容替换为标准内容。

工具:re(Python 标准库中的正则表达式模块)

输入:原始文本

输出:干净文本

流程:单位替换、缩略词替换、拼写校对、标点处理、符号替换、移除多余空格

代码:notebook

单位替换

将文本中的单位替换为统一格式,如:将4kgs、4kg统一替换为4 kg,将4k替换为4000,将 $100 或 100$ 替换为 100 dollar。

import random
import re
text = "I want to lose 4kgs in a month. What does 4k mean in a salary?What is the best way to make money with $100?"

# Unit normalisation. Each (pattern, template) pair is applied in order.
# The first three patterns end with a space, so a unit is only rewritten
# when a space follows it. \1 / \g<1> insert the captured digits.
unit_rules = (
    (r"(\d+)kgs ", r"\1 kg "),       # e.g. 4kgs => 4 kg
    (r"(\d+)kg ", r"\1 kg "),        # e.g. 4kg => 4 kg
    (r"(\d+)k ", r"\g<1>000 "),      # e.g. 4k => 4000
    (r"\$(\d+)", r"\1 dollar "),     # e.g. $100 => 100 dollar
    (r"(\d+)\$", r"\1 dollar "),     # e.g. 100$ => 100 dollar
)
for pattern, template in unit_rules:
    text = re.sub(pattern, template, text)

text
'I want to lose 4 kg in a month. What does 4000 mean in a salary?What is the best way to make money with 100 dollar ?'

缩略词替换

将文本中的缩略形式替换为完整单词,如can't、cannot替换为can not,'ve替换为have,c#替换为csharp等。

text = "Why India can't compete with China in manufacturing. What is the biggest scam you've ever seen? Why Should I Learn c#? "

# Contractions and abbreviations, expanded in the listed order.
# Order matters: "can't" is handled before the generic "n't" rule, and the
# catch-all "'s" rule runs last.
contraction_rules = (
    (r"can\'t", "can not"),
    (r"cannot", "can not "),
    (r"what\'s", "what is"),
    (r"What\'s", "what is"),
    (r"\'ve ", " have "),
    (r"n\'t", " not "),
    (r"i\'m", "i am "),
    (r"I\'m", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"c\+\+", "cplusplus"),
    (r"c \+\+", "cplusplus"),
    (r"c \+ \+", "cplusplus"),
    (r"c#", "csharp"),
    (r"f#", "fsharp"),
    (r"g#", "gsharp"),
    (r" e mail ", " email "),
    (r" e \- mail ", " email "),
    (r" e\-mail ", " email "),
    (r",000", '000'),
    (r"\'s", " "),
)
for pattern, replacement in contraction_rules:
    text = re.sub(pattern, replacement, text)

text
'Why India can not compete with China in manufacturing. What is the biggest scam you have ever seen? Why Should I Learn csharp? '

拼写校对

如将ph.d、PhD替换为phd,去掉多余空格,将缩写替换为全拼、将阿拉伯数字替换为英文数字、将美元复数替换为单数等。

text = "ph.d PhD pokemons e g fb usa 1 2 3 googling rs1 dollars"

# Spelling normalisation, applied top to bottom. Patterns wrapped in spaces
# only match a token that has a space on both sides, so a token at the very
# start or end of the string is left untouched.
spelling_rules = (
    (r"ph\.d", "phd"),
    (r"PhD", "phd"),
    (r"pokemons", "pokemon"),
    (r"pokémon", "pokemon"),
    (r"pokemon go ", "pokemon-go "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" 9 11 ", " 911 "),
    (r" j k ", " jk "),
    (r" fb ", " facebook "),
    (r"facebooks", " facebook "),
    (r"facebooking", " facebook "),
    (r"insidefacebook", "inside facebook"),
    (r"donald trump", "trump"),
    (r"the big bang", "big-bang"),
    (r"the european union", "eu"),
    (r" usa ", " america "),
    (r" us ", " america "),
    (r" u s ", " america "),
    (r" U\.S\. ", " america "),
    (r" US ", " america "),
    (r" American ", " america "),
    (r" America ", " america "),
    (r" quaro ", " quora "),
    (r" mbp ", " macbook-pro "),
    (r" mac ", " macbook "),
    (r"macbook pro", "macbook-pro"),
    (r"macbook-pros", "macbook-pro"),
    # Isolated single digits become English words.
    (r" 1 ", " one "),
    (r" 2 ", " two "),
    (r" 3 ", " three "),
    (r" 4 ", " four "),
    (r" 5 ", " five "),
    (r" 6 ", " six "),
    (r" 7 ", " seven "),
    (r" 8 ", " eight "),
    (r" 9 ", " nine "),
    (r"googling", " google "),
    (r"googled", " google "),
    (r"googleable", " google "),
    (r"googles", " google "),
    # Separate the "rs" marker from the digits next to it.
    (r" rs(\d+)", r" rs \1"),
    (r"(\d+)rs", r" rs \1"),
    (r"the european union", " eu "),
    (r"dollars", " dollar "),
)
for pattern, replacement in spelling_rules:
    text = re.sub(pattern, replacement, text)

text
'phd phd pokemon eg facebook america one two three  google  rs 1  dollar '

标点处理

在标点符号两侧加上空格,并将撇号(')替换为空格。

text = "1+1=2 What is the biggest scam you have ever seen?I am learning csharp?"

# Punctuation handling: surround each punctuation mark with spaces so it
# becomes its own token; the apostrophe is replaced by a single space.
text = re.sub(r"\+", " + ", text)
text = re.sub(r"'", " ", text)
text = re.sub(r"-", " - ", text)
text = re.sub(r"/", " / ", text)
# In a replacement template "\\" stands for one literal backslash; the raw
# string avoids the invalid "\ " escape of a plain string literal.
text = re.sub(r"\\", r" \\ ", text)
text = re.sub(r"=", " = ", text)
text = re.sub(r"\^", " ^ ", text)
text = re.sub(r":", " : ", text)
text = re.sub(r"\.", " . ", text)
text = re.sub(r",", " , ", text)
text = re.sub(r"\?", " ? ", text)
text = re.sub(r"!", " ! ", text)
text = re.sub(r"\"", " \" ", text)
text = re.sub(r"&", " & ", text)
text = re.sub(r"\|", " | ", text)
text = re.sub(r";", " ; ", text)
text = re.sub(r"\(", " ( ", text)
# A closing parenthesis must stay a closing parenthesis.
text = re.sub(r"\)", " ) ", text)

text
'1 + 1 = 2 What is the biggest scam you have ever seen ? I am learning csharp ? '

符号替换

将逻辑符号、数学符号和货币符号替换为对应的单词。

text = "1   + 1 =   2   ₹    "

# Symbol replacement: swap logical, arithmetic and currency symbols for
# words, padding each word with spaces.
symbol_rules = (
    (r"&", " and "),
    (r"\|", " or "),
    (r"=", " equal "),
    (r"\+", " plus "),
    (r"₹", " rs "),      # rupee sign; marked "test!" by the author
    (r"\$", " dollar "),
)
for pattern, word in symbol_rules:
    text = re.sub(pattern, word, text)

text
'1    plus  1  equal    2    rs     '

移除多余空格

# Remove extra whitespace: splitting on whitespace discards empty pieces, so
# re-joining with one space collapses runs and trims both ends.
pieces = text.split()
text = ' '.join(pieces)

text
'1 plus 1 equal 2 rs'

所有代码

# Ordered (pattern, replacement) rules used by clean_text. Order matters:
# every rule sees the output of the rules before it (for example "4kgs "
# first becomes "4 kg " and the spelling stage then turns " 4 " into
# " four "). Replacements are re.sub templates: \1 / \g<1> insert the first
# captured group and "\\" is one literal backslash.
_CLEANING_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # unit
        (r"(\d+)kgs ", r"\1 kg "),       # e.g. 4kgs => 4 kg
        (r"(\d+)kg ", r"\1 kg "),        # e.g. 4kg => 4 kg
        (r"(\d+)k ", r"\g<1>000 "),      # e.g. 4k => 4000
        (r"\$(\d+)", r"\1 dollar "),     # e.g. $100 => 100 dollar
        (r"(\d+)\$", r"\1 dollar "),     # e.g. 100$ => 100 dollar

        # acronym
        (r"can\'t", "can not"),
        (r"cannot", "can not "),
        (r"what\'s", "what is"),
        (r"What\'s", "what is"),
        (r"\'ve ", " have "),
        (r"n\'t", " not "),
        (r"i\'m", "i am "),
        (r"I\'m", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"c\+\+", "cplusplus"),
        (r"c \+\+", "cplusplus"),
        (r"c \+ \+", "cplusplus"),
        (r"c#", "csharp"),
        (r"f#", "fsharp"),
        (r"g#", "gsharp"),
        (r" e mail ", " email "),
        (r" e \- mail ", " email "),
        (r" e\-mail ", " email "),
        (r",000", "000"),
        (r"\'s", " "),

        # spelling correction
        (r"ph\.d", "phd"),
        (r"PhD", "phd"),
        (r"pokemons", "pokemon"),
        (r"pokémon", "pokemon"),
        (r"pokemon go ", "pokemon-go "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" 9 11 ", " 911 "),
        (r" j k ", " jk "),
        (r" fb ", " facebook "),
        (r"facebooks", " facebook "),
        (r"facebooking", " facebook "),
        (r"insidefacebook", "inside facebook"),
        (r"donald trump", "trump"),
        (r"the big bang", "big-bang"),
        (r"the european union", "eu"),
        (r" usa ", " america "),
        (r" us ", " america "),
        (r" u s ", " america "),
        (r" U\.S\. ", " america "),
        (r" US ", " america "),
        (r" American ", " america "),
        (r" America ", " america "),
        (r" quaro ", " quora "),
        (r" mbp ", " macbook-pro "),
        (r" mac ", " macbook "),
        (r"macbook pro", "macbook-pro"),
        (r"macbook-pros", "macbook-pro"),
        (r" 1 ", " one "),
        (r" 2 ", " two "),
        (r" 3 ", " three "),
        (r" 4 ", " four "),
        (r" 5 ", " five "),
        (r" 6 ", " six "),
        (r" 7 ", " seven "),
        (r" 8 ", " eight "),
        (r" 9 ", " nine "),
        (r"googling", " google "),
        (r"googled", " google "),
        (r"googleable", " google "),
        (r"googles", " google "),
        (r" rs(\d+)", r" rs \1"),
        (r"(\d+)rs", r" rs \1"),
        (r"dollars", " dollar "),

        # punctuation: pad every mark with spaces so it becomes a token;
        # the apostrophe is replaced by a single space
        (r"\+", " + "),
        (r"'", " "),
        (r"-", " - "),
        (r"/", " / "),
        (r"\\", r" \\ "),
        (r"=", " = "),
        (r"\^", " ^ "),
        (r":", " : "),
        (r"\.", " . "),
        (r",", " , "),
        (r"\?", " ? "),
        (r"!", " ! "),
        (r'"', ' " '),
        (r"&", " & "),
        (r"\|", " | "),
        (r";", " ; "),
        (r"\(", " ( "),
        (r"\)", " ) "),

        # symbol replacement
        (r"&", " and "),
        (r"\|", " or "),
        (r"=", " equal "),
        (r"\+", " plus "),
        (r"₹", " rs "),      # rupee sign; marked "test!" by the author
        (r"\$", " dollar "),
    )
)


def clean_text(text):
    """
    Clean text

    Applies, in order: unit normalisation, contraction/abbreviation
    expansion, spelling normalisation, punctuation spacing and symbol
    replacement, then collapses every run of whitespace to a single space
    and trims both ends.

    :param text: the string of text
    :return: text string after cleaning
    """
    for pattern, replacement in _CLEANING_RULES:
        text = pattern.sub(replacement, text)

    # remove extra space
    return ' '.join(text.split())

调用

# Sample input touching every cleaning stage: units, contractions, spelling
# fixes, punctuation, symbols and repeated spaces.
text = "I want    to lose    4kgs in a month. What does 4k mean in a salary?What is    the best way to make money with $100? Why India can't compete with China in manufacturing. What is the biggest    scam you've ever seen? Why Should I Learn c#? ph.d PhD   pokemons e g fb usa 1 2 3 googling rs1 dollars 1   +1=  2 ₹ ₹   ₹₹₹₹"

# Notebook echo of the raw input; the string literal below is the displayed value.
text
"I want    to lose    4kgs in a month. What does 4k mean in a salary?What is    the best way to make money with $100? Why India can't compete with China in manufacturing. What is the biggest    scam you've ever seen? Why Should I Learn c#? ph.d PhD   pokemons e g fb usa 1 2 3 googling rs1 dollars 1   +1=  2 ₹ ₹   ₹₹₹₹"
# Run the full pipeline; the string literal below is the displayed result.
clean_text(text)
'I want to lose four kg in a month . What does 4000 mean in a salary ? What is the best way to make money with 100 dollar ? Why India can not compete with China in manufacturing . What is the biggest scam you have ever seen ? Why Should I Learn csharp ? phd phd pokemon eg facebook america one two three google rs 1 dollar one plus 1 equal two rs rs rs rs rs rs'

猜你喜欢

转载自blog.csdn.net/wmq104/article/details/82931352
今日推荐