NLP 文本常见预处理方法

1. 去除标点

    def removeBianDian(self,word):
        """Strip common ASCII and Chinese punctuation from *word*.

        Args:
            word: Text to clean. UTF-8 encoded ``bytes`` are decoded first;
                Unicode text is used as-is.

        Returns:
            The Unicode text with every character from the punctuation set
            removed. Whitespace, letters, digits and any punctuation not in
            the set are left untouched.
        """
        # Only byte strings need decoding. On Python 2 ``bytes is str``, so
        # this matches the original check there, and on Python 3 it no longer
        # calls the non-existent ``str.decode``.
        if isinstance(word, bytes):
            word = word.decode("utf8")
        # One character class holding the union of the two alternated classes
        # used previously; deleting runs of either class is the same as
        # deleting runs of their union. The hyphen is escaped so it is always
        # a literal, and no other character needs escaping inside the class.
        punctuation = u"[.!/_,$%^*(+\"'—。?、~@·#¥…&:)\\-]+"
        return re.sub(punctuation, u"", word)

2. 全角转半角

    def strQ2B(self,ustring):
        """Convert full-width characters in *ustring* to their half-width forms.

        The ideographic space (U+3000) becomes an ASCII space, and every code
        point in U+FF01..U+FF5E is shifted down by 0xFEE0 onto the matching
        ASCII character (U+0021..U+007E). All other characters are unchanged.

        Args:
            ustring: Text to convert. UTF-8 encoded ``bytes`` are decoded
                first; Unicode text is used as-is.

        Returns:
            The converted Unicode text.
        """
        # Only byte strings need decoding. On Python 2 ``bytes is str``, so
        # this matches the original check there, while Python 3 text is no
        # longer sent through the non-existent ``str.decode``.
        if isinstance(ustring, bytes):
            ustring = ustring.decode("utf8")
        # Ordinal-to-ordinal table; ``translate`` does the whole conversion in
        # one pass and avoids the Python-2-only ``unichr``.
        full_to_half = {0x3000: 0x20}
        full_to_half.update(
            (code, code - 0xFEE0) for code in range(0xFF01, 0xFF5F)
        )
        return ustring.translate(full_to_half)

3. 判断是否为unicode的中文

    def isHanZiUnicode(self,value):
        """Return True when code point *value* lies in a CJK-related range.

        Args:
            value: Integer Unicode code point (e.g. the result of ``ord``).

        Returns:
            True if *value* falls inside any of the inclusive ranges listed
            below, otherwise False.
        """
        # Inclusive (first, last) code point pairs, checked in order.
        cjk_ranges = (
            (0x2E80, 0x2EFF),    # CJK Radicals Supplement
            (0x2F00, 0x2FDF),    # Kangxi Radicals
            (0x3100, 0x312F),    # Bopomofo
            (0x3190, 0x319F),    # Kanbun
            (0x31A0, 0x31BF),    # Bopomofo Extended
            (0x31C0, 0x31EF),    # CJK Strokes
            (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
            (0x4E00, 0x9FA5),    # CJK Unified Ideographs
            (0x9FA6, 0x9FFF),    # CJK Unified Ideographs (remainder of block)
            (0xF900, 0xFA2D),    # CJK Compatibility Ideographs
            (0xFA30, 0xFA6A),    # CJK Compatibility Ideographs
            (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
            (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
            (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
            (0xFA70, 0xFAD9),    # CJK Compatibility Ideographs
        )
        return any(value >= first and value <= last for first, last in cjk_ranges)

4. 判断是否为英文unicode编码

    def isEnglishUnicode(self,value):
        """Return True when code point *value* is an ASCII letter.

        Args:
            value: Integer Unicode code point (e.g. the result of ``ord``).

        Returns:
            True for 0x61..0x7A (a-z) or 0x41..0x5A (A-Z), otherwise False.
        """
        is_lowercase = value >= 0x61 and value <= 0x7a
        is_uppercase = value >= 0x41 and value <= 0x5a
        return is_lowercase or is_uppercase

5. 判断是否为数字的unicode编码

    def isDigitUnicode(self,value):
        """Return True when code point *value* is an ASCII digit.

        Args:
            value: Integer Unicode code point (e.g. the result of ``ord``).

        Returns:
            True for 0x30..0x39 ('0'-'9'), otherwise False.
        """
        return value >= 0x30 and value <= 0x39

6. 判断是否为常用标点

# Unicode code points of the common punctuation marks recognised below:
#   0x3002  ideographic full stop
#   0xff0c  fullwidth comma
#   0xff01  fullwidth exclamation mark
#   0xff1f  fullwidth question mark
#   0x3001  ideographic comma
#   0x201c  left double quotation mark
#   0x201d  right double quotation mark
#   0x300a  left double angle bracket
#   0x300b  right double angle bracket
ubiaodian = [
    0x3002,
    0xff0c,
    0xff01,
    0xff1f,
    0x3001,
    0x201c,
    0x201d,
    0x300a,
    0x300b,
]


def IsBiaoDian(self,value):
    """Return True when code point *value* is listed in ``ubiaodian``.

    Args:
        value: Integer Unicode code point (e.g. the result of ``ord``).

    Returns:
        True if *value* is one of the common punctuation code points,
        otherwise False.
    """
    return value in ubiaodian

猜你喜欢

转载自blog.csdn.net/u011195431/article/details/82981139