1. 去除标点
def removeBianDian(self, word):
    """Strip common ASCII and CJK punctuation characters from ``word``.

    Args:
        word: Text to clean. A byte string is decoded as UTF-8 first;
            text (unicode) input is used as-is.

    Returns:
        The text with every run of the listed punctuation characters
        removed. Characters not listed in the pattern (including spaces)
        are kept.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this check behaves
    # exactly like the old ``isinstance(word, str)`` there, while on
    # Python 3 it no longer calls the non-existent ``str.decode``.
    if isinstance(word, bytes):
        word = word.decode("utf8")
    # Same two character sets as before. Inside a character class none of
    # these characters needs a backslash, so the invalid string escapes
    # ("\.", "\!", "\/", "\)") are dropped; a trailing "-" stays literal.
    pattern = u"[.!/_,$%^*(+\"']+|[+——!,。??、~@·#¥%……&*(:))-]+"
    return re.sub(pattern, u"", word)
2. 全角转半角
def strQ2B(self, ustring):
    """Convert full-width characters in ``ustring`` to half-width ones.

    The ideographic space U+3000 becomes an ASCII space, and every code
    point in U+FF01..U+FF5E is shifted down by 0xFEE0 onto its ASCII
    counterpart. All other characters are left unchanged.

    Args:
        ustring: Text to convert. A byte string is decoded as UTF-8 first;
            text (unicode) input is used as-is.

    Returns:
        The converted text.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this matches the old
    # ``isinstance(ustring, str)`` check there and also works on Python 3,
    # where text strings have no ``decode`` method.
    if isinstance(ustring, bytes):
        ustring = ustring.decode("utf8")
    # Ordinal -> ordinal table; ``translate`` accepts it on both Python 2
    # unicode and Python 3 str, which avoids ``unichr`` (Python 2 only) and
    # the character-by-character string concatenation.
    half_width = {0x3000: 0x20}
    half_width.update(
        (code, code - 0xFEE0) for code in range(0xFF01, 0xFF5E + 1)
    )
    return ustring.translate(half_width)
3. 判断是否为unicode的中文
def isHanZiUnicode(self, value):
    """Return True if ``value`` falls inside one of the listed CJK ranges.

    Args:
        value: A numeric code point (e.g. the result of ``ord``).

    Returns:
        True when ``value`` lies within any of the inclusive ranges below,
        otherwise False.
    """
    # Inclusive (first, last) code point pairs, in the original check order.
    han_ranges = (
        (0x2E80, 0x2EFF),
        (0x2F00, 0x2FDF),
        (0x3100, 0x312F),
        (0x3190, 0x319F),
        (0x31A0, 0x31BF),
        (0x31C0, 0x31EF),
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FA5),
        (0x9FA6, 0x9FFF),
        (0xF900, 0xFA2D),
        (0xFA30, 0xFA6A),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0xFA70, 0xFAD9),
    )
    return any(first <= value <= last for first, last in han_ranges)
4. 判断是否为英文unicode编码
def isEnglishUnicode(self, value):
    """Return True if ``value`` is the code point of an ASCII letter.

    Args:
        value: A numeric code point (e.g. the result of ``ord``).

    Returns:
        True for 0x61..0x7A (a-z) or 0x41..0x5A (A-Z), otherwise False.
    """
    is_lower = 0x61 <= value <= 0x7A
    is_upper = 0x41 <= value <= 0x5A
    return is_lower or is_upper
5. 判断是否为数字的unicode编码
def isDigitUnicode(self, value):
    """Return True if ``value`` is the code point of an ASCII digit.

    Args:
        value: A numeric code point (e.g. the result of ``ord``).

    Returns:
        True for 0x30..0x39 (the characters 0-9), otherwise False.
    """
    return 0x30 <= value <= 0x39
6. 判断是否为常用标点
# Unicode code points of common CJK punctuation marks.
ubiaodian = [
    0x3002,  # ideographic full stop
    0xff0c,  # fullwidth comma
    0xff01,  # fullwidth exclamation mark
    0xff1f,  # fullwidth question mark
    0x3001,  # ideographic comma
    0x201c,  # left double quotation mark
    0x201d,  # right double quotation mark
    0x300a,  # left double angle bracket
    0x300b,  # right double angle bracket
]


def IsBiaoDian(self, value):
    """Return True if ``value`` is one of the code points in ``ubiaodian``.

    Args:
        value: A numeric code point (e.g. the result of ``ord``).

    Returns:
        True when ``value`` is listed in ``ubiaodian``, otherwise False.
    """
    return value in ubiaodian