数据清洗----python

#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import re
import sys

# Financial (anti-forgery) Chinese numerals, indexed by digit value 0-9.
nums = ['零', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖']
# Jiao/fen labels for a fractional part. convert() does not use them because
# it truncates its argument to an integer; kept for backward compatibility.
decimal_label = ['角', '分']
# Place labels inside one 4-digit group: ones, tens, hundreds, thousands.
small_int_label = ['', '拾', '佰', '仟']


def convert(n):
    """Spell a non-negative integer with Chinese financial numerals.

    The decimal digits are cut into 4-digit groups from the right. Each group
    is spelled with the 拾/佰/仟 place labels and then tagged with the unit of
    the group (none, 万, 亿, 万亿, ...). Zeros inside a group collapse to a
    single 零, trailing zeros of a group are dropped, and a group that is
    entirely zero is skipped.

    Args:
        n: An int, or anything ``int()`` accepts such as a string of digits.
            The fractional part of a float is discarded by ``int()``.
            Negative values are not supported.

    Returns:
        The spelled-out number, e.g. ``convert(12345)`` gives
        '壹万贰仟叁佰肆拾伍'. Zero is spelled '零'.
    """
    digits = str(int(n))
    if digits == '0':
        # Without this the number would silently vanish from the text.
        return nums[0]

    # Least-significant group first.
    groups = []
    while digits:
        groups.append(digits[-4:])
        digits = digits[:-4]

    res = []
    for index, group in enumerate(groups):
        # Pair each digit with its place label (aligned from the ones digit),
        # then restore most-significant-first order. list() is required
        # because zip() returns a non-subscriptable iterator on Python 3.
        pairs = list(zip(group[::-1], small_int_label))[::-1]
        tmp = ''.join(nums[int(x)] + (y if x != '0' else '') for x, y in pairs)
        tmp = tmp.rstrip('零').replace('零零零', '零').replace('零零', '零')
        if tmp:
            # Unit of the index-th group: '', 万, 亿, 万亿, 亿亿, ...
            res.append(tmp + '万' * (index % 2) + '亿' * (index // 2))
    return ''.join(res[::-1])

# Convert every run of ASCII digits in a string to Chinese numerals.
def num_deal(s):
    """Return ``s`` with each run of ASCII digits spelled in Chinese numerals.

    A run of exactly four digits is spelled digit by digit using ``nums``;
    any other run is passed to ``convert`` and spelled as a quantity.

    All runs are replaced in a single left-to-right pass, so a short number
    can never rewrite part of a longer number that happens to contain it
    (e.g. "12" inside "123").

    Args:
        s: The text to process.

    Returns:
        The text with every digit run replaced.
    """
    def _to_hanzi(match):
        digits = match.group()
        if len(digits) == 4:
            return ''.join(nums[int(d)] for d in digits)
        return convert(digits)

    return re.sub(r'[0-9]+', _to_hanzi, s)


# Strip punctuation from a line and spell its numbers in Chinese.
def change2(nval):
    """Replace punctuation with spaces and convert digit runs to Chinese.

    Every run of characters other than ASCII letters, ASCII digits, CJK
    ideographs in U+4E00..U+9FA5 and the newline (U+000A) is replaced by a
    single space. The result is then passed through ``num_deal``.

    Args:
        nval: One line of input, either UTF-8 encoded bytes or already
            decoded text.

    Returns:
        The cleaned text.
    """
    text = nval
    # Decode only raw bytes. Calling decode() on text that is already decoded
    # fails (no such method on Python 3 str; implicit ASCII encode on a
    # Python 2 unicode object).
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    text = re.sub(u'[^0-9a-zA-Z\u4E00-\u9FA5\u000A]+', ' ', text)
    return num_deal(text)


# Decide whether a line is worth keeping.
def change1(val):
    """Return 1 if the line should be kept, None if it should be dropped.

    A line is dropped when it is None, when it is shorter than 3 items, or
    when the first run of ASCII letters/digits found in it is longer than
    10 characters. Only the first such run is examined.
    """
    if val is None or len(val) < 3:
        return None
    first_run = re.search(u'[0-9a-zA-Z]+', val)
    too_long = first_run is not None and len(first_run.group()) > 10
    return None if too_long else 1


# Script body: read the file named by the first command-line argument, clean
# each line, and write every new fragment longer than 3 characters to res.txt.
#
# `ls` holds the fragments already written, in first-seen order. `seen`
# mirrors it so the duplicate check is a set lookup instead of a list scan.
ls=[]
seen=set()
# The with-statement closes both files even if processing a line raises.
with open(sys.argv[1],"r") as fr, open("res.txt","w") as fw:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in fr:
        if change1(line) is None:
            continue
        line2=change2(line) #punctuation replaced by spaces, numbers spelled out
        # Split on single spaces into fragments.
        for x in line2.split(' '):
            if len(x) > 3 and x not in seen:
                seen.add(x)
                ls.append(x)
                # Fragments are written back to back with no added separator.
                fw.write(x.encode('utf-8'))

猜你喜欢

转载自blog.csdn.net/w_manhong/article/details/80005405