Splitting a large CSV data file into smaller files with Python (personally tested, works)

Reprinted from: https://www.cnblogs.com/FYZHANG/p/11629075.html

It ran successfully on the first try; thanks to the blogger for sharing.

 

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @FileName: Test.py
# @Software: PyCharm

import math
import os

import pandas as pd

# filename is the path of the file to split; file_num is the number of files to split it into.
# A different code path runs depending on whether the file has a header row; a header is assumed by default.
def Data_split(filename, file_num, header=True):
    """Split a comma-separated file into at most ``file_num`` smaller CSV files.

    The input is streamed twice in chunks, so it never has to fit in memory:
    the first pass counts the data rows, the second pass writes them out in
    evenly sized pieces.  Output files are named ``<stem>_<index><ext>``
    (``data.csv`` becomes ``data_0.csv``, ``data_1.csv``, ...), where
    ``<stem>`` keeps the directory part of ``filename``.

    Rows are parsed by ``pandas.read_csv`` and re-serialized by
    ``DataFrame.to_csv``; the output files contain neither a header row nor
    an index column.

    Args:
        filename: Path of the CSV file to split.
        file_num: Number of files to split into; must be at least 1.
        header: If true (the default), the first line of the input is treated
            as a header row and the file is decoded as GBK.  If false, every
            line is treated as data and pandas' default encoding is used.

    Raises:
        ValueError: If ``file_num`` is less than 1.
    """
    if file_num < 1:
        raise ValueError(
            'file_num must be at least 1, got {0!r}'.format(file_num))

    # The two modes differ only in how the file is read.
    read_options = {'sep': ','}
    if header:
        read_options['encoding'] = 'gbk'
    else:
        read_options['header'] = None

    # First pass: count the data rows without loading the whole file.
    count_chunk_rows = 10000
    total_rows = sum(
        len(chunk)
        for chunk in pd.read_csv(
            filename, chunksize=count_chunk_rows, **read_options)
    )
    if total_rows == 0:
        # Nothing to distribute, so no output files are created.
        return

    # Ceiling division guarantees that no more than file_num files are
    # produced and that only the last one can be shorter than the others.
    rows_per_file = max(1, math.ceil(total_rows / file_num))

    # Split "dir/name.ext" into "dir/name" and ".ext" so the chunk index can
    # be inserted just before the extension.
    stem, ext = os.path.splitext(filename)

    # Second pass: each chunk read becomes one output file.
    pieces = pd.read_csv(filename, chunksize=rows_per_file, **read_options)
    for index, piece in enumerate(pieces):
        piece.to_csv(
            '{0}_{1}{2}'.format(stem, index, ext), header=False, index=False)
        print('save data {0}'.format(index))

if __name__ == '__main__':
    # Placeholder: replace with the path of the CSV file to split.
    filename = 'file path'
    # Placeholder: replace with the number of files to split it into.
    num = 2
    Data_split(filename, num, header=True)

Probably because of a version difference, you may be prompted to use read_csv instead when running it.

In the last lines of the code, set filename to the path of the file you want to split, and num to the number of files you want to split it into.

Guess you like

Origin www.cnblogs.com/bravesunforever/p/12075670.html