python csv_merge.py

”’

Author: Qingfeng Xia
Date: 20/03/2012
Usage:
API: merge(csv_files,outputfilename=”test.csv”, horizontally=False, header_checking = True, header_count=None)
csv_files: list of file names
command line:
csv_merge.py –help

csv data format requirement:

rows*columns matrix data layout is presumed, i.e. surrogate layout is not checked and supported
the last line of header should contain the field name

specify the header lines count to assist the distinguishment the header and value lines

vertical merge: csv should have the common header and column count, horizontal merge: should have the same row count

##########################example ################################# import csv_merge
#set the correct dialect variable (before calling merge()) if necessary csv_merge.dialect=yourdialect
csv_merge.merge([‘input1.csv’,
‘input1.csv’],’output_horizontally.csv’, True,False,1)
#call the test method by
csv_merge.test()

 

”’

import argparse   # new in python 2.7, but can be installed to python 2.x
from os.path import exists, isfile
import sys

import csv
#logging.debug()

#using this to control the
dialect=None
debug=0

def warn(str):
print “Warning: “,str

def debug_print(info, obj=None):
if debug:
print info, obj

def merge(csv_files,outputfilename=”test.csv”,  horizontally=False, header_checking = True, header_count=None):
# open file to write

# your list of csv files ,     csv_files = [file1, file2]
data_layout_check=False
#output_data=[]
#header_checking = True
#output_layout={}
#header_count = 0
#output_header=[]
# iterate through your list of inputfile
fileaccount=0
#csvContent =[]
for filex in csv_files:
# mark the lines that are header lines
with open(filex, “rb”) as f:
try:
reader = csv.reader(f,dialect=dialect)
csvContent = list(reader)     # list of list

except csv.Error,e:
debug_print(” Error when convert to csv reader for file: “, filex)
sys.exit(‘file %s, line %d: %s’ % (filex, reader.line_num, e))

debug_print( “process the file: “, filex)
debug_print(‘csvContent = ‘, csvContent ) #why empty in the second file?

layout=get_layout(csvContent,header_count)
debug_print(‘layout=’,layout)
#hc=header_count(csvContent, 1)
hc=layout[‘header_count’]

if fileaccount == 0:   # for the first file
#~ hc=1
#~ csvContent =  [[‘Column0’, ‘Column1’, ‘Column2’, ‘Column3’, ‘Column4’, ‘Column5’, ‘Column6’, ‘Column7’, ‘Column8’, ‘Column9′], [’00’, ’01’, ’02’, ’03’, ’04’, ’05’, ’06’, ’07’, ’08’, ’09’], [’10’, ’11’, ’12’, ’13’, ’14’, ’15’, ’16’, ’17’, ’18’, ’19’], [’20’, ’21’, ’22’, ’23’, ’24’, ’25’, ’26’, ’27’, ’28’, ’29’], [’30’, ’31’, ’32’, ’33’, ’34’, ’35’, ’36’, ’37’, ’38’, ’39’], [’40’, ’41’, ’42’, ’43’, ’44’, ’45’, ’46’, ’47’, ’48’, ’49’], [’50’, ’51’, ’52’, ’53’, ’54’, ’55’, ’56’, ’57’, ’58’, ’59’], [’60’, ’61’, ’62’, ’63’, ’64’, ’65’, ’66’, ’67’, ’68’, ’69’], [’70’, ’71’, ’72’, ’73’, ’74’, ’75’, ’76’, ’77’, ’78’, ’79’], [’80’, ’81’, ’82’, ’83’, ’84’, ’85’, ’86’, ’87’, ’88’, ’89’], [’90’, ’91’, ’92’, ’93’, ’94’, ’95’, ’96’, ’97’, ’98’, ’99’]]
#~ output_layout={‘header_count’:1,
#~ ‘rows’:10,
#~ ‘columns’:10 }
output_layout=layout
if hc>0:
output_header=csvContent[:hc]
else:
output_header=[[]]
output_data=csvContent[hc:]
#print “output_data”, output_data

else:  # for the data file to be appended                   fileaccount>0
if hc>0: header=csvContent[:hc]
else: header=[[]]
if header_checking:
if hc != header_count:
warn(‘ the header count is not match the first file’)
# raise Warning(msg)      it is exception
#break -> continue
if horizontally==False:
#further compare the header, DO NOT  use pass, it will exit this func!!!
warn(‘ header fields numbe check is not implemented’)

data=csvContent[hc:]  # remove header
if data_layout_check :
if horizontally:
if output_layout[‘rows’] !=layout[‘rows’]:
warn(‘rows should match for horizontal merge’)
else:
if output_layout[‘columns’] !=layout[‘columns’]:
warn(‘columns should match for vertical merge’)
if output_header[hc-1] !=header[hc-1]:
warn(‘columns fields name does not match for vertical merge’)

if horizontally:
print ‘begin to merge csv files horizontally’
output_layout[‘columns’]=output_layout[‘columns’]+layout[‘columns’]
output_header[output_layout[‘header_count’]-1]=output_header[output_layout[‘header_count’]-1] + header[hc-1]
output_data = [a+b for (a,b) in zip(output_data, data)]  #error here
else:  #vertically
#output_header
output_layout[‘rows’]=output_layout[‘rows’]+layout[‘rows’]
output_data=output_data+data
#debug
debug_print(‘data = ‘,data)
debug_print(‘header line count = ‘, hc)
debug_print(“output_data”, output_data)

fileaccount=fileaccount+1

#To save such a result, you can use:
#how to write back header?
debug_print(“Layout of merged csv files: “,output_layout)

writer = csv.writer(open(outputfilename, “wb”))
writer.writerows(output_header)
writer.writerows(output_data)

#layout should be namedtuple or a dict

def get_layout(contentlist, headerlinescount=None):
if len(contentlist) == 0 or contentlist==None:
return {‘header_count’:0,
‘rows’:0,
‘columns’:0 }
if headerlinescount == None:
hc=header_count(contentlist)
else:
hc=header_count(contentlist)
layout={‘header_count’:hc,
‘rows’:len(contentlist)-hc,
‘columns’:len(contentlist[hc]) }
return layout

def isnumeric(x):
ret=True
try:
float(x) if ‘.’ in x else int(x)
except ValueError:
#print “Not a numeric string.”
ret=False
return ret

“””
it depends on the current csv.dialect to test the
some csv file provide the header count in the first line.
if contentlist[0] == contentlist[-1] >1 is must NOT a header,
isdigit(contentlist[r][0][0])
“””
def header_count(contentlist):
hc=0
for r in contentlist:
if not any( [isnumeric(s) for s in r ]):
hc=hc+1
return hc

def test():
print _parse(u’ -o output.csv -i input1.csv –horizontally’.split())
#
print isnumeric(‘abc’)  #False!
print isnumeric(‘1e-6’) #False!

#generate test data input files
N=10
r=[[str(i)+str(j) for j in range(N)] for i in range(N)]
#print r
header=[‘Column’+str(i) for i in range(N)] #
#print header
w=csv.writer(open(‘input1.csv’, “wb”))
#w.writerow(‘this is a multiple lines header example’)
w.writerow(header)
w.writerows(r)

#~ r=[[str(i)+str(i)+str(j) for j in range(N)] for i in range(N)]
#~ w2=csv.writer(open(‘input2.csv’, “wb”))
#~ w2.writerow(header)
#~ w2.writerows(r)

merge([‘input1.csv’, ‘input1.csv’],’output_horizontally.csv’, True,False,1)
merge([‘input1.csv’, ‘input1.csv’], ‘output_vertically.csv’, False)

def _parse(argv):

ilist=[]
outputfile=””
horizontally=True
inputlist=[]
#parse
parser = argparse.ArgumentParser(description=’Merge multiple csv files into one csv file’)
parser.add_argument(‘–output’, ‘-o’, dest=’outputfile’,   #action=    choices=[]
default=”output.csv”,  required=True,  metavar=’outputfilename’,
help=’output file name (default: output.csv)’ )
parser.add_argument(‘–input’,’-i’, dest=’inputfilelist’,   nargs=’*’,  # generate a list
type=file, required=True, metavar=’inputfile’,
help=’list of input file names (default: empty list)’ )
parser.add_argument(‘–header-count’, ‘-hc’, dest=’headerlinescount’,
type=int, required=False, metavar=’lines count of header’, default=None,
help=’lines count of header, specify it if the auto test the file structure failed ‘ )
group=parser.add_mutually_exclusive_group()
group.add_argument(‘–vertically’, ‘-v’, dest=’horizontally’,
default=False,  action=’store_false’,
help=’layout of output file, using –horizontally to explicitly specifiy it as True,  (default: False->Vertically)’ )
group.add_argument(‘–horizontally’, dest=’horizontally’,
default=False,  action=’store_true’,
help=’layout of output file, using –horizontally to explicitly specifiy it as True,  (default: False->Vertically)’ )
#
#ArgumentParser.set_defaults(**kwargs)

config=parser.parse_args(argv)  #return a dict
#parser.print_usage()  #debug
debug_print( “option parse: “,config)

#validate      the file existence, has been done by     add_argument( type=file
if exists(config.outputfile):
warn(‘the output file exists, it will be replaced’)
return config

def _main():

if debug:
test()
else:
c=_parse(sys.argv[1:])
merge(c.inputfilelist, c.outputfile, c.horizontally, True, c.headerlinescount)

if __name__ == “__main__”:
_main()

 

 

Advertisements

发表评论

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / 更改 )

Twitter picture

You are commenting using your Twitter account. Log Out / 更改 )

Facebook photo

You are commenting using your Facebook account. Log Out / 更改 )

Google+ photo

You are commenting using your Google+ account. Log Out / 更改 )

Connecting to %s