Skip to content

Commit 46fcdff

Browse files
committed
ENH: more intelligent inference about index_col for Excel files, test coverage for PR #735
1 parent d60e184 commit 46fcdff

File tree

5 files changed

+112
-39
lines changed

5 files changed

+112
-39
lines changed

pandas/core/frame.py

+11-10
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import with_statement
2+
13
"""
24
DataFrame
35
---------
@@ -853,12 +855,12 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True,
853855
index_label = []
854856
for i, name in enumerate(self.index.names):
855857
if name is None:
856-
name = 'level_%d' % i
858+
name = '' # 'level_%d' % i
857859
index_label.append(name)
858860
else:
859861
index_label = self.index.name
860862
if index_label is None:
861-
index_label = ['index']
863+
index_label = ['']
862864
else:
863865
index_label = [index_label]
864866
elif not isinstance(index_label, (list, tuple, np.ndarray)):
@@ -917,8 +919,8 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
917919
----------
918920
path : string
919921
File path
920-
nanRep : string, default ''
921-
Missing data rep'n
922+
na_rep : string, default ''
923+
Missing data representation
922924
cols : sequence, optional
923925
Columns to write
924926
header : boolean, default True
@@ -936,18 +938,17 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
936938
a string representing the encoding to use if the contents are
937939
non-ascii, for python versions prior to 3
938940
"""
939-
f = open(path, mode)
940-
csvout = csv.writer(f, lineterminator='\n', delimiter=sep)
941-
942941
if nanRep is not None: # pragma: no cover
943942
import warnings
944943
warnings.warn("nanRep is deprecated, use na_rep",
945944
FutureWarning)
946945
na_rep = nanRep
947946

948-
self._helper_csvexcel(csvout, na_rep=na_rep, cols=cols, header=header,
949-
index=index, index_label=index_label, encoding=encoding)
950-
f.close()
947+
with open(path, mode) as f:
948+
csvout = csv.writer(f, lineterminator='\n', delimiter=sep)
949+
self._helper_csvexcel(csvout, na_rep=na_rep, cols=cols,
950+
header=header, index=index,
951+
index_label=index_label, encoding=encoding)
951952

952953
def to_excel(self, excel_writer, sheet_name = 'sheet1', na_rep='', cols=None, header=True,
953954
index=True, index_label=None):

pandas/io/parsers.py

+29-15
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ def _make_reader(self, f):
259259
if sep is None or len(sep) == 1:
260260
sniff_sep = True
261261
# default dialect
262-
dia = csv.excel
262+
dia = csv.excel()
263263
if sep is not None:
264264
sniff_sep = False
265265
dia.delimiter = sep
@@ -477,10 +477,7 @@ def get_chunk(self, rows=None):
477477
for col, f in self.converters.iteritems():
478478
if isinstance(col, int) and col not in self.columns:
479479
col = self.columns[col]
480-
result = lib.map_infer(data[col], f)
481-
if issubclass(result.dtype.type, (basestring, unicode)):
482-
result = result.astype('O')
483-
data[col] = result
480+
data[col] = lib.map_infer(data[col], f)
484481

485482
data = _convert_to_ndarrays(data, self.na_values, self.verbose)
486483

@@ -620,8 +617,14 @@ def _parse_xlsx(self, sheetname, header=0, skiprows=None, index_col=None,
620617
chunksize=None):
621618
sheet = self.book.get_sheet_by_name(name=sheetname)
622619
data = []
623-
for row in sheet.iter_rows(): # it brings a new method: iter_rows()
620+
621+
# it brings a new method: iter_rows()
622+
for row in sheet.iter_rows():
624623
data.append([cell.internal_value for cell in row])
624+
625+
if header is not None:
626+
data[header] = _trim_excel_header(data[header])
627+
625628
parser = TextParser(data, header=header, index_col=index_col,
626629
na_values=na_values,
627630
parse_dates=parse_dates,
@@ -630,7 +633,7 @@ def _parse_xlsx(self, sheetname, header=0, skiprows=None, index_col=None,
630633
chunksize=chunksize)
631634

632635
return parser.get_chunk()
633-
636+
634637
def _parse_xls(self, sheetname, header=0, skiprows=None, index_col=None,
635638
parse_dates=False, date_parser=None, na_values=None,
636639
chunksize=None):
@@ -654,6 +657,9 @@ def _parse_xls(self, sheetname, header=0, skiprows=None, index_col=None,
654657
row.append(value)
655658
data.append(row)
656659

660+
if header is not None:
661+
data[header] = _trim_excel_header(data[header])
662+
657663
parser = TextParser(data, header=header, index_col=index_col,
658664
na_values=na_values,
659665
parse_dates=parse_dates,
@@ -663,9 +669,15 @@ def _parse_xls(self, sheetname, header=0, skiprows=None, index_col=None,
663669

664670
return parser.get_chunk()
665671

672+
def _trim_excel_header(row):
673+
# trim header row so auto-index inference works
674+
while len(row) > 0 and row[0] == '':
675+
row = row[1:]
676+
return row
677+
666678
class ExcelWriter(object):
667679
"""
668-
Class for writing DataFrame objects into excel sheets, uses xlwt for xls,
680+
Class for writing DataFrame objects into excel sheets, uses xlwt for xls,
669681
openpyxl for xlsx. See DataFrame.to_excel for typical usage.
670682
671683
Parameters
@@ -701,14 +713,15 @@ def writerow(self, row, sheet_name=None):
701713
Parameters
702714
----------
703715
row : list
704-
Row of data to save to Excel sheet
716+
Row of data to save to Excel sheet
705717
sheet_name : string, default None
706718
Name of Excel sheet, if None, then use self.cur_sheet
707719
"""
708720
if sheet_name is None:
709721
sheet_name = self.cur_sheet
710-
if sheet_name is None:
711-
raise Exception('Must pass explicit sheet_name or set cur_sheet property')
722+
if sheet_name is None: # pragma: no cover
723+
raise Exception('Must pass explicit sheet_name or set '
724+
'cur_sheet property')
712725
if self.use_xlsx:
713726
self._writerow_xlsx(row, sheet_name)
714727
else:
@@ -720,13 +733,13 @@ def _writerow_xls(self, row, sheet_name):
720733
else:
721734
sheet = self.book.add_sheet(sheet_name)
722735
row_idx = 0
723-
sheetrow = sheet.row(row_idx)
736+
sheetrow = sheet.row(row_idx)
724737
for i, val in enumerate(row):
725738
if isinstance(val, (datetime.datetime, datetime.date)):
726739
if isinstance(val, datetime.datetime):
727-
sheetrow.write(i,val,self.fm_datetime)
740+
sheetrow.write(i,val, self.fm_datetime)
728741
else:
729-
sheetrow.write(i,val,self.fm_date)
742+
sheetrow.write(i,val, self.fm_date)
730743
elif isinstance(val, np.int64):
731744
sheetrow.write(i,int(val))
732745
else:
@@ -744,6 +757,7 @@ def _writerow_xlsx(self, row, sheet_name):
744757
sheet.title = sheet_name
745758
row_idx = 0
746759

747-
sheet.append([int(val) if isinstance(val, np.int64) else val for val in row])
760+
sheet.append([int(val) if isinstance(val, np.int64) else val
761+
for val in row])
748762
row_idx += 1
749763
self.sheets[sheet_name] = (sheet, row_idx)

pandas/io/tests/test_parsers.py

+21
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,27 @@ def test_sniff_delimiter(self):
287287
data2 = read_csv(StringIO(text), index_col=0, delimiter='|')
288288
assert_frame_equal(data, data2)
289289

290+
text = """ignore this
291+
ignore this too
292+
index|A|B|C
293+
foo|1|2|3
294+
bar|4|5|6
295+
baz|7|8|9
296+
"""
297+
data3 = read_csv(StringIO(text), index_col=0, sep=None, skiprows=2)
298+
assert_frame_equal(data, data3)
299+
300+
text = u"""ignore this
301+
ignore this too
302+
index|A|B|C
303+
foo|1|2|3
304+
bar|4|5|6
305+
baz|7|8|9
306+
""".encode('utf-8')
307+
data4 = read_csv(StringIO(text), index_col=0, sep=None, skiprows=2,
308+
encoding='utf-8')
309+
assert_frame_equal(data, data4)
310+
290311
def test_read_nrows(self):
291312
df = read_csv(StringIO(self.data1), nrows=3)
292313
expected = read_csv(StringIO(self.data1))[:3]

pandas/tests/test_frame.py

+45-14
Original file line numberDiff line numberDiff line change
@@ -2265,6 +2265,11 @@ def test_to_csv_unicode(self):
22652265
df.to_csv(path, encoding='UTF-8')
22662266
df2 = read_csv(path, index_col=0, encoding='UTF-8')
22672267
assert_frame_equal(df, df2)
2268+
2269+
df.to_csv(path, encoding='UTF-8', index=False)
2270+
df2 = read_csv(path, index_col=None, encoding='UTF-8')
2271+
assert_frame_equal(df, df2)
2272+
22682273
os.remove(path)
22692274

22702275
def test_to_excel_from_excel(self):
@@ -2281,40 +2286,40 @@ def test_to_excel_from_excel(self):
22812286
# test roundtrip
22822287
self.frame.to_excel(path,'test1')
22832288
reader = ExcelFile(path)
2284-
recons = reader.parse('test1',index_col=0)
2289+
recons = reader.parse('test1', index_col=0)
22852290
assert_frame_equal(self.frame, recons)
2286-
2291+
22872292
self.frame.to_excel(path,'test1', index=False)
22882293
reader = ExcelFile(path)
2289-
recons = reader.parse('test1',index_col=None)
2294+
recons = reader.parse('test1', index_col=None)
22902295
recons.index = self.frame.index
22912296
assert_frame_equal(self.frame, recons)
22922297

22932298
self.frame.to_excel(path,'test1')
22942299
reader = ExcelFile(path)
2295-
recons = reader.parse('test1',index_col=0,skiprows=[1])
2300+
recons = reader.parse('test1', index_col=0, skiprows=[1])
22962301
assert_frame_equal(self.frame.ix[1:], recons)
22972302

22982303
self.frame.to_excel(path,'test1',na_rep='NA')
22992304
reader = ExcelFile(path)
2300-
recons = reader.parse('test1',index_col=0,na_values=['NA'])
2305+
recons = reader.parse('test1', index_col=0, na_values=['NA'])
23012306
assert_frame_equal(self.frame, recons)
2302-
2307+
23032308
self.mixed_frame.to_excel(path,'test1')
23042309
reader = ExcelFile(path)
2305-
recons = reader.parse('test1',index_col=0)
2310+
recons = reader.parse('test1', index_col=0)
23062311
assert_frame_equal(self.mixed_frame, recons)
23072312

2308-
self.tsframe.to_excel(path,'test1')
2313+
self.tsframe.to_excel(path, 'test1')
23092314
reader = ExcelFile(path)
2310-
recons = reader.parse('test1',index_col=0)
2315+
recons = reader.parse('test1')
23112316
assert_frame_equal(self.tsframe, recons)
23122317

23132318
#Test np.int64
23142319
frame = DataFrame(np.random.randn(10,2))
23152320
frame.to_excel(path,'test1')
23162321
reader = ExcelFile(path)
2317-
recons = reader.parse('test1',index_col=0)
2322+
recons = reader.parse('test1')
23182323
assert_frame_equal(frame, recons)
23192324

23202325
# Test writing to separate sheets
@@ -2330,14 +2335,25 @@ def test_to_excel_from_excel(self):
23302335

23312336
os.remove(path)
23322337

2338+
# datetime.date, not sure what to test here exactly
2339+
path = '__tmp__.xls'
2340+
tsf = self.tsframe.copy()
2341+
tsf.index = [x.date() for x in self.tsframe.index]
2342+
tsf.to_excel(path, 'test1')
2343+
reader = ExcelFile(path)
2344+
recons = reader.parse('test1')
2345+
assert_frame_equal(self.tsframe, recons)
2346+
os.remove(path)
2347+
23332348
def test_to_excel_multiindex(self):
23342349
for ext in ['xls', 'xlsx']:
23352350
path = '__tmp__.' + ext
23362351

23372352
frame = self.frame
23382353
old_index = frame.index
23392354
arrays = np.arange(len(old_index)*2).reshape(2,-1)
2340-
new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
2355+
new_index = MultiIndex.from_arrays(arrays,
2356+
names=['first', 'second'])
23412357
frame.index = new_index
23422358
frame.to_excel(path, 'test1', header=False)
23432359
frame.to_excel(path, 'test1', cols=['A', 'B'])
@@ -2361,11 +2377,22 @@ def test_to_excel_multiindex(self):
23612377
recons = reader.parse('test1', index_col=[0,1])
23622378
assert_frame_equal(tsframe, recons)
23632379

2364-
# do not load index
2380+
# infer index
23652381
tsframe.to_excel(path, 'test1')
23662382
reader = ExcelFile(path)
2367-
recons = reader.parse('test1', index_col=None)
2368-
np.testing.assert_equal(len(recons.columns), len(tsframe.columns) + 2)
2383+
recons = reader.parse('test1')
2384+
assert_frame_equal(tsframe, recons)
2385+
2386+
# no index
2387+
tsframe.index.names = ['first', 'second']
2388+
tsframe.to_excel(path, 'test1')
2389+
reader = ExcelFile(path)
2390+
recons = reader.parse('test1')
2391+
assert_almost_equal(tsframe.values,
2392+
recons.ix[:, tsframe.columns].values)
2393+
self.assertEqual(len(tsframe.columns) + 2, len(recons.columns))
2394+
2395+
tsframe.index.names = [None, None]
23692396

23702397
# no index
23712398
tsframe.to_excel(path, 'test1', index=False)
@@ -2374,6 +2401,10 @@ def test_to_excel_multiindex(self):
23742401
assert_almost_equal(recons.values, self.tsframe.values)
23752402
self.tsframe.index = old_index # needed if setUP becomes classmethod
23762403

2404+
# write a big DataFrame
2405+
df = DataFrame(np.random.randn(1005, 1))
2406+
df.to_excel(path, 'test1')
2407+
23772408
os.remove(path)
23782409

23792410
def test_info(self):

pandas/tools/tests/test_merge.py

+6
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,12 @@ def test_handle_join_key_pass_array(self):
543543
self.assert_(merged['key'].notnull().all())
544544
self.assert_(merged2['key'].notnull().all())
545545

546+
left = DataFrame({'value' : range(5)}, columns=['value', 'key'])
547+
right = DataFrame({'rvalue' : range(6)})
548+
lkey = np.array([1, 1, 2, 2, 3])
549+
rkey = np.array([1, 1, 2, 3, 4, 5])
550+
551+
546552
class TestMergeMulti(unittest.TestCase):
547553

548554
def setUp(self):

0 commit comments

Comments
 (0)