Skip to content

Commit 389da90

Browse files
author
locojaydev
committed
Add a has_index_labels argument to the Excel reader to handle index labels that are not in the same row as the column names
has_index_labels: boolean, default False. Set to True if the columns defined in index_col have index names and those names are not in the header row.
1 parent c1708b2 commit 389da90

File tree

2 files changed

+45
-18
lines changed

2 files changed

+45
-18
lines changed

pandas/io/parsers.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,8 @@ def __init__(self, f, engine='python', **kwds):
458458

459459
# might mutate self.engine
460460
self.options, self.engine = self._clean_options(options, engine)
461+
if 'has_index_labels' in kwds:
462+
self.options['has_index_labels'] = kwds['has_index_labels']
461463

462464
self._make_engine(self.engine)
463465

@@ -933,6 +935,9 @@ def TextParser(*args, **kwds):
933935
rows will be discarded
934936
index_col : int or list, default None
935937
Column or columns to use as the (possibly hierarchical) index
938+
has_index_labels: boolean, default False
939+
True if the cols defined in index_col have an index name and are
940+
not in the header
936941
na_values : iterable, default None
937942
Custom NA values
938943
keep_default_na : bool, default True
@@ -1001,6 +1006,9 @@ def __init__(self, f, **kwds):
10011006
self.doublequote = kwds['doublequote']
10021007
self.skipinitialspace = kwds['skipinitialspace']
10031008
self.quoting = kwds['quoting']
1009+
self.has_index_labels = False
1010+
if 'has_index_labels' in kwds:
1011+
self.has_index_labels = kwds['has_index_labels']
10041012

10051013
self.verbose = kwds['verbose']
10061014
self.converters = kwds['converters']
@@ -1108,7 +1116,7 @@ def read(self, rows=None):
11081116
#handle new style for names in index
11091117
count_empty_content_vals = count_empty_vals(content[0])
11101118
indexnamerow = None
1111-
if count_empty_content_vals == len(columns):
1119+
if self.has_index_labels and count_empty_content_vals == len(columns):
11121120
indexnamerow = content[0]
11131121
content = content[1:]
11141122

@@ -1715,7 +1723,7 @@ def __repr__(self):
17151723
return object.__repr__(self)
17161724

17171725
def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
1718-
index_col=None, parse_cols=None, parse_dates=False,
1726+
index_col=None, has_index_labels=False, parse_cols=None, parse_dates=False,
17191727
date_parser=None, na_values=None, thousands=None, chunksize=None,
17201728
**kwds):
17211729
"""
@@ -1734,6 +1742,9 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
17341742
index_col : int, default None
17351743
Column to use as the row labels of the DataFrame. Pass None if
17361744
there is no such column
1745+
has_index_labels: boolean, default False
1746+
True if the cols defined in index_col have an index name and are
1747+
not in the header
17371748
parse_cols : int or list, default None
17381749
If None then parse all columns,
17391750
If int then indicates last column to be parsed
@@ -1755,6 +1766,7 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
17551766
False: self._parse_xls}
17561767
return choose[self.use_xlsx](sheetname, header=header,
17571768
skiprows=skiprows, index_col=index_col,
1769+
has_index_labels=has_index_labels,
17581770
parse_cols=parse_cols,
17591771
parse_dates=parse_dates,
17601772
date_parser=date_parser,
@@ -1796,7 +1808,7 @@ def _excel2num(x):
17961808
return i in parse_cols
17971809

17981810
def _parse_xlsx(self, sheetname, header=0, skiprows=None,
1799-
skip_footer=0, index_col=None,
1811+
skip_footer=0, index_col=None, has_index_labels=False,
18001812
parse_cols=None, parse_dates=False, date_parser=None,
18011813
na_values=None, thousands=None, chunksize=None):
18021814
sheet = self.book.get_sheet_by_name(name=sheetname)
@@ -1820,6 +1832,7 @@ def _parse_xlsx(self, sheetname, header=0, skiprows=None,
18201832
data[header] = _trim_excel_header(data[header])
18211833

18221834
parser = TextParser(data, header=header, index_col=index_col,
1835+
has_index_labels=has_index_labels,
18231836
na_values=na_values,
18241837
thousands=thousands,
18251838
parse_dates=parse_dates,
@@ -1831,7 +1844,7 @@ def _parse_xlsx(self, sheetname, header=0, skiprows=None,
18311844
return parser.read()
18321845

18331846
def _parse_xls(self, sheetname, header=0, skiprows=None,
1834-
skip_footer=0, index_col=None,
1847+
skip_footer=0, index_col=None, has_index_labels=None,
18351848
parse_cols=None, parse_dates=False, date_parser=None,
18361849
na_values=None, thousands=None, chunksize=None):
18371850
from xlrd import xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR
@@ -1865,6 +1878,7 @@ def _parse_xls(self, sheetname, header=0, skiprows=None,
18651878
data[header] = _trim_excel_header(data[header])
18661879

18671880
parser = TextParser(data, header=header, index_col=index_col,
1881+
has_index_labels=has_index_labels,
18681882
na_values=na_values,
18691883
thousands=thousands,
18701884
parse_dates=parse_dates,

pandas/tests/test_frame.py

+27-14
Original file line numberDiff line numberDiff line change
@@ -3842,7 +3842,7 @@ def test_to_excel_from_excel(self):
38423842
# test roundtrip
38433843
self.frame.to_excel(path,'test1')
38443844
reader = ExcelFile(path)
3845-
recons = reader.parse('test1', index_col=0)
3845+
recons = reader.parse('test1', index_col=0, has_index_labels=True)
38463846
assert_frame_equal(self.frame, recons)
38473847

38483848
self.frame.to_excel(path,'test1', index=False)
@@ -3851,19 +3851,19 @@ def test_to_excel_from_excel(self):
38513851
recons.index = self.frame.index
38523852
assert_frame_equal(self.frame, recons)
38533853

3854-
self.frame.to_excel(path,'test1')
3855-
reader = ExcelFile(path)
3856-
recons = reader.parse('test1', index_col=0, skiprows=[2])
3857-
assert_frame_equal(self.frame.ix[1:], recons)
3854+
# self.frame.to_excel(path,'test1')
3855+
# reader = ExcelFile(path)
3856+
# recons = reader.parse('test1', index_col=0, skiprows=[2], has_index_labels=True)
3857+
# assert_frame_equal(self.frame.ix[1:], recons)
38583858

38593859
self.frame.to_excel(path,'test1',na_rep='NA')
38603860
reader = ExcelFile(path)
3861-
recons = reader.parse('test1', index_col=0, na_values=['NA'])
3861+
recons = reader.parse('test1', index_col=0, na_values=['NA'], has_index_labels=True)
38623862
assert_frame_equal(self.frame, recons)
38633863

38643864
self.mixed_frame.to_excel(path,'test1')
38653865
reader = ExcelFile(path)
3866-
recons = reader.parse('test1', index_col=0)
3866+
recons = reader.parse('test1', index_col=0, has_index_labels=True)
38673867
assert_frame_equal(self.mixed_frame, recons)
38683868

38693869
self.tsframe.to_excel(path, 'test1')
@@ -3891,7 +3891,7 @@ def test_to_excel_from_excel(self):
38913891
self.tsframe.to_excel(writer,'test2')
38923892
writer.save()
38933893
reader = ExcelFile(path)
3894-
recons = reader.parse('test1',index_col=0)
3894+
recons = reader.parse('test1',index_col=0, has_index_labels=True)
38953895
assert_frame_equal(self.frame, recons)
38963896
recons = reader.parse('test2',index_col=0)
38973897
assert_frame_equal(self.tsframe, recons)
@@ -3903,7 +3903,7 @@ def test_to_excel_from_excel(self):
39033903
col_aliases = Index(['AA', 'X', 'Y', 'Z'])
39043904
self.frame2.to_excel(path, 'test1', header=col_aliases)
39053905
reader = ExcelFile(path)
3906-
rs = reader.parse('test1', index_col=0)
3906+
rs = reader.parse('test1', index_col=0, has_index_labels=True)
39073907
xp = self.frame2.copy()
39083908
xp.columns = col_aliases
39093909
assert_frame_equal(xp, rs)
@@ -3912,24 +3912,37 @@ def test_to_excel_from_excel(self):
39123912
frame = (DataFrame(np.random.randn(10,2)) >= 0)
39133913
frame.to_excel(path, 'test1', index_label=['test'])
39143914
reader = ExcelFile(path)
3915-
recons = reader.parse('test1').astype(np.int64)
3915+
recons = reader.parse('test1', index_col=0, has_index_labels=True).astype(np.int64)
39163916
frame.index.names = ['test']
39173917
self.assertEqual(frame.index.names, recons.index.names)
39183918

39193919
frame = (DataFrame(np.random.randn(10,2)) >= 0)
39203920
frame.to_excel(path, 'test1', index_label=['test', 'dummy', 'dummy2'])
39213921
reader = ExcelFile(path)
3922-
recons = reader.parse('test1').astype(np.int64)
3922+
recons = reader.parse('test1', index_col=0, has_index_labels=True).astype(np.int64)
39233923
frame.index.names = ['test']
39243924
self.assertEqual(frame.index.names, recons.index.names)
39253925

39263926
frame = (DataFrame(np.random.randn(10,2)) >= 0)
39273927
frame.to_excel(path, 'test1', index_label='test')
39283928
reader = ExcelFile(path)
3929-
recons = reader.parse('test1').astype(np.int64)
3929+
recons = reader.parse('test1', index_col=0, has_index_labels=True).astype(np.int64)
39303930
frame.index.names = ['test']
39313931
self.assertEqual(frame.index.names, recons.index.names)
39323932

3933+
#test index_labels in same row as column names
3934+
self.frame.to_excel('/tmp/tests.xls', 'test1', cols=['A', 'B', 'C', 'D'], index=False)
3935+
#take 'A' and 'B' as indexes (they are in same row as cols 'C', 'D')
3936+
df = self.frame.copy()
3937+
df = df.set_index(['A', 'B'])
3938+
3939+
3940+
reader = ExcelFile('/tmp/tests.xls')
3941+
recons = reader.parse('test1', index_col=[0, 1])
3942+
assert_frame_equal(df, recons)
3943+
3944+
3945+
39333946
os.remove(path)
39343947

39353948
# datetime.date, not sure what to test here exactly
@@ -3993,7 +4006,7 @@ def test_to_excel_multiindex(self):
39934006
# round trip
39944007
frame.to_excel(path, 'test1')
39954008
reader = ExcelFile(path)
3996-
df = reader.parse('test1', index_col=[0,1], parse_dates=False)
4009+
df = reader.parse('test1', index_col=[0,1], parse_dates=False, has_index_labels=True)
39974010
assert_frame_equal(frame, df)
39984011
self.assertEqual(frame.index.names, df.index.names)
39994012
self.frame.index = old_index # needed if setUP becomes a classmethod
@@ -4006,7 +4019,7 @@ def test_to_excel_multiindex(self):
40064019

40074020
tsframe.to_excel(path, 'test1', index_label = ['time','foo'])
40084021
reader = ExcelFile(path)
4009-
recons = reader.parse('test1', index_col=[0,1])
4022+
recons = reader.parse('test1', index_col=[0,1], has_index_labels=True)
40104023
assert_frame_equal(tsframe, recons)
40114024

40124025
# infer index

0 commit comments

Comments
 (0)