
Commit b0dadc5

BUG: unnamed columns in a MultiIndex will be named like ``Unnamed: 2_level_0``, so they are not duplicated across levels
ENH: add option ``multi_index_columns_compat`` to both ``to_csv`` and ``read_csv`` (default False); when True, it forces the previous behavior of writing MultiIndex columns as a list of tuples, and of reading them back as a list of tuples (NOT as a MultiIndex)
DOC: add the compat flags to io.rst
1 parent d6573f5 commit b0dadc5

File tree

6 files changed: +69 -16 lines changed


doc/source/io.rst

+7

@@ -115,6 +115,10 @@ They can take a number of arguments:
   - ``error_bad_lines``: if False then any lines causing an error will be skipped :ref:`bad lines <io.bad_lines>`
   - ``usecols``: a subset of columns to return, results in much faster parsing
     time and lower memory usage.
+  - ``mangle_dupe_cols``: boolean, default True, duplicate columns will be specified
+    as 'X.0'...'X.N', rather than 'X'...'X'
+  - ``multi_index_columns_compat``: boolean, default False, leave a list of tuples on the columns
+    as is (default is to convert them to a MultiIndex on the columns)

   .. ipython:: python
      :suppress:

@@ -271,6 +275,9 @@ specified.
    data = 'C0,C_l0_g0,C_l0_g1\nC1,C_l1_g0,C_l1_g1\nR0,,\nR_l0_g0,R0C0,R0C1\nR_l0_g1,R1C0,R1C1\nR_l0_g2,R2C0,R2C1\n'
    pd.read_csv(StringIO(data), header=[0,1], index_col=[0])

+You can pass ``multi_index_columns_compat=True`` to preserve the pre-0.12 behavior of
+not converting a list of tuples in the columns to a MultiIndex.
+
 .. _io.usecols:

 Filtering columns (``usecols``)
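For illustration, a minimal sketch of the two read modes described above, reusing the data string from the example in this section (assumes this commit's 0.12-dev API; the StringIO import shown is for Python 3):

import pandas as pd
from io import StringIO  # on Python 2, use: from StringIO import StringIO

data = 'C0,C_l0_g0,C_l0_g1\nC1,C_l1_g0,C_l1_g1\nR0,,\nR_l0_g0,R0C0,R0C1\nR_l0_g1,R1C0,R1C1\nR_l0_g2,R2C0,R2C1\n'

# default: the two header rows are combined into a MultiIndex on the columns
df = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0])

# compat mode: the tuple column labels are left as-is (pre-0.12 behavior)
df_compat = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0],
                        multi_index_columns_compat=True)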

pandas/core/format.py

+7 -4

@@ -772,9 +772,10 @@ def grouper(x):
 class CSVFormatter(object):

     def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
-                 cols=None, header=True, index=True, index_label=None,
-                 mode='w', nanRep=None, encoding=None, quoting=None,
-                 line_terminator='\n', chunksize=None, engine=None):
+                 cols=None, header=True, index=True, index_label=None,
+                 mode='w', nanRep=None, encoding=None, quoting=None,
+                 line_terminator='\n', chunksize=None, engine=None,
+                 multi_index_columns_compat=False):

         self.engine = engine  # remove for 0.12

@@ -803,6 +804,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
             msg = "columns.is_unique == False not supported with engine='python'"
             raise NotImplementedError(msg)

+        self.multi_index_columns_compat = multi_index_columns_compat
         if cols is not None:
             if isinstance(cols, Index):
                 cols = cols.to_native_types(na_rep=na_rep, float_format=float_format)
@@ -959,7 +961,8 @@ def _save_header(self):
         index_label = self.index_label
         cols = self.cols
         header = self.header
-        has_mi_columns = isinstance(obj.columns, MultiIndex)
+        has_mi_columns = (isinstance(obj.columns, MultiIndex)
+                          and not self.multi_index_columns_compat)
         encoded_labels = []

         has_aliases = isinstance(header, (tuple, list, np.ndarray))
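To make the gating above concrete, here is a small self-contained sketch mirroring the condition added to CSVFormatter._save_header (the helper name is invented for the example):

from pandas import MultiIndex

def uses_expanded_header(columns, multi_index_columns_compat):
    # mirror of the new check: only write one header row per column level
    # when the columns are a MultiIndex and the compat flag is off
    return isinstance(columns, MultiIndex) and not multi_index_columns_compat

cols = MultiIndex.from_tuples([('A', 'x'), ('A', 'y')])
print(uses_expanded_header(cols, False))  # True  -> expanded multi-row header
print(uses_expanded_header(cols, True))   # False -> legacy single row of tuple labels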

pandas/core/frame.py

+7 -2

@@ -1391,7 +1391,8 @@ def to_panel(self):
     def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                cols=None, header=True, index=True, index_label=None,
                mode='w', nanRep=None, encoding=None, quoting=None,
-               line_terminator='\n', chunksize=None, **kwds):
+               line_terminator='\n', chunksize=None,
+               multi_index_columns_compat=False, **kwds):
         """
         Write DataFrame to a comma-separated values (csv) file

@@ -1429,6 +1430,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
         quoting : optional constant from csv module
             defaults to csv.QUOTE_MINIMAL
         chunksize : rows to write at a time
+        multi_index_columns_compat : boolean, default False
+            write MultiIndex columns as a list of tuples (if True) or in the
+            new expanded format (if False)
         """
         if nanRep is not None:  # pragma: no cover
             import warnings
@@ -1445,7 +1449,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                                    float_format=float_format, cols=cols,
                                    header=header, index=index,
                                    index_label=index_label, mode=mode,
-                                   chunksize=chunksize, engine=kwds.get("engine"))
+                                   chunksize=chunksize, engine=kwds.get("engine"),
+                                   multi_index_columns_compat=multi_index_columns_compat)
         formatter.save()

     def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='',
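A hedged usage sketch of the new to_csv keyword documented above (the file names are placeholders; the output formats follow the commit message):

import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]],
                  columns=pd.MultiIndex.from_tuples([('A', 'x'), ('A', 'y')]))

# default: each column level is written as its own header row (expanded format)
df.to_csv('expanded.csv')

# compat mode: a single header row containing the tuple labels, as before 0.12
df.to_csv('compat.csv', multi_index_columns_compat=True)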

pandas/io/parsers.py

+19 -5

@@ -127,6 +127,11 @@ class DateConversionError(Exception):
 usecols : array-like
     Return a subset of the columns.
     Results in much faster parsing time and lower memory usage.
+mangle_dupe_cols : boolean, default True
+    Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'
+multi_index_columns_compat : boolean, default False
+    Leave a list of tuples on the columns as is (default is to convert to
+    a MultiIndex on the columns)

 Returns
 -------
@@ -294,6 +299,7 @@ def _read(filepath_or_buffer, kwds):
     'squeeze': False,
     'compression': None,
     'mangle_dupe_cols': True,
+    'multi_index_columns_compat': False,
 }


@@ -380,7 +386,8 @@ def parser_f(filepath_or_buffer,
              verbose=False,
              encoding=None,
              squeeze=False,
-             mangle_dupe_cols=True
+             mangle_dupe_cols=True,
+             multi_index_columns_compat=False,
              ):

     # Alias sep -> delimiter.
@@ -438,7 +445,7 @@ def parser_f(filepath_or_buffer,
                 error_bad_lines=error_bad_lines,
                 low_memory=low_memory,
                 buffer_lines=buffer_lines,
-                mangle_dupe_cols=mangle_dupe_cols
+                mangle_dupe_cols=mangle_dupe_cols,
                 )

     return _read(filepath_or_buffer, kwds)
@@ -730,6 +737,7 @@ def __init__(self, kwds):
         self.na_values = kwds.get('na_values')
         self.true_values = kwds.get('true_values')
         self.false_values = kwds.get('false_values')
+        self.multi_index_columns_compat = kwds.get('multi_index_columns_compat', False)

         self._date_conv = _make_date_converter(date_parser=self.date_parser,
                                                dayfirst=self.dayfirst)
@@ -786,7 +794,8 @@ def extract(r):

     def _maybe_make_multi_index_columns(self, columns, col_names=None):
         # possibly create a column mi here
-        if len(columns) and not isinstance(columns, MultiIndex) and all([isinstance(c, tuple) for c in columns]):
+        if not self.multi_index_columns_compat and len(columns) and not isinstance(
+                columns, MultiIndex) and all([isinstance(c, tuple) for c in columns]):
             columns = MultiIndex.from_tuples(columns, names=col_names)
         return columns

@@ -1430,12 +1439,14 @@ def _infer_columns(self):

         # we have mi columns, so read an extra line
         if isinstance(header, (list, tuple, np.ndarray)):
+            have_mi_columns = True
             header = list(header) + [header[-1] + 1]
         else:
+            have_mi_columns = False
             header = [header]

         columns = []
-        for hr in header:
+        for level, hr in enumerate(header):

             if len(self.buf) > 0:
                 line = self.buf[0]
@@ -1448,7 +1459,10 @@ def _infer_columns(self):
                 this_columns = []
                 for i, c in enumerate(line):
                     if c == '':
-                        this_columns.append('Unnamed: %d' % i)
+                        if have_mi_columns:
+                            this_columns.append('Unnamed: %d_level_%d' % (i, level))
+                        else:
+                            this_columns.append('Unnamed: %d' % i)
                     else:
                         this_columns.append(c)
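To illustrate the new placeholder naming added to _infer_columns above, a stand-alone sketch of the rule (the function exists only for this example):

def placeholder_name(i, level, have_mi_columns):
    # blank header cells get level-qualified names when there are multiple
    # header rows, so placeholders no longer collide across levels
    if have_mi_columns:
        return 'Unnamed: %d_level_%d' % (i, level)
    return 'Unnamed: %d' % i

print(placeholder_name(2, 0, True))   # Unnamed: 2_level_0
print(placeholder_name(2, 1, True))   # Unnamed: 2_level_1
print(placeholder_name(2, 0, False))  # Unnamed: 2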

pandas/src/parser.pyx

+12 -4

@@ -232,7 +232,7 @@ cdef class TextReader:
     cdef:
         parser_t *parser
         object file_handle
-        bint factorize, na_filter, verbose, has_usecols
+        bint factorize, na_filter, verbose, has_usecols, has_mi_columns
         int parser_start
         list clocks
         char *c_encoding
@@ -252,6 +252,7 @@ cdef class TextReader:
         object encoding
         object compression
        object mangle_dupe_cols
+        object multi_index_columns_compat
         set noconvert, usecols

     def __cinit__(self, source,
@@ -304,12 +305,14 @@ cdef class TextReader:
                  skiprows=None,
                  skip_footer=0,
                  verbose=False,
-                 mangle_dupe_cols=True):
+                 mangle_dupe_cols=True,
+                 multi_index_columns_compat=False):

         self.parser = parser_new()
         self.parser.chunksize = tokenize_chunksize

         self.mangle_dupe_cols = mangle_dupe_cols
+        self.multi_index_columns_compat = multi_index_columns_compat

         # For timekeeping
         self.clocks = []
@@ -437,6 +440,7 @@ cdef class TextReader:
         self.leading_cols = 0

         # TODO: no header vs. header is not the first row
+        self.has_mi_columns = 0
         if header is None:
             # sentinel value
             self.parser.header_start = -1
@@ -454,6 +458,7 @@ cdef class TextReader:
                 self.parser.header_end = header[-1]
                 self.parser.header = header[0]
                 self.parser_start = header[-1] + 1
+                self.has_mi_columns = 1
                 self.header = header
             else:
                 self.parser.header_start = header
@@ -570,7 +575,7 @@ cdef class TextReader:
         if self.parser.header_start >= 0:

             # Header is in the file
-            for hr in self.header:
+            for level, hr in enumerate(self.header):

                 this_header = []

@@ -600,7 +605,10 @@ cdef class TextReader:
                                          self.c_encoding, errors)

                 if name == '':
-                    name = 'Unnamed: %d' % i
+                    if self.has_mi_columns:
+                        name = 'Unnamed: %d_level_%d' % (i, level)
+                    else:
+                        name = 'Unnamed: %d' % i

                 count = counts.get(name, 0)
                 if count > 0 and self.mangle_dupe_cols:
pandas/tests/test_frame.py

+17 -1

@@ -4991,7 +4991,7 @@ def test_to_csv_multiindex(self):
         with ensure_clean(pname) as path:
             # GH3571, GH1651, GH3141

-            # column & index are multi-iindex
+            # column & index are multi-index
             df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
             df.to_csv(path)
             result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
@@ -5003,6 +5003,22 @@ def test_to_csv_multiindex(self):
             result = read_csv(path, header=[0, 1, 2, 3], index_col=0)
             assert_frame_equal(df, result)

+            # dup column names?
+            df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
+            df.to_csv(path)
+            result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
+            result.columns = ['R2', 'A', 'B', 'C']
+            new_result = result.reset_index().set_index(['R0', 'R1', 'R2'])
+            new_result.columns = df.columns
+            assert_frame_equal(df, new_result)
+
+            # column & index are multi-index (compatibility)
+            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
+            df.to_csv(path, multi_index_columns_compat=True)
+            result = read_csv(path, header=0, index_col=[0, 1], multi_index_columns_compat=True)
+            result.columns = df.columns
+            assert_frame_equal(df, result)
+
         with ensure_clean(pname) as path:
             # empty
             tsframe[:0].to_csv(path)
