Skip to content

Commit 4ecb04e

Browse files
author
y-p
committed
Merge pull request #3511 from y-p/PR_mangle_dupe_cols_option
ENH: add mode.mangle_dupe_cols option GH3468
2 parents cc282d8 + a5373ea commit 4ecb04e

File tree

4 files changed

+44
-19
lines changed

4 files changed

+44
-19
lines changed

RELEASE.rst

+6
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ pandas 0.11.1
4545
**KeyError** if **key** is not a valid store object.
4646
- The repr() for (Multi)Index now obeys display.max_seq_items rather
4747
than numpy threshold print options. (GH3426_, GH3466_)
48+
- Added mangle_dupe_cols option to read_table/csv, allowing users
49+
to control legacy behaviour re dupe cols (A, A.1, A.2 vs A, A) (GH3468_)
50+
Note: The default value will change in 0.12 to the "no mangle" behaviour.
51+
If your code relies on the current mangling behaviour, explicitly specify mangle_dupe_cols=True
52+
in your calls.
4853

4954
**Bug Fixes**
5055

@@ -87,6 +92,7 @@ pandas 0.11.1
8792
.. _GH3038: https://github.com/pydata/pandas/issues/3038
8893
.. _GH3510: https://github.com/pydata/pandas/issues/3510
8994
.. _GH3437: https://github.com/pydata/pandas/issues/3437
95+
.. _GH3468: https://github.com/pydata/pandas/issues/3468
9096
.. _GH3455: https://github.com/pydata/pandas/issues/3455
9197
.. _GH3457: https://github.com/pydata/pandas/issues/3457
9298
.. _GH3477: https://github.com/pydata/pandas/issues/3477

pandas/io/parsers.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,8 @@ def _read(filepath_or_buffer, kwds):
266266
'verbose': False,
267267
'encoding': None,
268268
'squeeze': False,
269-
'compression': None
269+
'compression': None,
270+
'mangle_dupe_cols': True,
270271
}
271272

272273

@@ -352,7 +353,9 @@ def parser_f(filepath_or_buffer,
352353

353354
verbose=False,
354355
encoding=None,
355-
squeeze=False):
356+
squeeze=False,
357+
mangle_dupe_cols=True
358+
):
356359

357360
# Alias sep -> delimiter.
358361
if delimiter is None:
@@ -408,7 +411,9 @@ def parser_f(filepath_or_buffer,
408411
warn_bad_lines=warn_bad_lines,
409412
error_bad_lines=error_bad_lines,
410413
low_memory=low_memory,
411-
buffer_lines=buffer_lines)
414+
buffer_lines=buffer_lines,
415+
mangle_dupe_cols=mangle_dupe_cols
416+
)
412417

413418
return _read(filepath_or_buffer, kwds)
414419

@@ -1154,6 +1159,7 @@ def __init__(self, f, **kwds):
11541159
self.skipinitialspace = kwds['skipinitialspace']
11551160
self.lineterminator = kwds['lineterminator']
11561161
self.quoting = kwds['quoting']
1162+
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols',True)
11571163

11581164
self.has_index_names = False
11591165
if 'has_index_names' in kwds:
@@ -1335,12 +1341,13 @@ def _infer_columns(self):
13351341
else:
13361342
columns.append(c)
13371343

1338-
counts = {}
1339-
for i, col in enumerate(columns):
1340-
cur_count = counts.get(col, 0)
1341-
if cur_count > 0:
1342-
columns[i] = '%s.%d' % (col, cur_count)
1343-
counts[col] = cur_count + 1
1344+
if self.mangle_dupe_cols:
1345+
counts = {}
1346+
for i, col in enumerate(columns):
1347+
cur_count = counts.get(col, 0)
1348+
if cur_count > 0:
1349+
columns[i] = '%s.%d' % (col, cur_count)
1350+
counts[col] = cur_count + 1
13441351

13451352
self._clear_buffer()
13461353

pandas/io/tests/test_parsers.py

+15-8
Original file line numberDiff line numberDiff line change
@@ -589,14 +589,21 @@ def test_string_nas(self):
589589
tm.assert_frame_equal(result, expected)
590590

591591
def test_duplicate_columns(self):
592-
data = """A,A,B,B,B
593-
1,2,3,4,5
594-
6,7,8,9,10
595-
11,12,13,14,15
596-
"""
597-
df = self.read_table(StringIO(data), sep=',')
598-
self.assert_(np.array_equal(df.columns,
599-
['A', 'A.1', 'B', 'B.1', 'B.2']))
592+
for engine in ['python', 'c']:
593+
data = """A,A,B,B,B
594+
1,2,3,4,5
595+
6,7,8,9,10
596+
11,12,13,14,15
597+
"""
598+
# check default behaviour
599+
df = self.read_table(StringIO(data), sep=',',engine=engine)
600+
self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
601+
602+
df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=False)
603+
self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B'])
604+
605+
df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=True)
606+
self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
600607

601608
def test_csv_mixed_type(self):
602609
data = """A,B,C

pandas/src/parser.pyx

+7-2
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ cdef class TextReader:
249249
object dtype
250250
object encoding
251251
object compression
252+
object mangle_dupe_cols
252253
set noconvert, usecols
253254

254255
def __cinit__(self, source,
@@ -298,11 +299,14 @@ cdef class TextReader:
298299
buffer_lines=None,
299300
skiprows=None,
300301
skip_footer=0,
301-
verbose=False):
302+
verbose=False,
303+
mangle_dupe_cols=True):
302304

303305
self.parser = parser_new()
304306
self.parser.chunksize = tokenize_chunksize
305307

308+
self.mangle_dupe_cols=mangle_dupe_cols
309+
306310
# For timekeeping
307311
self.clocks = []
308312

@@ -571,8 +575,9 @@ cdef class TextReader:
571575
if name == '':
572576
name = 'Unnamed: %d' % i
573577

578+
574579
count = counts.get(name, 0)
575-
if count > 0:
580+
if count > 0 and self.mangle_dupe_cols:
576581
header.append('%s.%d' % (name, count))
577582
else:
578583
header.append(name)

0 commit comments

Comments
 (0)