Skip to content

Commit 71d6b37

Browse files
author
y-p
committed
Add mangle_dupe_cols option to read_csv/table GH3468
1 parent 15bca1c commit 71d6b37

File tree

4 files changed

+41
-19
lines changed

4 files changed

+41
-19
lines changed

RELEASE.rst

+3
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ pandas 0.11.1
4444
**KeyError** if **key** is not a valid store object.
4545
- The repr() for (Multi)Index now obeys display.max_seq_items rather
4646
than numpy threshold print options. (GH3426_, GH3466_)
47+
- Added mangle_dupe_cols option to read_table/csv, allowing users
48+
to control the legacy behaviour for duplicate columns (A, A.1, A.2 vs A, A, A) (GH3468_)
4749

4850
**Bug Fixes**
4951

@@ -72,6 +74,7 @@ pandas 0.11.1
7274
.. _GH3466: https://github.com/pydata/pandas/issues/3466
7375
.. _GH3038: https://github.com/pydata/pandas/issues/3038
7476
.. _GH3437: https://github.com/pydata/pandas/issues/3437
77+
.. _GH3468: https://github.com/pydata/pandas/issues/3468
7578
.. _GH3455: https://github.com/pydata/pandas/issues/3455
7679
.. _GH3457: https://github.com/pydata/pandas/issues/3457
7780
.. _GH3461: https://github.com/pydata/pandas/issues/3461

pandas/io/parsers.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,8 @@ def _read(filepath_or_buffer, kwds):
254254
'verbose': False,
255255
'encoding': None,
256256
'squeeze': False,
257-
'compression': None
257+
'compression': None,
258+
'mangle_dupe_cols': True,
258259
}
259260

260261

@@ -340,7 +341,9 @@ def parser_f(filepath_or_buffer,
340341

341342
verbose=False,
342343
encoding=None,
343-
squeeze=False):
344+
squeeze=False,
345+
mangle_dupe_cols=True
346+
):
344347

345348
# Alias sep -> delimiter.
346349
if delimiter is None:
@@ -396,7 +399,9 @@ def parser_f(filepath_or_buffer,
396399
warn_bad_lines=warn_bad_lines,
397400
error_bad_lines=error_bad_lines,
398401
low_memory=low_memory,
399-
buffer_lines=buffer_lines)
402+
buffer_lines=buffer_lines,
403+
mangle_dupe_cols=mangle_dupe_cols
404+
)
400405

401406
return _read(filepath_or_buffer, kwds)
402407

@@ -1142,6 +1147,7 @@ def __init__(self, f, **kwds):
11421147
self.skipinitialspace = kwds['skipinitialspace']
11431148
self.lineterminator = kwds['lineterminator']
11441149
self.quoting = kwds['quoting']
1150+
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols',True)
11451151

11461152
self.has_index_names = False
11471153
if 'has_index_names' in kwds:
@@ -1323,12 +1329,13 @@ def _infer_columns(self):
13231329
else:
13241330
columns.append(c)
13251331

1326-
counts = {}
1327-
for i, col in enumerate(columns):
1328-
cur_count = counts.get(col, 0)
1329-
if cur_count > 0:
1330-
columns[i] = '%s.%d' % (col, cur_count)
1331-
counts[col] = cur_count + 1
1332+
if self.mangle_dupe_cols:
1333+
counts = {}
1334+
for i, col in enumerate(columns):
1335+
cur_count = counts.get(col, 0)
1336+
if cur_count > 0:
1337+
columns[i] = '%s.%d' % (col, cur_count)
1338+
counts[col] = cur_count + 1
13321339

13331340
self._clear_buffer()
13341341

pandas/io/tests/test_parsers.py

+15-8
Original file line numberDiff line numberDiff line change
@@ -589,14 +589,21 @@ def test_string_nas(self):
589589
tm.assert_frame_equal(result, expected)
590590

591591
def test_duplicate_columns(self):
592-
data = """A,A,B,B,B
593-
1,2,3,4,5
594-
6,7,8,9,10
595-
11,12,13,14,15
596-
"""
597-
df = self.read_table(StringIO(data), sep=',')
598-
self.assert_(np.array_equal(df.columns,
599-
['A', 'A.1', 'B', 'B.1', 'B.2']))
592+
for engine in ['python', 'c']:
593+
data = """A,A,B,B,B
594+
1,2,3,4,5
595+
6,7,8,9,10
596+
11,12,13,14,15
597+
"""
598+
# check default behaviour
599+
df = self.read_table(StringIO(data), sep=',',engine=engine)
600+
self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
601+
602+
df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=False)
603+
self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B'])
604+
605+
df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=True)
606+
self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
600607

601608
def test_csv_mixed_type(self):
602609
data = """A,B,C

pandas/src/parser.pyx

+7-2
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ cdef class TextReader:
249249
object dtype
250250
object encoding
251251
object compression
252+
object mangle_dupe_cols
252253
set noconvert, usecols
253254

254255
def __cinit__(self, source,
@@ -298,11 +299,14 @@ cdef class TextReader:
298299
buffer_lines=None,
299300
skiprows=None,
300301
skip_footer=0,
301-
verbose=False):
302+
verbose=False,
303+
mangle_dupe_cols=True):
302304

303305
self.parser = parser_new()
304306
self.parser.chunksize = tokenize_chunksize
305307

308+
self.mangle_dupe_cols=mangle_dupe_cols
309+
306310
# For timekeeping
307311
self.clocks = []
308312

@@ -571,8 +575,9 @@ cdef class TextReader:
571575
if name == '':
572576
name = 'Unnamed: %d' % i
573577

578+
574579
count = counts.get(name, 0)
575-
if count > 0:
580+
if count > 0 and self.mangle_dupe_cols:
576581
header.append('%s.%d' % (name, count))
577582
else:
578583
header.append(name)

0 commit comments

Comments
 (0)