Skip to content

Commit 2ae3988

Browse files
committed
BUG: Thoroughly dedup columns in read_csv
1 parent e7c10bb commit 2ae3988

File tree

6 files changed

+62
-28
lines changed

6 files changed

+62
-28
lines changed

doc/source/whatsnew/v0.21.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -263,11 +263,11 @@ Indexing
263263
I/O
264264
^^^
265265

266+
- Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`)
266267
- Bug in :func:`read_csv` in which non-integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
267268
- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
268269
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
269270
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
270-
271271
- Bug in :func:`read_html` where the import check failed when run in multiple threads (:issue:`16928`)
272272

273273
Plotting

pandas/_libs/parsers.pyx

+8-5
Original file line numberDiff line numberDiff line change
@@ -788,11 +788,14 @@ cdef class TextReader:
788788
unnamed_count += 1
789789

790790
count = counts.get(name, 0)
791-
if (count > 0 and self.mangle_dupe_cols
792-
and not self.has_mi_columns):
793-
this_header.append('%s.%d' % (name, count))
794-
else:
795-
this_header.append(name)
791+
792+
if not self.has_mi_columns and self.mangle_dupe_cols:
793+
while count > 0:
794+
counts[name] = count + 1
795+
name = '%s.%d' % (name, count)
796+
count = counts.get(name, 0)
797+
798+
this_header.append(name)
796799
counts[name] = count + 1
797800

798801
if self.has_mi_columns:

pandas/io/parsers.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -2331,10 +2331,16 @@ def _infer_columns(self):
23312331

23322332
if not have_mi_columns and self.mangle_dupe_cols:
23332333
counts = {}
2334+
23342335
for i, col in enumerate(this_columns):
23352336
cur_count = counts.get(col, 0)
2336-
if cur_count > 0:
2337-
this_columns[i] = '%s.%d' % (col, cur_count)
2337+
2338+
while cur_count > 0:
2339+
counts[col] = cur_count + 1
2340+
col = "%s.%d" % (col, cur_count)
2341+
cur_count = counts.get(col, 0)
2342+
2343+
this_columns[i] = col
23382344
counts[col] = cur_count + 1
23392345
elif have_mi_columns:
23402346

pandas/tests/io/parser/common.py

-19
Original file line numberDiff line numberDiff line change
@@ -224,25 +224,6 @@ def test_unnamed_columns(self):
224224
Index(['A', 'B', 'C', 'Unnamed: 3',
225225
'Unnamed: 4']))
226226

227-
def test_duplicate_columns(self):
228-
# TODO: add test for condition 'mangle_dupe_cols=False'
229-
# once it is actually supported (gh-12935)
230-
data = """A,A,B,B,B
231-
1,2,3,4,5
232-
6,7,8,9,10
233-
11,12,13,14,15
234-
"""
235-
236-
for method in ('read_csv', 'read_table'):
237-
238-
# check default behavior
239-
df = getattr(self, method)(StringIO(data), sep=',')
240-
assert list(df.columns) == ['A', 'A.1', 'B', 'B.1', 'B.2']
241-
242-
df = getattr(self, method)(StringIO(data), sep=',',
243-
mangle_dupe_cols=True)
244-
assert list(df.columns) == ['A', 'A.1', 'B', 'B.1', 'B.2']
245-
246227
def test_csv_mixed_type(self):
247228
data = """A,B,C
248229
a,1,2
+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# -*- coding: utf-8 -*-
2+
3+
"""
4+
Tests that duplicate columns are handled appropriately when parsed by the
5+
CSV engine. In general, the expected result is that they are either thoroughly
6+
de-duplicated (if mangling requested) or ignored otherwise.
7+
"""
8+
9+
from pandas.compat import StringIO
10+
11+
12+
class DupeColumnTests(object):
13+
def test_basic(self):
14+
# TODO: add test for condition "mangle_dupe_cols=False"
15+
# once it is actually supported (gh-12935)
16+
data = "a,a,b,b,b\n1,2,3,4,5"
17+
18+
for method in ("read_csv", "read_table"):
19+
# Check default behavior.
20+
expected = ["a", "a.1", "b", "b.1", "b.2"]
21+
df = getattr(self, method)(StringIO(data), sep=",")
22+
assert list(df.columns) == expected
23+
24+
df = getattr(self, method)(StringIO(data), sep=",",
25+
mangle_dupe_cols=True)
26+
assert list(df.columns) == expected
27+
28+
def test_thorough_mangle(self):
29+
# see gh-17060
30+
data = "a,a,a.1\n1,2,3"
31+
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
32+
assert list(df.columns) == ["a", "a.1", "a.1.1"]
33+
34+
data = "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6"
35+
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
36+
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
37+
"a.1.1.1.1", "a.1.1.1.1.1"]
38+
39+
data = "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7"
40+
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
41+
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
42+
"a.2", "a.2.1", "a.3.1"]

pandas/tests/io/parser/test_parsers.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,20 @@
1919
from .c_parser_only import CParserTests
2020
from .parse_dates import ParseDatesTests
2121
from .compression import CompressionTests
22+
from .mangle_dupes import DupeColumnTests
2223
from .multithread import MultithreadTests
2324
from .python_parser_only import PythonParserTests
2425
from .dtypes import DtypeTests
2526

2627

2728
class BaseParser(CommentTests, CompressionTests,
2829
ConverterTests, DialectTests,
30+
DtypeTests, DupeColumnTests,
2931
HeaderTests, IndexColTests,
3032
MultithreadTests, NAvaluesTests,
3133
ParseDatesTests, ParserTests,
3234
SkipRowsTests, UsecolsTests,
33-
QuotingTests, DtypeTests):
35+
QuotingTests):
3436

3537
def read_csv(self, *args, **kwargs):
3638
raise NotImplementedError

0 commit comments

Comments
 (0)