Skip to content

Commit 8b55754

Browse files
committed
BUG: Thoroughly dedup column names in read_csv
1 parent c6e5bf6 commit 8b55754

File tree

3 files changed

+29
-3
lines changed

3 files changed

+29
-3
lines changed

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ I/O
278278
^^^
279279

280280
- Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`)
281+
- Bug in :func:`read_csv` in which specified column names were not being thoroughly de-duplicated (:issue:`17095`)
281282
- Bug in :func:`read_csv` in which non-integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
282283
- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
283284
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).

pandas/io/parsers.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -1323,9 +1323,12 @@ def _maybe_dedup_names(self, names):
13231323
for i, col in enumerate(names):
13241324
cur_count = counts.get(col, 0)
13251325

1326-
if cur_count > 0:
1327-
names[i] = '%s.%d' % (col, cur_count)
1326+
while cur_count > 0:
1327+
counts[col] = cur_count + 1
1328+
col = '%s.%d' % (col, cur_count)
1329+
cur_count = counts.get(col, 0)
13281330

1331+
names[i] = col
13291332
counts[col] = cur_count + 1
13301333

13311334
return names

pandas/tests/io/parser/mangle_dupes.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def test_basic(self):
2525
mangle_dupe_cols=True)
2626
assert list(df.columns) == expected
2727

28-
def test_thorough_mangle(self):
28+
def test_thorough_mangle_columns(self):
2929
# see gh-17060
3030
data = "a,a,a.1\n1,2,3"
3131
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
@@ -40,3 +40,25 @@ def test_thorough_mangle(self):
4040
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
4141
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
4242
"a.2", "a.2.1", "a.3.1"]
43+
44+
def test_thorough_mangle_names(self):
45+
# see gh-17095
46+
data = "a,b,b\n1,2,3"
47+
names = ["a", "a", "a.1"]
48+
df = self.read_csv(StringIO(data), sep=",", names=names,
49+
mangle_dupe_cols=True)
50+
assert list(df.columns) == ["a", "a.1", "a.1.1"]
51+
52+
data = "a,b,c,d,e,f\n1,2,3,4,5,6"
53+
names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
54+
df = self.read_csv(StringIO(data), sep=",", names=names,
55+
mangle_dupe_cols=True)
56+
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
57+
"a.1.1.1.1", "a.1.1.1.1.1"]
58+
59+
data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
60+
names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
61+
df = self.read_csv(StringIO(data), sep=",", names=names,
62+
mangle_dupe_cols=True)
63+
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
64+
"a.2", "a.2.1", "a.3.1"]

0 commit comments

Comments
 (0)