Skip to content

Commit 611d296

Browse files
gfyoung authored and jreback committed
BUG: Thoroughly dedup column names in read_csv (#17095)
1 parent f394409 commit 611d296

File tree

3 files changed

+35
-8
lines changed

3 files changed

+35
-8
lines changed

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ I/O
279279
^^^
280280

281281
- Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`)
282+
- Bug in :func:`read_csv` in which specified column names were not being thoroughly de-duplicated (:issue:`17095`)
282283
- Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
283284
- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
284285
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).

pandas/io/parsers.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -1318,14 +1318,18 @@ def _maybe_dedup_names(self, names):
13181318
# would be nice!
13191319
if self.mangle_dupe_cols:
13201320
names = list(names) # so we can index
1321-
counts = {}
1321+
counts = defaultdict(int)
13221322

13231323
for i, col in enumerate(names):
1324-
cur_count = counts.get(col, 0)
1324+
cur_count = counts[col]
13251325

1326-
if cur_count > 0:
1327-
names[i] = '%s.%d' % (col, cur_count)
1326+
while cur_count > 0:
1327+
counts[col] = cur_count + 1
13281328

1329+
col = '%s.%d' % (col, cur_count)
1330+
cur_count = counts[col]
1331+
1332+
names[i] = col
13291333
counts[col] = cur_count + 1
13301334

13311335
return names
@@ -2330,15 +2334,15 @@ def _infer_columns(self):
23302334
this_columns.append(c)
23312335

23322336
if not have_mi_columns and self.mangle_dupe_cols:
2333-
counts = {}
2337+
counts = defaultdict(int)
23342338

23352339
for i, col in enumerate(this_columns):
2336-
cur_count = counts.get(col, 0)
2340+
cur_count = counts[col]
23372341

23382342
while cur_count > 0:
23392343
counts[col] = cur_count + 1
23402344
col = "%s.%d" % (col, cur_count)
2341-
cur_count = counts.get(col, 0)
2345+
cur_count = counts[col]
23422346

23432347
this_columns[i] = col
23442348
counts[col] = cur_count + 1

pandas/tests/io/parser/mangle_dupes.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def test_basic(self):
2525
mangle_dupe_cols=True)
2626
assert list(df.columns) == expected
2727

28-
def test_thorough_mangle(self):
28+
def test_thorough_mangle_columns(self):
2929
# see gh-17060
3030
data = "a,a,a.1\n1,2,3"
3131
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
@@ -40,3 +40,25 @@ def test_thorough_mangle(self):
4040
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
4141
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
4242
"a.2", "a.2.1", "a.3.1"]
43+
44+
def test_thorough_mangle_names(self):
45+
# see gh-17095
46+
data = "a,b,b\n1,2,3"
47+
names = ["a.1", "a.1", "a.1.1"]
48+
df = self.read_csv(StringIO(data), sep=",", names=names,
49+
mangle_dupe_cols=True)
50+
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
51+
52+
data = "a,b,c,d,e,f\n1,2,3,4,5,6"
53+
names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
54+
df = self.read_csv(StringIO(data), sep=",", names=names,
55+
mangle_dupe_cols=True)
56+
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
57+
"a.1.1.1.1", "a.1.1.1.1.1"]
58+
59+
data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
60+
names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
61+
df = self.read_csv(StringIO(data), sep=",", names=names,
62+
mangle_dupe_cols=True)
63+
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
64+
"a.2", "a.2.1", "a.3.1"]

0 commit comments

Comments
 (0)