Skip to content

Commit 7f06a8a

Browse files
authored
BUG: mangle_dup_cols in read_csv replacing existing cols when conflict with target col (#44641)
1 parent 0a76877 commit 7f06a8a

File tree

4 files changed

+117
-28
lines changed

4 files changed

+117
-28
lines changed

doc/source/whatsnew/v1.4.0.rst

+35
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,41 @@ Now null-values are no longer mangled.
304304

305305
*New behavior*:
306306

307+
.. ipython:: python
308+
309+
res
310+
311+
.. _whatsnew_140.notable_bug_fixes.read_csv_mangle_dup_cols:
312+
313+
mangle_dupe_cols in read_csv no longer renames unique columns conflicting with target names
314+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
315+
316+
:func:`read_csv` no longer renames unique column labels which conflict with the target names of duplicated columns.
317+
Already existing columns are skipped, i.e. the next available index is used for the target column name (:issue:`14704`).
318+
319+
.. ipython:: python
320+
321+
import io
322+
323+
data = "a,a,a.1\n1,2,3"
324+
res = pd.read_csv(io.StringIO(data))
325+
326+
Previously, the second column was called ``a.1``, while the third column was also renamed to ``a.1.1``.
327+
328+
*Previous behavior*:
329+
330+
.. code-block:: ipython
331+
332+
In [3]: res
333+
Out[3]:
334+
a a.1 a.1.1
335+
0 1 2 3
336+
337+
Now the renaming checks if ``a.1`` already exists when changing the name of the second column and skips this index. The
338+
second column is instead renamed to ``a.2``.
339+
340+
*New behavior*:
341+
307342
.. ipython:: python
308343
309344
res

pandas/_libs/parsers.pyx

+32-19
Original file line numberDiff line numberDiff line change
@@ -657,46 +657,58 @@ cdef class TextReader:
657657
field_count = self.parser.line_fields[hr]
658658
start = self.parser.line_start[hr]
659659

660-
counts = {}
661660
unnamed_count = 0
661+
unnamed_col_indices = []
662662

663663
for i in range(field_count):
664664
word = self.parser.words[start + i]
665665

666666
name = PyUnicode_DecodeUTF8(word, strlen(word),
667667
self.encoding_errors)
668668

669-
# We use this later when collecting placeholder names.
670-
old_name = name
671-
672669
if name == '':
673670
if self.has_mi_columns:
674671
name = f'Unnamed: {i}_level_{level}'
675672
else:
676673
name = f'Unnamed: {i}'
674+
677675
unnamed_count += 1
676+
unnamed_col_indices.append(i)
677+
678+
this_header.append(name)
678679

679-
count = counts.get(name, 0)
680+
if not self.has_mi_columns and self.mangle_dupe_cols:
681+
# Ensure that regular columns are used before unnamed ones
682+
# to keep given names and mangle unnamed columns
683+
col_loop_order = [i for i in range(len(this_header))
684+
if i not in unnamed_col_indices
685+
] + unnamed_col_indices
686+
counts = {}
687+
688+
for i in col_loop_order:
689+
col = this_header[i]
690+
old_col = col
691+
cur_count = counts.get(col, 0)
692+
693+
if cur_count > 0:
694+
while cur_count > 0:
695+
counts[old_col] = cur_count + 1
696+
col = f'{old_col}.{cur_count}'
697+
if col in this_header:
698+
cur_count += 1
699+
else:
700+
cur_count = counts.get(col, 0)
680701

681-
if not self.has_mi_columns and self.mangle_dupe_cols:
682-
if count > 0:
683-
while count > 0:
684-
counts[name] = count + 1
685-
name = f'{name}.{count}'
686-
count = counts.get(name, 0)
687702
if (
688703
self.dtype is not None
689704
and is_dict_like(self.dtype)
690-
and self.dtype.get(old_name) is not None
691-
and self.dtype.get(name) is None
705+
and self.dtype.get(old_col) is not None
706+
and self.dtype.get(col) is None
692707
):
693-
self.dtype.update({name: self.dtype.get(old_name)})
694-
695-
if old_name == '':
696-
unnamed_cols.add(name)
708+
self.dtype.update({col: self.dtype.get(old_col)})
697709

698-
this_header.append(name)
699-
counts[name] = count + 1
710+
this_header[i] = col
711+
counts[col] = cur_count + 1
700712

701713
if self.has_mi_columns:
702714

@@ -716,6 +728,7 @@ cdef class TextReader:
716728

717729
data_line = hr + 1
718730
header.append(this_header)
731+
unnamed_cols.update({this_header[i] for i in unnamed_col_indices})
719732

720733
if self.names is not None:
721734
header = [self.names]

pandas/io/parsers/python_parser.py

+17-5
Original file line numberDiff line numberDiff line change
@@ -401,16 +401,28 @@ def _infer_columns(self):
401401

402402
if not have_mi_columns and self.mangle_dupe_cols:
403403
counts: DefaultDict = defaultdict(int)
404-
405-
for i, col in enumerate(this_columns):
404+
# Ensure that regular columns are used before unnamed ones
405+
# to keep given names and mangle unnamed columns
406+
col_loop_order = [
407+
i
408+
for i in range(len(this_columns))
409+
if i not in this_unnamed_cols
410+
] + this_unnamed_cols
411+
412+
for i in col_loop_order:
413+
col = this_columns[i]
406414
old_col = col
407415
cur_count = counts[col]
408416

409417
if cur_count > 0:
410418
while cur_count > 0:
411-
counts[col] = cur_count + 1
412-
col = f"{col}.{cur_count}"
413-
cur_count = counts[col]
419+
counts[old_col] = cur_count + 1
420+
col = f"{old_col}.{cur_count}"
421+
if col in this_columns:
422+
cur_count += 1
423+
else:
424+
cur_count = counts[col]
425+
414426
if (
415427
self.dtype is not None
416428
and is_dict_like(self.dtype)

pandas/tests/io/parser/test_mangle_dupes.py

+33-4
Original file line numberDiff line numberDiff line change
@@ -52,19 +52,19 @@ def test_basic_names_raise(all_parsers):
5252
@pytest.mark.parametrize(
5353
"data,expected",
5454
[
55-
("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])),
55+
("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])),
5656
(
5757
"a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
5858
DataFrame(
5959
[[1, 2, 3, 4, 5, 6]],
60-
columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
60+
columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
6161
),
6262
),
6363
(
6464
"a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
6565
DataFrame(
6666
[[1, 2, 3, 4, 5, 6, 7]],
67-
columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
67+
columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
6868
),
6969
),
7070
],
@@ -131,9 +131,38 @@ def test_mangled_unnamed_placeholders(all_parsers):
131131
expected = DataFrame()
132132

133133
for j in range(i + 1):
134-
expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]
134+
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
135+
expected.insert(loc=0, column=col_name, value=[0, 1, 2])
135136

136137
expected[orig_key] = orig_value
137138
df = parser.read_csv(StringIO(df.to_csv()))
138139

139140
tm.assert_frame_equal(df, expected)
141+
142+
143+
@skip_pyarrow
144+
def test_mangle_dupe_cols_already_exists(all_parsers):
145+
# GH#14704
146+
parser = all_parsers
147+
148+
data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
149+
result = parser.read_csv(StringIO(data))
150+
expected = DataFrame(
151+
[[1, 2, 3, 4, 5, 6, 7]],
152+
columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"],
153+
)
154+
tm.assert_frame_equal(result, expected)
155+
156+
157+
@skip_pyarrow
158+
def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
159+
# GH#14704
160+
parser = all_parsers
161+
162+
data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4"
163+
result = parser.read_csv(StringIO(data))
164+
expected = DataFrame(
165+
[[1, 2, 3, 4]],
166+
columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"],
167+
)
168+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)