BUG: mangle_dup_cols in read_csv replacing existing cols when conflict with target col (#44641)

phofl · web-flow · commit 7f06a8a577d8 · 2021-11-28T18:02:44.000-05:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -304,6 +304,41 @@ Now null-values are no longer mangled.
 
 *New behavior*:
 
+.. ipython:: python
+
+    res
+
+.. _whatsnew_140.notable_bug_fixes.read_csv_mangle_dup_cols:
+
+mangle_dupe_cols in read_csv no longer renaming unique columns conflicting with target names
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`read_csv` no longer renaming unique cols, which conflict with the target names of duplicated columns.
+Already existing columns are jumped, e.g. the next available index is used for the target column name (:issue:`14704`).
+
+.. ipython:: python
+
+    import io
+
+    data = "a,a,a.1\n1,2,3"
+    res = pd.read_csv(io.StringIO(data))
+
+Previously, the second column was called ``a.1``, while the third col was also renamed to ``a.1.1``.
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [3]: res
+    Out[3]:
+        a  a.1  a.1.1
+    0   1    2      3
+
+Now the renaming checks if ``a.1`` already exists when changing the name of the second column and jumps this index. The
+second column is instead renamed to ``a.2``.
+
+*New behavior*:
+
 .. ipython:: python
 
     res
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -657,46 +657,58 @@ cdef class TextReader:
                     field_count = self.parser.line_fields[hr]
                     start = self.parser.line_start[hr]
 
-                counts = {}
                 unnamed_count = 0
+                unnamed_col_indices = []
 
                 for i in range(field_count):
                     word = self.parser.words[start + i]
 
                     name = PyUnicode_DecodeUTF8(word, strlen(word),
                                                 self.encoding_errors)
 
-                    # We use this later when collecting placeholder names.
-                    old_name = name
-
                     if name == '':
                         if self.has_mi_columns:
                             name = f'Unnamed: {i}_level_{level}'
                         else:
                             name = f'Unnamed: {i}'
+
                         unnamed_count += 1
+                        unnamed_col_indices.append(i)
+
+                    this_header.append(name)
 
-                    count = counts.get(name, 0)
+                if not self.has_mi_columns and self.mangle_dupe_cols:
+                    # Ensure that regular columns are used before unnamed ones
+                    # to keep given names and mangle unnamed columns
+                    col_loop_order = [i for i in range(len(this_header))
+                                      if i not in unnamed_col_indices
+                                      ] + unnamed_col_indices
+                    counts = {}
+
+                    for i in col_loop_order:
+                        col = this_header[i]
+                        old_col = col
+                        cur_count = counts.get(col, 0)
+
+                        if cur_count > 0:
+                            while cur_count > 0:
+                                counts[old_col] = cur_count + 1
+                                col = f'{old_col}.{cur_count}'
+                                if col in this_header:
+                                    cur_count += 1
+                                else:
+                                    cur_count = counts.get(col, 0)
 
-                    if not self.has_mi_columns and self.mangle_dupe_cols:
-                        if count > 0:
-                            while count > 0:
-                                counts[name] = count + 1
-                                name = f'{name}.{count}'
-                                count = counts.get(name, 0)
                             if (
                                 self.dtype is not None
                                 and is_dict_like(self.dtype)
-                                and self.dtype.get(old_name) is not None
-                                and self.dtype.get(name) is None
+                                and self.dtype.get(old_col) is not None
+                                and self.dtype.get(col) is None
                             ):
-                                self.dtype.update({name: self.dtype.get(old_name)})
-
-                    if old_name == '':
-                        unnamed_cols.add(name)
+                                self.dtype.update({col: self.dtype.get(old_col)})
 
-                    this_header.append(name)
-                    counts[name] = count + 1
+                        this_header[i] = col
+                        counts[col] = cur_count + 1
 
                 if self.has_mi_columns:
 
@@ -716,6 +728,7 @@ cdef class TextReader:
 
                 data_line = hr + 1
                 header.append(this_header)
+                unnamed_cols.update({this_header[i] for i in unnamed_col_indices})
 
             if self.names is not None:
                 header = [self.names]
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -401,16 +401,28 @@ def _infer_columns(self):
 
                 if not have_mi_columns and self.mangle_dupe_cols:
                     counts: DefaultDict = defaultdict(int)
-
-                    for i, col in enumerate(this_columns):
+                    # Ensure that regular columns are used before unnamed ones
+                    # to keep given names and mangle unnamed columns
+                    col_loop_order = [
+                        i
+                        for i in range(len(this_columns))
+                        if i not in this_unnamed_cols
+                    ] + this_unnamed_cols
+
+                    for i in col_loop_order:
+                        col = this_columns[i]
                         old_col = col
                         cur_count = counts[col]
 
                         if cur_count > 0:
                             while cur_count > 0:
-                                counts[col] = cur_count + 1
-                                col = f"{col}.{cur_count}"
-                                cur_count = counts[col]
+                                counts[old_col] = cur_count + 1
+                                col = f"{old_col}.{cur_count}"
+                                if col in this_columns:
+                                    cur_count += 1
+                                else:
+                                    cur_count = counts[col]
+
                             if (
                                 self.dtype is not None
                                 and is_dict_like(self.dtype)
diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -52,19 +52,19 @@ def test_basic_names_raise(all_parsers):
 @pytest.mark.parametrize(
     "data,expected",
     [
-        ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])),
+        ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])),
         (
             "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
             DataFrame(
                 [[1, 2, 3, 4, 5, 6]],
-                columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
+                columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
             ),
         ),
         (
             "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
             DataFrame(
                 [[1, 2, 3, 4, 5, 6, 7]],
-                columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
+                columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
             ),
         ),
     ],
@@ -131,9 +131,38 @@ def test_mangled_unnamed_placeholders(all_parsers):
         expected = DataFrame()
 
         for j in range(i + 1):
-            expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]
+            col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
+            expected.insert(loc=0, column=col_name, value=[0, 1, 2])
 
         expected[orig_key] = orig_value
         df = parser.read_csv(StringIO(df.to_csv()))
 
         tm.assert_frame_equal(df, expected)
+
+
+@skip_pyarrow
+def test_mangle_dupe_cols_already_exists(all_parsers):
+    # GH#14704
+    parser = all_parsers
+
+    data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
+    result = parser.read_csv(StringIO(data))
+    expected = DataFrame(
+        [[1, 2, 3, 4, 5, 6, 7]],
+        columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow
+def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
+    # GH#14704
+    parser = all_parsers
+
+    data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4"
+    result = parser.read_csv(StringIO(data))
+    expected = DataFrame(
+        [[1, 2, 3, 4]],
+        columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"],
+    )
+    tm.assert_frame_equal(result, expected)