From bda81583b422b7c1b5f2c6f9cd64797edf3e02a7 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 27 Nov 2021 20:03:54 +0100 Subject: [PATCH 1/4] BUG: mangle_dup_cols in read_csv replacing existing cols when conflic with target col --- pandas/_libs/parsers.pyx | 49 ++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d2975f83b97d7..135c4802f073c 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -657,8 +657,8 @@ cdef class TextReader: field_count = self.parser.line_fields[hr] start = self.parser.line_start[hr] - counts = {} unnamed_count = 0 + unnamed_col_indices = [] for i in range(field_count): word = self.parser.words[start + i] @@ -666,37 +666,47 @@ cdef class TextReader: name = PyUnicode_DecodeUTF8(word, strlen(word), self.encoding_errors) - # We use this later when collecting placeholder names. - old_name = name - if name == '': if self.has_mi_columns: name = f'Unnamed: {i}_level_{level}' else: name = f'Unnamed: {i}' + unnamed_count += 1 + unnamed_col_indices.append(i) + + this_header.append(name) - count = counts.get(name, 0) + if not self.has_mi_columns and self.mangle_dupe_cols: + col_loop_order = [i for i in range(len(this_header)) + if i not in unnamed_col_indices + ] + unnamed_col_indices + counts = {} + + for i in col_loop_order: + col = this_header[i] + old_col = col + cur_count = counts.get(col, 0) + + if cur_count > 0: + while cur_count > 0: + counts[old_col] = cur_count + 1 + col = f'{old_col}.{cur_count}' + if col in this_header: + cur_count += 1 + else: + cur_count = counts.get(col, 0) - if not self.has_mi_columns and self.mangle_dupe_cols: - if count > 0: - while count > 0: - counts[name] = count + 1 - name = f'{name}.{count}' - count = counts.get(name, 0) if ( self.dtype is not None and is_dict_like(self.dtype) - and self.dtype.get(old_name) is not None - and self.dtype.get(name) is None + and self.dtype.get(old_col) is 
not None + and self.dtype.get(col) is None ): - self.dtype.update({name: self.dtype.get(old_name)}) - - if old_name == '': - unnamed_cols.add(name) + self.dtype.update({col: self.dtype.get(old_col)}) - this_header.append(name) - counts[name] = count + 1 + this_header[i] = col + counts[col] = cur_count + 1 if self.has_mi_columns: @@ -716,6 +726,7 @@ cdef class TextReader: data_line = hr + 1 header.append(this_header) + unnamed_cols.update({this_header[i] for i in unnamed_col_indices}) if self.names is not None: header = [self.names] From fba1e1d97fabc7c780d14bef388e7eb2e24896ef Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 27 Nov 2021 20:02:03 +0100 Subject: [PATCH 2/4] BUG: mangle_dup_cols in read_csv replacing existing cols when conflic with target col --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/parsers/python_parser.py | 20 ++++++++--- pandas/tests/io/parser/test_mangle_dupes.py | 37 ++++++++++++++++++--- 3 files changed, 49 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index e87f5f53256cf..0f915533fc154 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -667,6 +667,7 @@ I/O - Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`) - Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`) - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`) +- Bug in :func:`read_csv` replacing existing column names if mangling of duplicate columns conflicts with the target column (:issue:`14704`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. 
(:issue:`39465`) - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index a9d97874304ad..6e42076e70dc6 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -415,16 +415,26 @@ def _infer_columns(self): if not have_mi_columns and self.mangle_dupe_cols: counts: DefaultDict = defaultdict(int) - - for i, col in enumerate(this_columns): + col_loop_order = [ + i + for i in range(len(this_columns)) + if i not in this_unnamed_cols + ] + this_unnamed_cols + + for i in col_loop_order: + col = this_columns[i] old_col = col cur_count = counts[col] if cur_count > 0: while cur_count > 0: - counts[col] = cur_count + 1 - col = f"{col}.{cur_count}" - cur_count = counts[col] + counts[old_col] = cur_count + 1 + col = f"{old_col}.{cur_count}" + if col in this_columns: + cur_count += 1 + else: + cur_count = counts[col] + if ( self.dtype is not None and is_dict_like(self.dtype) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 6473e6c7670c8..3f7b1b5dfa19b 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -52,19 +52,19 @@ def test_basic_names_raise(all_parsers): @pytest.mark.parametrize( "data,expected", [ - ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])), + ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])), ( "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6", DataFrame( [[1, 2, 3, 4, 5, 6]], - columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"], + columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], ), ), ( "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7", DataFrame( [[1, 2, 3, 4, 5, 6, 7]], - columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"], + columns=["a", "a.4", "a.3", 
"a.1", "a.2", "a.5", "a.6"], ), ), ], @@ -131,9 +131,38 @@ def test_mangled_unnamed_placeholders(all_parsers): expected = DataFrame() for j in range(i + 1): - expected["Unnamed: 0" + ".1" * j] = [0, 1, 2] + col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) + expected.insert(loc=0, column=col_name, value=[0, 1, 2]) expected[orig_key] = orig_value df = parser.read_csv(StringIO(df.to_csv())) tm.assert_frame_equal(df, expected) + + +@skip_pyarrow +def test_mangle_dupe_cols_already_exists(all_parsers): + # GH#14704 + parser = all_parsers + + data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7" + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [[1, 2, 3, 4, 5, 6, 7]], + columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"], + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers): + # GH#14704 + parser = all_parsers + + data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4" + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [[1, 2, 3, 4]], + columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"], + ) + tm.assert_frame_equal(result, expected) From eff651dfd23277971ddb7b9b0552aa628080eaae Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 28 Nov 2021 03:20:04 +0100 Subject: [PATCH 3/4] Add comment --- pandas/_libs/parsers.pyx | 2 ++ pandas/io/parsers/python_parser.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 135c4802f073c..fe2e84631d3b4 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -678,6 +678,8 @@ cdef class TextReader: this_header.append(name) if not self.has_mi_columns and self.mangle_dupe_cols: + # Ensure that regular columns are used before unnamed ones + # to keep given names and mangle unnamed columns col_loop_order = [i for i in range(len(this_header)) if i not in unnamed_col_indices ] + unnamed_col_indices diff --git a/pandas/io/parsers/python_parser.py 
b/pandas/io/parsers/python_parser.py index 6e42076e70dc6..43797ce7d30bd 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -415,6 +415,8 @@ def _infer_columns(self): if not have_mi_columns and self.mangle_dupe_cols: counts: DefaultDict = defaultdict(int) + # Ensure that regular columns are used before unnamed ones + # to keep given names and mangle unnamed columns col_loop_order = [ i for i in range(len(this_columns)) From 92d5b55b35d4335f7cd6516581db5cfd571ba88c Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 28 Nov 2021 22:01:48 +0100 Subject: [PATCH 4/4] Add block in bug fixes --- doc/source/whatsnew/v1.4.0.rst | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index aa72b1487c50f..247c9057118b2 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -304,6 +304,41 @@ Now null-values are no longer mangled. *New behavior*: +.. ipython:: python + + res + +.. _whatsnew_140.notable_bug_fixes.read_csv_mangle_dup_cols: + +mangle_dupe_cols in read_csv no longer renaming unique columns conflicting with target names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_csv` no longer renames unique columns that conflict with the target names of duplicated columns. +Already existing columns are skipped, i.e. the next available index is used for the target column name (:issue:`14704`). + +.. ipython:: python + + import io + + data = "a,a,a.1\n1,2,3" + res = pd.read_csv(io.StringIO(data)) + +Previously, the second column was called ``a.1``, while the third column was renamed to ``a.1.1``. + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: res + Out[3]: + a a.1 a.1.1 + 0 1 2 3 + +Now the renaming checks if ``a.1`` already exists when changing the name of the second column and skips this index. The
The +second column is instead renamed to ``a.2``. + +*New behavior*: + .. ipython:: python res @@ -675,7 +710,6 @@ I/O - Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`) - Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`) - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`) -- Bug in :func:`read_csv` replacing existing column names if mangling of duplicate columns conflicts with the target column (:issue:`14704`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`) - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`)