From e344a33b4dfaa25695e376814a65e94b02e4bc0d Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 18 Feb 2022 17:50:48 +0100 Subject: [PATCH 1/2] BUG: read_csv not respecting converter in all cases for index col --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/io/parsers/base_parser.py | 8 +++++++- pandas/io/parsers/python_parser.py | 1 - pandas/tests/io/parser/test_converters.py | 20 ++++++++++++++++---- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index c8b2617ffc535..478337f4db24e 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -357,6 +357,7 @@ I/O - Bug in :meth:`DataFrame.info` where a new line at the end of the output is omitted when called on an empty :class:`DataFrame` (:issue:`45494`) - Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`) - Bug in :meth:`DataFrame.to_csv` not respecting ``float_format`` for ``Float64`` dtype (:issue:`45991`) +- Bug in :func:`read_csv` not respecting a specified converter to index columns in all cases (:issue:`40589`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 7927439abb510..c76e40677ad78 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -103,6 +103,7 @@ def __init__(self, kwds): self.keep_default_na = kwds.get("keep_default_na", True) self.dtype = copy(kwds.get("dtype", None)) + self.converters = kwds.get("converters") self.true_values = kwds.get("true_values") self.false_values = kwds.get("false_values") @@ -476,6 +477,7 @@ def _clean_mapping(self, mapping): @final def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arrays = [] + converters = self._clean_mapping(self.converters) for i, arr in enumerate(index): @@ -503,7 +505,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: if isinstance(clean_dtypes, dict) and self.index_names is not None: cast_type = clean_dtypes.get(self.index_names[i], None) - try_num_bool = not (cast_type and is_string_dtype(cast_type)) + conv = False + if isinstance(converters, dict) and self.index_names is not None: + conv = converters.get(self.index_names[i]) is not None + + try_num_bool = not (cast_type and is_string_dtype(cast_type) or conv) arr, _ = self._infer_types( arr, col_na_values | col_na_fvalues, try_num_bool diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 8a66a5c22caf5..92031cb04e768 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -95,7 +95,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds): self.has_index_names = kwds["has_index_names"] self.verbose = kwds["verbose"] - self.converters = kwds["converters"] self.thousands = kwds["thousands"] self.decimal = kwds["decimal"] diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 21933d83ce3f4..85f3db0398080 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -152,16 +152,28 @@ def convert_score(x): tm.assert_frame_equal(results[0], results[1]) -def test_converter_index_col_bug(all_parsers): - # see gh-1835 +@pytest.mark.parametrize("conv_f", [lambda x: x, str]) +def test_converter_index_col_bug(all_parsers, conv_f): + # see gh-1835 , GH#40589 parser = all_parsers data = "A;B\n1;2\n3;4" rs = parser.read_csv( - StringIO(data), sep=";", index_col="A", converters={"A": lambda x: x} + StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) - xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A")) + xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object")) + tm.assert_frame_equal(rs, xp) + + +def test_converter_identity_object(all_parsers): + # GH#40589 + parser = all_parsers + data = "A,B\n1,2\n3,4" + + rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x}) + + xp = DataFrame({"A": ["1", "3"], "B": [2, 4]}) tm.assert_frame_equal(rs, xp) From 6e5667c5b059341e6843922b6af9a687a3f22813 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 22 Feb 2022 23:51:02 +0100 Subject: [PATCH 2/2] Restructure --- pandas/io/parsers/base_parser.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c76e40677ad78..e071e281d5a90 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -502,14 +502,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: clean_dtypes = self._clean_mapping(self.dtype) cast_type = None - if isinstance(clean_dtypes, dict) and self.index_names is not None: - cast_type = clean_dtypes.get(self.index_names[i], None) + index_converter = False + if self.index_names is not None: + if isinstance(clean_dtypes, dict): + cast_type = clean_dtypes.get(self.index_names[i], None) - conv = False - if isinstance(converters, dict) and self.index_names is not None: - conv = converters.get(self.index_names[i]) is not None + if isinstance(converters, dict): + index_converter = converters.get(self.index_names[i]) is not None - try_num_bool = not (cast_type and is_string_dtype(cast_type) or conv) + try_num_bool = not ( + cast_type and is_string_dtype(cast_type) or index_converter + ) arr, _ = self._infer_types( arr, col_na_values | col_na_fvalues, try_num_bool