From 91836bd23fa815f1f14e33e0d64bc8e6e186c433 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 18 Nov 2023 14:32:49 +0100 Subject: [PATCH 1/7] BUG: read_csv not respecting object dtype when option is set --- doc/source/whatsnew/v2.1.4.rst | 1 + pandas/io/parsers/readers.py | 27 ++++++++++++-- .../io/parser/dtypes/test_dtypes_basic.py | 35 +++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 25afcbb3bb532..d467bcac39079 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) +- Fixed bug in :func:`read_csv` not respecting object dtype when ``infer_string`` option is set (:issue:`56047`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 83d75508920a4..e8fe07824323b 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -5,7 +5,10 @@ """ from __future__ import annotations -from collections import abc +from collections import ( + abc, + defaultdict, +) import csv import sys from textwrap import fill @@ -38,8 +41,10 @@ is_float, is_integer, is_list_like, + pandas_dtype, ) +from pandas import Series from pandas.core.frame import DataFrame from pandas.core.indexes.api import RangeIndex from pandas.core.shared_docs import _shared_docs @@ -1843,7 +1848,25 @@ def read(self, nrows: int | None = None) -> DataFrame: else: new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + if hasattr(self, "orig_options"): + dtype_arg = self.orig_options.get("dtype", None) + else: + dtype_arg = None + + if dtype_arg is None: + dtype = defaultdict(lambda: None) + elif isinstance(dtype_arg, dict): + dtype = defaultdict(lambda: None) + dtype.update(dtype_arg) + else: + dtype = defaultdict(lambda: dtype_arg) + + new_col_dict = {} + for k, v in col_dict.items(): + d = dtype[k] if pandas_dtype(dtype[k]) == "object" else None + new_col_dict[k] = Series(v, index=index, dtype=d) + + df = DataFrame(new_col_dict, columns=columns, index=index) self._currow += new_rows return df diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 32b4b1dedc3cb..fc12368511374 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -571,6 +571,41 @@ def test_string_inference(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_]) +def test_string_inference_object_dtype(all_parsers, dtype): + # GH#56047 + pytest.importorskip("pyarrow") + + data = """a,b +x,a +y,a +z,a""" + parser = all_parsers + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data), dtype=dtype) + + expected = DataFrame( + { + "a": pd.Series(["x", "y", "z"], dtype=object), + "b": pd.Series(["a", "a", "a"], dtype=object), + }, + columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data), dtype={"a": dtype}) + + expected = DataFrame( + { + "a": pd.Series(["x", "y", "z"], dtype=object), + "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + }, + columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL From 3c946b3618e98ab46daeb67c5b1cfa0ec1c0730a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 21 Nov 2023 21:19:17 +0100 Subject: [PATCH 2/7] Update readers.py --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index e8fe07824323b..f8c70bdc0966c 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1854,7 +1854,7 @@ def read(self, nrows: int | None = None) -> DataFrame: dtype_arg = None if dtype_arg is None: - dtype = defaultdict(lambda: None) + dtype = defaultdict(lambda: None) # type: ignore[var-annotated] elif isinstance(dtype_arg, dict): dtype = defaultdict(lambda: None) dtype.update(dtype_arg) From 7f705032a03453a5cfd31a6ba27706d4f8cd9585 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 22:42:30 +0100 Subject: [PATCH 3/7] Cover str too --- pandas/io/parsers/readers.py | 6 +++++- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 3b0b1e973974b..d2b2134d227df 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1866,7 +1866,11 @@ def read(self, nrows: int | None = None) -> DataFrame: new_col_dict = {} for k, v in col_dict.items(): - d = dtype[k] if pandas_dtype(dtype[k]) == "object" else None + d = ( + dtype[k] + if pandas_dtype(dtype[k]) in (np.str_, np.object_) + else None + ) new_col_dict[k] = Series(v, index=index, dtype=d) df = DataFrame(new_col_dict, columns=columns, index=index) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index fc12368511374..d24de412cf807 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -571,7 +571,7 @@ def test_string_inference(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_]) +@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) def test_string_inference_object_dtype(all_parsers, dtype): # GH#56047 pytest.importorskip("pyarrow") From 867abceaa449dc4787eb4aaf3db8d8f941c1d0cb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 1 Dec 2023 21:34:46 +0100 Subject: [PATCH 4/7] Adjust --- pandas/io/parsers/readers.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index d2b2134d227df..11f75a871d6b2 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1856,22 +1856,28 @@ def read(self, nrows: int | None = None) -> DataFrame: else: dtype_arg = None - if dtype_arg is None: - dtype = defaultdict(lambda: None) # type: ignore[var-annotated] - elif isinstance(dtype_arg, dict): + if isinstance(dtype_arg, dict): dtype = defaultdict(lambda: None) dtype.update(dtype_arg) - else: + elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( + np.str_, + np.object_, + ): dtype = defaultdict(lambda: dtype_arg) - - new_col_dict = {} - for k, v in col_dict.items(): - d = ( - dtype[k] - if pandas_dtype(dtype[k]) in (np.str_, np.object_) - else None - ) - new_col_dict[k] = Series(v, index=index, dtype=d) + else: + dtype = None + + if dtype is not None: + new_col_dict = {} + for k, v in col_dict.items(): + d = ( + dtype[k] + if pandas_dtype(dtype_arg) in (np.str_, np.object_) + else None + ) + new_col_dict[k] = Series(v, index=index, dtype=d) + else: + new_col_dict = col_dict df = DataFrame(new_col_dict, columns=columns, index=index) From 3031d0d970774020949096c8603aba51538ef5a2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 1 Dec 2023 23:06:50 +0100 Subject: [PATCH 5/7] Fixup --- pandas/io/parsers/readers.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 11f75a871d6b2..c74b4653715eb 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -26,6 +26,8 @@ import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES from pandas.errors import ( @@ -1872,14 +1874,19 @@ def read(self, nrows: int | None = None) -> DataFrame: for k, v in col_dict.items(): d = ( dtype[k] - if pandas_dtype(dtype_arg) in (np.str_, np.object_) + if pandas_dtype(dtype[k]) in (np.str_, np.object_) else None ) - new_col_dict[k] = Series(v, index=index, dtype=d) + new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) else: new_col_dict = col_dict - df = DataFrame(new_col_dict, columns=columns, index=index) + df = DataFrame( + new_col_dict, + columns=columns, + index=index, + copy=not using_copy_on_write(), + ) self._currow += new_rows return df From 51a367e05ac4644c3b6f18edc2431341480b3ec8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 2 Dec 2023 01:11:12 +0100 Subject: [PATCH 6/7] Fixup --- pandas/io/parsers/arrow_parser_wrapper.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 037239e4ec83c..fba6ccc95f231 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -295,18 +295,8 @@ def read(self) -> DataFrame: dtype_mapping[pa.null()] = pd.Int64Dtype() frame = table.to_pandas(types_mapper=dtype_mapping.get) elif using_pyarrow_string_dtype(): - - def types_mapper(dtype): - dtype_dict = self.kwds["dtype"] - if dtype_dict is not None and dtype_dict.get(dtype, None) is not None: - return dtype_dict.get(dtype) - return arrow_string_types_mapper()(dtype) - - frame = table.to_pandas(types_mapper=types_mapper) + frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: - if isinstance(self.kwds.get("dtype"), dict): - frame = table.to_pandas(types_mapper=self.kwds["dtype"].get) - else: - frame = table.to_pandas() + frame = table.to_pandas() return self._finalize_pandas_output(frame) From d38b9eb2f5789151bcb66a47e27ac5eca8e13e8e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 8 Dec 2023 22:45:51 +0100 Subject: [PATCH 7/7] Update readers.py --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c74b4653715eb..2f9243c895ae8 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1859,7 +1859,7 @@ def read(self, nrows: int | None = None) -> DataFrame: dtype_arg = None if isinstance(dtype_arg, dict): - dtype = defaultdict(lambda: None) + dtype = defaultdict(lambda: None) # type: ignore[var-annotated] dtype.update(dtype_arg) elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( np.str_,