From 39042458ff6570cacbc4a22090d7bc0f49a872fc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 30 Jul 2023 16:06:00 +0200 Subject: [PATCH 01/11] ENH: Implement arrow string option for various I/O methods --- pandas/_config/__init__.py | 5 +++++ pandas/io/_util.py | 8 ++++++++ pandas/io/feather_format.py | 8 +++++++- pandas/io/orc.py | 9 ++++++++- pandas/io/parquet.py | 5 +++++ pandas/io/parsers/arrow_parser_wrapper.py | 9 ++++++++- pandas/io/pytables.py | 17 ++++++++++++++++- 7 files changed, 57 insertions(+), 4 deletions(-) diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index c37ad563df8ef..9f9cf0fd2a5a8 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -38,3 +38,8 @@ def using_copy_on_write() -> bool: def using_nullable_dtypes() -> bool: _mode_options = _global_config["mode"] return _mode_options["nullable_dtypes"] + + +def using_pyarrow_string_dtype() -> bool: + _mode_options = _global_config["future"] + return _mode_options["pyarrow_strings"] diff --git a/pandas/io/_util.py b/pandas/io/_util.py index d2a001f0cf925..27316b3ab0af0 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import Callable + from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -21,3 +23,9 @@ def _arrow_dtype_mapping() -> dict: pa.float32(): pd.Float32Dtype(), pa.float64(): pd.Float64Dtype(), } + + +def arrow_string_types_mapper() -> Callable: + pa = import_optional_dependency("pyarrow") + + return {pa.string(): pd.ArrowDtype(pa.string())}.get diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 77b2b12fda77f..a8f56565b676b 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,6 +6,8 @@ Any, ) +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc @@ -15,6 +17,7 @@ from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_string_types_mapper from pandas.io.common import get_handle if TYPE_CHECKING: @@ -119,7 +122,7 @@ def read_feather( with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: - if dtype_backend is lib.no_default: + if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): return feather.read_feather( handles.handle, columns=columns, use_threads=bool(use_threads) ) @@ -135,3 +138,6 @@ def read_feather( elif dtype_backend == "pyarrow": return pa_table.to_pandas(types_mapper=pd.ArrowDtype) + + elif using_pyarrow_string_dtype(): + return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 75f7f9e56439e..774f9d797b011 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,6 +9,8 @@ Literal, ) +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.compat import pa_version_under8p0 from pandas.compat._optional import import_optional_dependency @@ -24,6 +26,7 @@ import pandas as pd from pandas.core.indexes.api import default_index +from pandas.io._util import arrow_string_types_mapper from pandas.io.common import ( get_handle, is_fsspec_url, @@ -132,7 +135,11 @@ def read_orc( df = pa_table.to_pandas(types_mapper=mapping.get) return df else: - return pa_table.to_pandas() + if using_pyarrow_string_dtype(): + types_mapper = arrow_string_types_mapper() + else: + types_mapper = None + return pa_table.to_pandas(types_mapper=types_mapper) def to_orc( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 61112542fb9d8..39b43f48ac343 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -11,6 +11,8 @@ import warnings from warnings import catch_warnings +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError @@ -25,6 +27,7 @@ ) from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_string_types_mapper from pandas.io.common import ( IOHandles, get_handle, @@ -244,6 +247,8 @@ def read( to_pandas_kwargs["types_mapper"] = mapping.get elif dtype_backend == "pyarrow": to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa: E501 + elif using_pyarrow_string_dtype(): + to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() manager = get_option("mode.data_manager") if manager == "array": diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 09ea6b8b7902b..71bfb00a95b50 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -2,6 +2,8 @@ from typing import TYPE_CHECKING +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -10,7 +12,10 @@ import pandas as pd from pandas import DataFrame -from pandas.io._util import _arrow_dtype_mapping +from pandas.io._util import ( + _arrow_dtype_mapping, + arrow_string_types_mapper, +) from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: @@ -215,6 +220,8 @@ def read(self) -> DataFrame: dtype_mapping = _arrow_dtype_mapping() dtype_mapping[pa.null()] = pd.Int64Dtype() frame = table.to_pandas(types_mapper=dtype_mapping.get) + elif using_pyarrow_string_dtype(): + frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e50a1f6e56d51..9d905311d32ed 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -30,6 +30,7 @@ from pandas._config import ( config, get_option, + using_pyarrow_string_dtype, ) from pandas._libs import ( @@ -66,6 +67,7 @@ ) from pandas.core.dtypes.missing import array_equivalent +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -3219,7 +3221,12 @@ def read( self.validate_read(columns, where) index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) - return Series(values, index=index, name=self.name, copy=False) + result = Series(values, index=index, name=self.name, copy=False) + if result.dtype.kind == "O" and using_pyarrow_string_dtype(): + import pyarrow as pa + + result = result.astype(pd.ArrowDtype(pa.string())) + return result # error: Signature of "write" incompatible with supertype "Fixed" def write(self, obj, **kwargs) -> None: # type: ignore[override] @@ -3287,6 +3294,10 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) + if values.dtype.kind == "O" and using_pyarrow_string_dtype(): + import pyarrow as pa + + df = df.astype(pd.ArrowDtype(pa.string())) dfs.append(df) if len(dfs) > 0: @@ -4669,6 +4680,10 @@ def read( # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) + if values.dtype.kind == "O" and using_pyarrow_string_dtype(): + import pyarrow as pa + + df = df.astype(pd.ArrowDtype(pa.string())) frames.append(df) if len(frames) == 1: From ebe0bd51c2939f10f10164eb169276537fa15c51 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 31 Jul 2023 20:02:14 -0700 Subject: [PATCH 02/11] ENH: allow opt-in to inferring pyarrow strings --- pandas/_libs/lib.pyx | 38 ++++++++++++++++++++++++++++++++++++++ pandas/core/config_init.py | 11 +++++++++++ pandas/core/dtypes/cast.py | 19 +++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c3fbd3ee4853e..183a111249710 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1299,6 +1299,7 @@ cdef class Seen: bint datetimetz_ # seen_datetimetz bint period_ # seen_period bint interval_ # seen_interval + bint str_ # seen_str def __cinit__(self, bint coerce_numeric=False): """ @@ -1325,6 +1326,7 @@ cdef class Seen: self.datetimetz_ = False self.period_ = False self.interval_ = False + self.str_ = False self.coerce_numeric = coerce_numeric cdef bint check_uint64_conflict(self) except -1: @@ -2615,6 +2617,13 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif isinstance(val, str): + if convert_non_numeric: + seen.str_ = True + break + else: + seen.object_ = True + break else: seen.object_ = True break @@ -2669,6 +2678,35 @@ def maybe_convert_objects(ndarray[object] objects, return pi._data seen.object_ = True + elif seen.str_: + if is_string_array(objects): + from pandas._config import get_option + opt = get_option("future.infer_string") + if opt is True: + import pyarrow as pa + + from pandas.core.dtypes.dtypes import ArrowDtype + + obj = pa.array(objects) + dtype = ArrowDtype(obj.type) + return dtype.construct_array_type()(obj) + # elif opt is False: + # # explicitly set to keep the old behavior and avoid the warning + # pass + # else: + # from pandas.util._exceptions import find_stack_level + # warnings.warn( + # "Pandas type inference with a sequence of `str` " + # "objects is deprecated. In a future version, this will give " + # "string[pyarrow] dtype, which will require pyarrow to be " + # "installed. To opt in to the new behavior immediately set " + # "`pd.set_option('future.infer_string', True)`. To keep the " + # "old behavior pass `dtype=object`.", + # FutureWarning, + # stacklevel=find_stack_level(), + # ) + + seen.object_ = True elif seen.interval_: if is_interval_array(objects): from pandas import IntervalIndex diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 3f662073f0357..4c02d90827760 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -889,3 +889,14 @@ def register_converter_cb(key) -> None: styler_environment, validator=is_instance_factory([type(None), str]), ) + + +with cf.config_prefix("future"): + cf.register_option( + "future.infer_string", + None, + "Whether to infer sequence of str objects as pyarrow string " + "dtype, which will be the default in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False, None]), + ) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 09105bf49c050..d33d884832c60 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,6 +18,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._libs.missing import ( NA, @@ -796,6 +798,23 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! dtype = _dtype_obj + opt = get_option("future.infer_string") + if opt is True: + import pyarrow as pa + + pa_dtype = pa.string() + dtype = ArrowDtype(pa_dtype) + # elif opt is None: + # warnings.warn( + # "Pandas type inference with a `str` " + # "object is deprecated. In a future version, this will give " + # "string[pyarrow] dtype, which will require pyarrow to be " + # "installed. To opt in to the new behavior immediately set " + # "`pd.set_option('future.infer_string', True)`. To keep the " + # "old behavior pass `dtype=object`.", + # FutureWarning, + # stacklevel=find_stack_level(), + # ) elif isinstance(val, (np.datetime64, dt.datetime)): try: From 0889028e1b20e087aefedab1560e064e814f01f7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 5 Aug 2023 19:30:54 +0200 Subject: [PATCH 03/11] Remove comments and add tests --- pandas/_libs/lib.pyx | 15 -------- pandas/core/config_init.py | 6 ++-- pandas/core/dtypes/cast.py | 11 ------ pandas/tests/frame/test_constructors.py | 35 +++++++++++++++++++ .../indexes/base_class/test_constructors.py | 15 ++++++++ .../io/parser/dtypes/test_dtypes_basic.py | 21 +++++++++++ pandas/tests/io/test_sql.py | 17 +++++++++ pandas/tests/series/test_constructors.py | 14 ++++++++ 8 files changed, 105 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 183a111249710..2bd99724b1cad 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2690,21 +2690,6 @@ def maybe_convert_objects(ndarray[object] objects, obj = pa.array(objects) dtype = ArrowDtype(obj.type) return dtype.construct_array_type()(obj) - # elif opt is False: - # # explicitly set to keep the old behavior and avoid the warning - # pass - # else: - # from pandas.util._exceptions import find_stack_level - # warnings.warn( - # "Pandas type inference with a sequence of `str` " - # "objects is deprecated. In a future version, this will give " - # "string[pyarrow] dtype, which will require pyarrow to be " - # "installed. To opt in to the new behavior immediately set " - # "`pd.set_option('future.infer_string', True)`. To keep the " - # "old behavior pass `dtype=object`.", - # FutureWarning, - # stacklevel=find_stack_level(), - # ) seen.object_ = True elif seen.interval_: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 4c02d90827760..27e9bf8958ab0 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -893,10 +893,10 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( - "future.infer_string", - None, + "infer_string", + False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", - validator=is_one_of_factory([True, False, None]), + validator=is_one_of_factory([True, False]), ) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d33d884832c60..9d2530ddc4e12 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -804,17 +804,6 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: pa_dtype = pa.string() dtype = ArrowDtype(pa_dtype) - # elif opt is None: - # warnings.warn( - # "Pandas type inference with a `str` " - # "object is deprecated. In a future version, this will give " - # "string[pyarrow] dtype, which will require pyarrow to be " - # "installed. To opt in to the new behavior immediately set " - # "`pd.set_option('future.infer_string', True)`. To keep the " - # "old behavior pass `dtype=object`.", - # FutureWarning, - # stacklevel=find_stack_level(), - # ) elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c87f04efffcf4..b82dc98cd0210 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2670,6 +2670,41 @@ def test_construct_with_strings_and_none(self): expected = DataFrame({"a": ["1", "2", None]}, dtype="str") tm.assert_frame_equal(df, expected) + def test_frame_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = DataFrame( + {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", "b"]}) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", "b"]}, + dtype=dtype, + columns=Index(["a"], dtype=dtype), + index=Index(["x", "y"], dtype=dtype), + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", "b"]}, index=["x", "y"]) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", 1]}, dtype="object", columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", 1]}) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", "b"]}, dtype="object", columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", "b"]}, dtype="object") + tm.assert_frame_equal(df, expected) + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index cf8b7214f3b91..638124ac20e06 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( Index, MultiIndex, @@ -42,3 +43,17 @@ def test_construct_empty_tuples(self, tuple_list): expected = MultiIndex.from_tuples(tuple_list) tm.assert_index_equal(result, expected) + + def test_index_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = Index(["a", "b"], dtype=dtype) + with pd.option_context("future.infer_string", True): + ser = Index(["a", "b"]) + tm.assert_index_equal(ser, expected) + + expected = Index(["a", 1], dtype="object") + with pd.option_context("future.infer_string", True): + ser = Index(["a", 1]) + tm.assert_index_equal(ser, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 915cc9a9a1f95..1a613c91880ea 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -538,3 +538,24 @@ def test_ea_int_avoid_overflow(all_parsers): } ) tm.assert_frame_equal(result, expected) + + +def test_string_inference(all_parsers): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + + data = """a,b +x,1 +y,2""" + parser = all_parsers + if parser.engine == "pyarrow": + pytest.skip("TODO: Follow up") + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data)) + + expected = DataFrame( + {"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]}, + columns=pd.Index(["a", "b"], dtype=dtype), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6800e55396d7b..63ca91cc89ede 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2920,6 +2920,23 @@ def test_read_sql_dtype_backend_table(self, string_storage, func): # GH#50048 Not supported for sqlite pass + def test_read_sql_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + table = "test" + df = DataFrame({"a": ["x", "y"]}) + df.to_sql(table, self.conn, index=False, if_exists="replace") + + with pd.option_context("future.infer_string", True): + result = read_sql_table(table, self.conn) + + dtype = pd.ArrowDtype(pa.string()) + expected = DataFrame( + {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.db class TestMySQLAlchemy(_TestSQLAlchemy): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 9540d7a014409..e67196edcd444 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2070,6 +2070,20 @@ def test_series_from_index_dtype_equal_does_not_copy(self): ser.iloc[0] = 100 tm.assert_index_equal(idx, expected) + def test_series_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = Series(["a", "b"], dtype=dtype) + with pd.option_context("future.infer_string", True): + ser = Series(["a", "b"]) + tm.assert_series_equal(ser, expected) + + expected = Series(["a", 1], dtype="object") + with pd.option_context("future.infer_string", True): + ser = Series(["a", 1]) + tm.assert_series_equal(ser, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 35a8240d6ba58d1ba9877b411cb43f1dc38f72f9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 5 Aug 2023 19:45:44 +0200 Subject: [PATCH 04/11] Add string option to arrow parsers --- pandas/_config/__init__.py | 2 +- pandas/io/orc.py | 1 + .../tests/io/parser/dtypes/test_dtypes_basic.py | 2 -- pandas/tests/io/test_feather.py | 14 ++++++++++++++ pandas/tests/io/test_orc.py | 17 +++++++++++++++++ pandas/tests/io/test_parquet.py | 16 ++++++++++++++++ 6 files changed, 49 insertions(+), 3 deletions(-) diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 9f9cf0fd2a5a8..daeb135f5bcf7 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -42,4 +42,4 @@ def using_nullable_dtypes() -> bool: def using_pyarrow_string_dtype() -> bool: _mode_options = _global_config["future"] - return _mode_options["pyarrow_strings"] + return _mode_options["infer_string"] diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 774f9d797b011..d612f2eaadd02 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -135,6 +135,7 @@ def read_orc( df = pa_table.to_pandas(types_mapper=mapping.get) return df else: + print("Ts") if using_pyarrow_string_dtype(): types_mapper = arrow_string_types_mapper() else: diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 9f6575ddaa95c..ed225c90a4e02 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -549,8 +549,6 @@ def test_string_inference(all_parsers): x,1 y,2""" parser = all_parsers - if parser.engine == "pyarrow": - pytest.skip("TODO: Follow up") with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data)) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 9de097fe8c0e6..a0fee6751bf53 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -219,3 +219,17 @@ def test_invalid_dtype_backend(self): df.to_feather(path) with pytest.raises(ValueError, match=msg): read_feather(path, dtype_backend="numpy") + + def test_string_inference(self, tmp_path): + # GH#54431 + import pyarrow as pa + + path = tmp_path / "test_string_inference.p" + df = pd.DataFrame(data={"a": ["x", "y"]}) + df.to_feather(path) + with pd.option_context("future.infer_string", True): + result = read_feather(path) + expected = pd.DataFrame( + data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string()) + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 8483eb0d5c159..047d9bfe90a88 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -415,3 +415,20 @@ def test_invalid_dtype_backend(): df.to_orc(path) with pytest.raises(ValueError, match=msg): read_orc(path, dtype_backend="numpy") + + +def test_string_inference(tmp_path): + # GH#54431 + import pyarrow as pa + + path = tmp_path / "test_string_inference.p" + df = pd.DataFrame(data={"a": ["x", "y"]}) + df.to_orc(path) + with pd.option_context("future.infer_string", True): + result = read_orc(path) + expected = pd.DataFrame( + data={"a": ["x", "y"]}, + dtype=pd.ArrowDtype(pa.string()), + columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 501e471695a8a..8b18413661703 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1106,6 +1106,22 @@ def test_df_attrs_persistence(self, tmp_path, pa): new_df = read_parquet(path, engine=pa) assert new_df.attrs == df.attrs + def test_string_inference(self, tmp_path, pa): + # GH#54431 + import pyarrow as pa + + path = tmp_path / "test_string_inference.p" + df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) + df.to_parquet(path, engine="pyarrow") + with pd.option_context("future.infer_string", True): + result = read_parquet(path, engine="pyarrow") + expected = pd.DataFrame( + data={"a": ["x", "y"]}, + dtype=pd.ArrowDtype(pa.string()), + index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())), + ) + tm.assert_frame_equal(result, expected) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): From b677a89bedcd360a63f3c2d34205d3decfa726e8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 5 Aug 2023 23:38:44 +0200 Subject: [PATCH 05/11] Update --- pandas/io/feather_format.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index a8f56565b676b..c463f6e4d2759 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -141,3 +141,5 @@ def read_feather( elif using_pyarrow_string_dtype(): return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) + else: + raise NotImplementedError From 11b267eac481ddf54b61946ff72463edc681baf0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 6 Aug 2023 16:44:07 +0200 Subject: [PATCH 06/11] Update --- pandas/tests/io/test_orc.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 047d9bfe90a88..c2d791ba24c87 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -419,8 +419,6 @@ def test_invalid_dtype_backend(): def test_string_inference(tmp_path): # GH#54431 - import pyarrow as pa - path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}) df.to_orc(path) From 8072a860e9577235bb807a937e42f9ed0bb8931e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 9 Aug 2023 21:44:24 +0200 Subject: [PATCH 07/11] Adjust csv --- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index ed225c90a4e02..1c0f0939029ff 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -547,13 +547,14 @@ def test_string_inference(all_parsers): data = """a,b x,1 -y,2""" +y,2 +,3""" parser = all_parsers with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data)) expected = DataFrame( - {"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]}, + {"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]}, columns=pd.Index(["a", "b"], dtype=dtype), ) tm.assert_frame_equal(result, expected) From bed3124f0794b6cd42a23e2d716cfdd7ef0cc158 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 9 Aug 2023 21:48:06 +0200 Subject: [PATCH 08/11] Update --- pandas/_libs/lib.pyx | 6 +++--- pandas/core/dtypes/cast.py | 5 ++--- pandas/io/orc.py | 1 - 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 924cf360a35cc..55819ebd1f15e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -38,6 +38,8 @@ from cython cimport ( floating, ) +from pandas._config import using_pyarrow_string_dtype + from pandas._libs.missing import check_na_tuples_nonequal import_datetime() @@ -2680,9 +2682,7 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.str_: if is_string_array(objects, skipna=True): - from pandas._config import get_option - opt = get_option("future.infer_string") - if opt is True: + if using_pyarrow_string_dtype(): import pyarrow as pa from pandas.core.dtypes.dtypes import ArrowDtype diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9d2530ddc4e12..9f7c0b3e36032 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,7 +18,7 @@ import numpy as np -from pandas._config import get_option +from pandas._config import using_pyarrow_string_dtype from pandas._libs import lib from pandas._libs.missing import ( @@ -798,8 +798,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! dtype = _dtype_obj - opt = get_option("future.infer_string") - if opt is True: + if using_pyarrow_string_dtype(): import pyarrow as pa pa_dtype = pa.string() diff --git a/pandas/io/orc.py b/pandas/io/orc.py index d612f2eaadd02..774f9d797b011 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -135,7 +135,6 @@ def read_orc( df = pa_table.to_pandas(types_mapper=mapping.get) return df else: - print("Ts") if using_pyarrow_string_dtype(): types_mapper = arrow_string_types_mapper() else: From efb6f4a2bcb65ce2d40fa892b5540bba1aae67ec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 9 Aug 2023 22:10:43 +0200 Subject: [PATCH 09/11] Update --- pandas/_libs/lib.pyi | 3 +++ pandas/_libs/lib.pyx | 5 +++-- pandas/core/dtypes/cast.py | 1 + pandas/io/pytables.py | 7 ++++--- pandas/tests/io/pytables/test_read.py | 12 ++++++++++++ 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 7e92032a73325..ec0095a7087c8 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -84,6 +84,7 @@ def maybe_convert_objects( convert_non_numeric: Literal[False] = ..., convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., + convert_string: Literal[False] = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @overload def maybe_convert_objects( @@ -95,6 +96,7 @@ def maybe_convert_objects( convert_non_numeric: bool = ..., convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., + convert_string: bool = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( @@ -106,6 +108,7 @@ def maybe_convert_objects( convert_non_numeric: bool = ..., convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., + convert_string: bool = ..., ) -> ArrayLike: ... @overload def maybe_convert_numeric( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 55819ebd1f15e..9a044a23a8cbc 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2443,7 +2443,8 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, - object dtype_if_all_nat=None) -> "ArrayLike": + object dtype_if_all_nat=None, + bint convert_string=True) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2681,7 +2682,7 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if is_string_array(objects, skipna=True): + if convert_string and is_string_array(objects, skipna=True): if using_pyarrow_string_dtype(): import pyarrow as pa diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9f7c0b3e36032..db1c948d607dd 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1185,6 +1185,7 @@ def maybe_infer_to_datetimelike( convert_numeric=False, convert_non_numeric=True, dtype_if_all_nat=np.dtype("M8[ns]"), + convert_string=False, ) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index afbba5dc74c28..6b2734140de0e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -37,6 +37,7 @@ lib, writers as libwriters, ) +from pandas._libs.lib import is_string_array from pandas._libs.tslibs import timezones from pandas.compat._optional import import_optional_dependency from pandas.compat.pickle_compat import patch_pickle @@ -3222,7 +3223,7 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if result.dtype.kind == "O" and using_pyarrow_string_dtype(): + if using_pyarrow_string_dtype() and is_string_array(result, skipna=True): import pyarrow as pa result = result.astype(pd.ArrowDtype(pa.string())) @@ -3294,7 +3295,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if values.dtype.kind == "O" and using_pyarrow_string_dtype(): + if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): import pyarrow as pa df = df.astype(pd.ArrowDtype(pa.string())) @@ -4680,7 +4681,7 @@ def read( # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if values.dtype.kind == "O" and using_pyarrow_string_dtype(): + if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): import pyarrow as pa df = df.astype(pd.ArrowDtype(pa.string())) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 89b234b24522c..eb9deae55c3a9 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -388,3 +388,15 @@ def test_read_py2_hdf_file_in_py3(datapath): ) as store: result = store["p"] tm.assert_frame_equal(result, expected) + + +def test_read_infer_string(tmp_path, setup_path): + # GH#54431 + pa = pytest.importorskip("pyarrow") + df = DataFrame({"a": ["a", "b", None]}) + path = tmp_path / setup_path + df.to_hdf(path, key="data", format="table") + with pd.option_context("future.infer_string", True): + result = read_hdf(path, key="data", mode="r") + expected = DataFrame({"a": ["a", "b", None]}, dtype=pd.ArrowDtype(pa.string())) + tm.assert_frame_equal(result, expected) From 0ac28a18179a0d2987fd20d7076e086c376e746b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 9 Aug 2023 22:53:18 +0200 Subject: [PATCH 10/11] Add test --- pandas/_libs/lib.pyi | 3 --- pandas/_libs/lib.pyx | 5 ++--- pandas/core/dtypes/cast.py | 1 - pandas/io/pytables.py | 3 ++- pandas/tests/io/pytables/test_read.py | 6 +++++- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index ec0095a7087c8..7e92032a73325 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -84,7 +84,6 @@ def maybe_convert_objects( convert_non_numeric: Literal[False] = ..., convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., - convert_string: Literal[False] = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @overload def maybe_convert_objects( @@ -96,7 +95,6 @@ def maybe_convert_objects( convert_non_numeric: bool = ..., convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., - convert_string: bool = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( @@ -108,7 +106,6 @@ def maybe_convert_objects( convert_non_numeric: bool = ..., convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., - convert_string: bool = ..., ) -> ArrayLike: ... @overload def maybe_convert_numeric( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 9a044a23a8cbc..55819ebd1f15e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2443,8 +2443,7 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, - object dtype_if_all_nat=None, - bint convert_string=True) -> "ArrayLike": + object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2682,7 +2681,7 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if convert_string and is_string_array(objects, skipna=True): + if is_string_array(objects, skipna=True): if using_pyarrow_string_dtype(): import pyarrow as pa diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index db1c948d607dd..9f7c0b3e36032 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1185,7 +1185,6 @@ def maybe_infer_to_datetimelike( convert_numeric=False, convert_non_numeric=True, dtype_if_all_nat=np.dtype("M8[ns]"), - convert_string=False, ) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6b2734140de0e..3c27b186dffe0 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4680,7 +4680,8 @@ def read( else: # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) - assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) + if not using_pyarrow_string_dtype(): + assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): import pyarrow as pa diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index eb9deae55c3a9..425828cb881a7 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -398,5 +398,9 @@ def test_read_infer_string(tmp_path, setup_path): df.to_hdf(path, key="data", format="table") with pd.option_context("future.infer_string", True): result = read_hdf(path, key="data", mode="r") - expected = DataFrame({"a": ["a", "b", None]}, dtype=pd.ArrowDtype(pa.string())) + expected = DataFrame( + {"a": ["a", "b", None]}, + dtype=pd.ArrowDtype(pa.string()), + columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())), + ) tm.assert_frame_equal(result, expected) From ff38a2908c429c7aca767857776eedd2e9daf42d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 10 Aug 2023 09:47:29 +0200 Subject: [PATCH 11/11] Fix mypy --- pandas/io/pytables.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 3c27b186dffe0..f26411f65d91f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3223,7 +3223,7 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_pyarrow_string_dtype() and is_string_array(result, skipna=True): + if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): import pyarrow as pa result = result.astype(pd.ArrowDtype(pa.string())) @@ -4680,9 +4680,12 @@ def read( else: # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) - if not using_pyarrow_string_dtype(): + if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + if using_pyarrow_string_dtype() and is_string_array( + values, # type: ignore[arg-type] + skipna=True, + ): import pyarrow as pa df = df.astype(pd.ArrowDtype(pa.string()))