diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index c37ad563df8ef..daeb135f5bcf7 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -38,3 +38,8 @@ def using_copy_on_write() -> bool: def using_nullable_dtypes() -> bool: _mode_options = _global_config["mode"] return _mode_options["nullable_dtypes"] + + +def using_pyarrow_string_dtype() -> bool: + _mode_options = _global_config["future"] + return _mode_options["infer_string"] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 924cf360a35cc..55819ebd1f15e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -38,6 +38,8 @@ from cython cimport ( floating, ) +from pandas._config import using_pyarrow_string_dtype + from pandas._libs.missing import check_na_tuples_nonequal import_datetime() @@ -2680,9 +2682,7 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.str_: if is_string_array(objects, skipna=True): - from pandas._config import get_option - opt = get_option("future.infer_string") - if opt is True: + if using_pyarrow_string_dtype(): import pyarrow as pa from pandas.core.dtypes.dtypes import ArrowDtype diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9d2530ddc4e12..9f7c0b3e36032 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,7 +18,7 @@ import numpy as np -from pandas._config import get_option +from pandas._config import using_pyarrow_string_dtype from pandas._libs import lib from pandas._libs.missing import ( @@ -798,8 +798,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! dtype = _dtype_obj - opt = get_option("future.infer_string") - if opt is True: + if using_pyarrow_string_dtype(): import pyarrow as pa pa_dtype = pa.string() diff --git a/pandas/io/_util.py b/pandas/io/_util.py index d2a001f0cf925..27316b3ab0af0 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import Callable + from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -21,3 +23,9 @@ def _arrow_dtype_mapping() -> dict: pa.float32(): pd.Float32Dtype(), pa.float64(): pd.Float64Dtype(), } + + +def arrow_string_types_mapper() -> Callable: + pa = import_optional_dependency("pyarrow") + + return {pa.string(): pd.ArrowDtype(pa.string())}.get diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 77b2b12fda77f..c463f6e4d2759 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,6 +6,8 @@ Any, ) +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc @@ -15,6 +17,7 @@ from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_string_types_mapper from pandas.io.common import get_handle if TYPE_CHECKING: @@ -119,7 +122,7 @@ def read_feather( with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: - if dtype_backend is lib.no_default: + if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): return feather.read_feather( handles.handle, columns=columns, use_threads=bool(use_threads) ) @@ -135,3 +138,8 @@ def read_feather( elif dtype_backend == "pyarrow": return pa_table.to_pandas(types_mapper=pd.ArrowDtype) + + elif using_pyarrow_string_dtype(): + return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) + else: + raise NotImplementedError diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 75f7f9e56439e..774f9d797b011 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,6 +9,8 @@ Literal, ) +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.compat import pa_version_under8p0 from pandas.compat._optional import import_optional_dependency @@ -24,6 +26,7 @@ import pandas as pd from pandas.core.indexes.api import default_index +from pandas.io._util import arrow_string_types_mapper from pandas.io.common import ( get_handle, is_fsspec_url, @@ -132,7 +135,11 @@ def read_orc( df = pa_table.to_pandas(types_mapper=mapping.get) return df else: - return pa_table.to_pandas() + if using_pyarrow_string_dtype(): + types_mapper = arrow_string_types_mapper() + else: + types_mapper = None + return pa_table.to_pandas(types_mapper=types_mapper) def to_orc( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index aaf7710ac0986..91987e6531261 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -12,6 +12,8 @@ import warnings from warnings import catch_warnings +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError @@ -26,6 +28,7 @@ ) from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_string_types_mapper from pandas.io.common import ( IOHandles, get_handle, @@ -252,6 +255,8 @@ def read( to_pandas_kwargs["types_mapper"] = mapping.get elif dtype_backend == "pyarrow": to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa: E501 + elif using_pyarrow_string_dtype(): + to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() manager = get_option("mode.data_manager") if manager == "array": diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 09ea6b8b7902b..71bfb00a95b50 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -2,6 +2,8 @@ from typing import TYPE_CHECKING +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -10,7 +12,10 @@ import pandas as pd from pandas import DataFrame -from pandas.io._util import _arrow_dtype_mapping +from pandas.io._util import ( + _arrow_dtype_mapping, + arrow_string_types_mapper, +) from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: @@ -215,6 +220,8 @@ def read(self) -> DataFrame: dtype_mapping = _arrow_dtype_mapping() dtype_mapping[pa.null()] = pd.Int64Dtype() frame = table.to_pandas(types_mapper=dtype_mapping.get) + elif using_pyarrow_string_dtype(): + frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e7832006567d1..f26411f65d91f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -30,12 +30,14 @@ from pandas._config import ( config, get_option, + using_pyarrow_string_dtype, ) from pandas._libs import ( lib, writers as libwriters, ) +from pandas._libs.lib import is_string_array from pandas._libs.tslibs import timezones from pandas.compat._optional import import_optional_dependency from pandas.compat.pickle_compat import patch_pickle @@ -66,6 +68,7 @@ ) from pandas.core.dtypes.missing import array_equivalent +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -3219,7 +3222,12 @@ def read( self.validate_read(columns, where) index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) - return Series(values, index=index, name=self.name, copy=False) + result = Series(values, index=index, name=self.name, copy=False) + if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + import pyarrow as pa + + result = result.astype(pd.ArrowDtype(pa.string())) + return result # error: Signature of "write" incompatible with supertype "Fixed" def write(self, obj, **kwargs) -> None: # type: ignore[override] @@ -3287,6 +3295,10 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) + if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + import pyarrow as pa + + df = df.astype(pd.ArrowDtype(pa.string())) dfs.append(df) if len(dfs) > 0: @@ -4668,7 +4680,15 @@ def read( else: # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) - assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) + if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"): + assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) + if using_pyarrow_string_dtype() and is_string_array( + values, # type: ignore[arg-type] + skipna=True, + ): + import pyarrow as pa + + df = df.astype(pd.ArrowDtype(pa.string())) frames.append(df) if len(frames) == 1: diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 9f6575ddaa95c..1c0f0939029ff 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -547,15 +547,14 @@ def test_string_inference(all_parsers): data = """a,b x,1 -y,2""" +y,2 +,3""" parser = all_parsers - if parser.engine == "pyarrow": - pytest.skip("TODO: Follow up") with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data)) expected = DataFrame( - {"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]}, + {"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]}, columns=pd.Index(["a", "b"], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 89b234b24522c..425828cb881a7 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -388,3 +388,19 @@ def test_read_py2_hdf_file_in_py3(datapath): ) as store: result = store["p"] tm.assert_frame_equal(result, expected) + + +def test_read_infer_string(tmp_path, setup_path): + # GH#54431 + pa = pytest.importorskip("pyarrow") + df = DataFrame({"a": ["a", "b", None]}) + path = tmp_path / setup_path + df.to_hdf(path, key="data", format="table") + with pd.option_context("future.infer_string", True): + result = read_hdf(path, key="data", mode="r") + expected = DataFrame( + {"a": ["a", "b", None]}, + dtype=pd.ArrowDtype(pa.string()), + columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 9de097fe8c0e6..a0fee6751bf53 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -219,3 +219,17 @@ def test_invalid_dtype_backend(self): df.to_feather(path) with pytest.raises(ValueError, match=msg): read_feather(path, dtype_backend="numpy") + + def test_string_inference(self, tmp_path): + # GH#54431 + import pyarrow as pa + + path = tmp_path / "test_string_inference.p" + df = pd.DataFrame(data={"a": ["x", "y"]}) + df.to_feather(path) + with pd.option_context("future.infer_string", True): + result = read_feather(path) + expected = pd.DataFrame( + data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string()) + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 8483eb0d5c159..c2d791ba24c87 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -415,3 +415,18 @@ def test_invalid_dtype_backend(): df.to_orc(path) with pytest.raises(ValueError, match=msg): read_orc(path, dtype_backend="numpy") + + +def test_string_inference(tmp_path): + # GH#54431 + path = tmp_path / "test_string_inference.p" + df = pd.DataFrame(data={"a": ["x", "y"]}) + df.to_orc(path) + with pd.option_context("future.infer_string", True): + result = read_orc(path) + expected = pd.DataFrame( + data={"a": ["x", "y"]}, + dtype=pd.ArrowDtype(pa.string()), + columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f3b1ac8062f19..fcc1c218a149d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1103,6 +1103,22 @@ def test_df_attrs_persistence(self, tmp_path, pa): new_df = read_parquet(path, engine=pa) assert new_df.attrs == df.attrs + def test_string_inference(self, tmp_path, pa): + # GH#54431 + import pyarrow as pa + + path = tmp_path / "test_string_inference.p" + df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) + df.to_parquet(path, engine="pyarrow") + with pd.option_context("future.infer_string", True): + result = read_parquet(path, engine="pyarrow") + expected = pd.DataFrame( + data={"a": ["x", "y"]}, + dtype=pd.ArrowDtype(pa.string()), + index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())), + ) + tm.assert_frame_equal(result, expected) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full):