ENH: Implement arrow string option for various I/O methods (pandas-dev#54431)

phofl · jbrockmendel · mroeschke · commit d746349e105b · 2023-08-18T10:36:27.000-07:00
* ENH: Implement arrow string option for various I/O methods

* ENH: allow opt-in to inferring pyarrow strings

* Remove comments and add tests

* Add string option to arrow parsers

* Update

* Update

* Adjust csv

* Update

* Update

* Add test

* Fix mypy

---------

Co-authored-by: Brock &lt;jbrockmendel@gmail.com&gt;
diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py
@@ -38,3 +38,8 @@ def using_copy_on_write() -> bool:
 def using_nullable_dtypes() -> bool:
     _mode_options = _global_config["mode"]
     return _mode_options["nullable_dtypes"]
+
+
+def using_pyarrow_string_dtype() -> bool:
+    _mode_options = _global_config["future"]
+    return _mode_options["infer_string"]
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -38,6 +38,8 @@ from cython cimport (
     floating,
 )
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas._libs.missing import check_na_tuples_nonequal
 
 import_datetime()
@@ -2679,9 +2681,7 @@ def maybe_convert_objects(ndarray[object] objects,
 
     elif seen.str_:
         if is_string_array(objects, skipna=True):
-            from pandas._config import get_option
-            opt = get_option("future.infer_string")
-            if opt is True:
+            if using_pyarrow_string_dtype():
                 import pyarrow as pa
 
                 from pandas.core.dtypes.dtypes import ArrowDtype
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -18,7 +18,7 @@
 
 import numpy as np
 
-from pandas._config import get_option
+from pandas._config import using_pyarrow_string_dtype
 
 from pandas._libs import lib
 from pandas._libs.missing import (
@@ -798,8 +798,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
         # coming out as np.str_!
 
         dtype = _dtype_obj
-        opt = get_option("future.infer_string")
-        if opt is True:
+        if using_pyarrow_string_dtype():
             import pyarrow as pa
 
             pa_dtype = pa.string()
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from typing import Callable
+
 from pandas.compat._optional import import_optional_dependency
 
 import pandas as pd
@@ -21,3 +23,9 @@ def _arrow_dtype_mapping() -> dict:
         pa.float32(): pd.Float32Dtype(),
         pa.float64(): pd.Float64Dtype(),
     }
+
+
+def arrow_string_types_mapper() -> Callable:
+    pa = import_optional_dependency("pyarrow")
+
+    return {pa.string(): pd.ArrowDtype(pa.string())}.get
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
@@ -6,6 +6,8 @@
     Any,
 )
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import doc
@@ -15,6 +17,7 @@
 from pandas.core.api import DataFrame
 from pandas.core.shared_docs import _shared_docs
 
+from pandas.io._util import arrow_string_types_mapper
 from pandas.io.common import get_handle
 
 if TYPE_CHECKING:
@@ -119,7 +122,7 @@ def read_feather(
     with get_handle(
         path, "rb", storage_options=storage_options, is_text=False
     ) as handles:
-        if dtype_backend is lib.no_default:
+        if dtype_backend is lib.no_default and not using_pyarrow_string_dtype():
             return feather.read_feather(
                 handles.handle, columns=columns, use_threads=bool(use_threads)
             )
@@ -135,3 +138,8 @@ def read_feather(
 
         elif dtype_backend == "pyarrow":
             return pa_table.to_pandas(types_mapper=pd.ArrowDtype)
+
+        elif using_pyarrow_string_dtype():
+            return pa_table.to_pandas(types_mapper=arrow_string_types_mapper())
+        else:
+            raise NotImplementedError
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
@@ -9,6 +9,8 @@
     Literal,
 )
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas._libs import lib
 from pandas.compat import pa_version_under8p0
 from pandas.compat._optional import import_optional_dependency
@@ -24,6 +26,7 @@
 import pandas as pd
 from pandas.core.indexes.api import default_index
 
+from pandas.io._util import arrow_string_types_mapper
 from pandas.io.common import (
     get_handle,
     is_fsspec_url,
@@ -132,7 +135,11 @@ def read_orc(
             df = pa_table.to_pandas(types_mapper=mapping.get)
         return df
     else:
-        return pa_table.to_pandas()
+        if using_pyarrow_string_dtype():
+            types_mapper = arrow_string_types_mapper()
+        else:
+            types_mapper = None
+        return pa_table.to_pandas(types_mapper=types_mapper)
 
 
 def to_orc(
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
@@ -12,6 +12,8 @@
 import warnings
 from warnings import catch_warnings
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError
@@ -26,6 +28,7 @@
 )
 from pandas.core.shared_docs import _shared_docs
 
+from pandas.io._util import arrow_string_types_mapper
 from pandas.io.common import (
     IOHandles,
     get_handle,
@@ -252,6 +255,8 @@ def read(
             to_pandas_kwargs["types_mapper"] = mapping.get
         elif dtype_backend == "pyarrow":
             to_pandas_kwargs["types_mapper"] = pd.ArrowDtype  # type: ignore[assignment]  # noqa: E501
+        elif using_pyarrow_string_dtype():
+            to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper()
 
         manager = get_option("mode.data_manager")
         if manager == "array":
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -2,6 +2,8 @@
 
 from typing import TYPE_CHECKING
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
 
@@ -10,7 +12,10 @@
 import pandas as pd
 from pandas import DataFrame
 
-from pandas.io._util import _arrow_dtype_mapping
+from pandas.io._util import (
+    _arrow_dtype_mapping,
+    arrow_string_types_mapper,
+)
 from pandas.io.parsers.base_parser import ParserBase
 
 if TYPE_CHECKING:
@@ -215,6 +220,8 @@ def read(self) -> DataFrame:
             dtype_mapping = _arrow_dtype_mapping()
             dtype_mapping[pa.null()] = pd.Int64Dtype()
             frame = table.to_pandas(types_mapper=dtype_mapping.get)
+        elif using_pyarrow_string_dtype():
+            frame = table.to_pandas(types_mapper=arrow_string_types_mapper())
         else:
             frame = table.to_pandas()
         return self._finalize_pandas_output(frame)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -30,12 +30,14 @@
 from pandas._config import (
     config,
     get_option,
+    using_pyarrow_string_dtype,
 )
 
 from pandas._libs import (
     lib,
     writers as libwriters,
 )
+from pandas._libs.lib import is_string_array
 from pandas._libs.tslibs import timezones
 from pandas.compat._optional import import_optional_dependency
 from pandas.compat.pickle_compat import patch_pickle
@@ -66,6 +68,7 @@
 )
 from pandas.core.dtypes.missing import array_equivalent
 
+import pandas as pd
 from pandas import (
     DataFrame,
     DatetimeIndex,
@@ -3219,7 +3222,12 @@ def read(
         self.validate_read(columns, where)
         index = self.read_index("index", start=start, stop=stop)
         values = self.read_array("values", start=start, stop=stop)
-        return Series(values, index=index, name=self.name, copy=False)
+        result = Series(values, index=index, name=self.name, copy=False)
+        if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
+            import pyarrow as pa
+
+            result = result.astype(pd.ArrowDtype(pa.string()))
+        return result
 
     # error: Signature of "write" incompatible with supertype "Fixed"
     def write(self, obj, **kwargs) -> None:  # type: ignore[override]
@@ -3287,6 +3295,10 @@ def read(
 
             columns = items[items.get_indexer(blk_items)]
             df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
+            if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
+                import pyarrow as pa
+
+                df = df.astype(pd.ArrowDtype(pa.string()))
             dfs.append(df)
 
         if len(dfs) > 0:
@@ -4668,7 +4680,15 @@ def read(
             else:
                 # Categorical
                 df = DataFrame._from_arrays([values], columns=cols_, index=index_)
-            assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
+            if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"):
+                assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
+            if using_pyarrow_string_dtype() and is_string_array(
+                values,  # type: ignore[arg-type]
+                skipna=True,
+            ):
+                import pyarrow as pa
+
+                df = df.astype(pd.ArrowDtype(pa.string()))
             frames.append(df)
 
         if len(frames) == 1:
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -547,15 +547,14 @@ def test_string_inference(all_parsers):
 
     data = """a,b
 x,1
-y,2"""
+y,2
+,3"""
     parser = all_parsers
-    if parser.engine == "pyarrow":
-        pytest.skip("TODO: Follow up")
     with pd.option_context("future.infer_string", True):
         result = parser.read_csv(StringIO(data))
 
     expected = DataFrame(
-        {"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]},
+        {"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]},
         columns=pd.Index(["a", "b"], dtype=dtype),
     )
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
@@ -388,3 +388,19 @@ def test_read_py2_hdf_file_in_py3(datapath):
     ) as store:
         result = store["p"]
         tm.assert_frame_equal(result, expected)
+
+
+def test_read_infer_string(tmp_path, setup_path):
+    # GH#54431
+    pa = pytest.importorskip("pyarrow")
+    df = DataFrame({"a": ["a", "b", None]})
+    path = tmp_path / setup_path
+    df.to_hdf(path, key="data", format="table")
+    with pd.option_context("future.infer_string", True):
+        result = read_hdf(path, key="data", mode="r")
+    expected = DataFrame(
+        {"a": ["a", "b", None]},
+        dtype=pd.ArrowDtype(pa.string()),
+        columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
@@ -219,3 +219,17 @@ def test_invalid_dtype_backend(self):
             df.to_feather(path)
             with pytest.raises(ValueError, match=msg):
                 read_feather(path, dtype_backend="numpy")
+
+    def test_string_inference(self, tmp_path):
+        # GH#54431
+        import pyarrow as pa
+
+        path = tmp_path / "test_string_inference.p"
+        df = pd.DataFrame(data={"a": ["x", "y"]})
+        df.to_feather(path)
+        with pd.option_context("future.infer_string", True):
+            result = read_feather(path)
+        expected = pd.DataFrame(
+            data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string())
+        )
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
@@ -415,3 +415,18 @@ def test_invalid_dtype_backend():
         df.to_orc(path)
         with pytest.raises(ValueError, match=msg):
             read_orc(path, dtype_backend="numpy")
+
+
+def test_string_inference(tmp_path):
+    # GH#54431
+    path = tmp_path / "test_string_inference.p"
+    df = pd.DataFrame(data={"a": ["x", "y"]})
+    df.to_orc(path)
+    with pd.option_context("future.infer_string", True):
+        result = read_orc(path)
+    expected = pd.DataFrame(
+        data={"a": ["x", "y"]},
+        dtype=pd.ArrowDtype(pa.string()),
+        columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -1103,6 +1103,22 @@ def test_df_attrs_persistence(self, tmp_path, pa):
         new_df = read_parquet(path, engine=pa)
         assert new_df.attrs == df.attrs
 
+    def test_string_inference(self, tmp_path, pa):
+        # GH#54431
+        import pyarrow as pa
+
+        path = tmp_path / "test_string_inference.p"
+        df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
+        df.to_parquet(path, engine="pyarrow")
+        with pd.option_context("future.infer_string", True):
+            result = read_parquet(path, engine="pyarrow")
+        expected = pd.DataFrame(
+            data={"a": ["x", "y"]},
+            dtype=pd.ArrowDtype(pa.string()),
+            index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
+        )
+        tm.assert_frame_equal(result, expected)
+
 
 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):