Skip to content

Commit 487c585

Browse files
String dtype: fix pyarrow-based IO + update tests (#59478)
1 parent 328e79d commit 487c585

File tree

6 files changed

+79
-48
lines changed

6 files changed

+79
-48
lines changed

pandas/io/_util.py

+2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ def _arrow_dtype_mapping() -> dict:
2727
pa.string(): pd.StringDtype(),
2828
pa.float32(): pd.Float32Dtype(),
2929
pa.float64(): pd.Float64Dtype(),
30+
pa.string(): pd.StringDtype(),
31+
pa.large_string(): pd.StringDtype(),
3032
}
3133

3234

pandas/tests/io/test_feather.py

+17-12
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,15 @@
55
import numpy as np
66
import pytest
77

8-
from pandas._config import using_string_dtype
9-
108
import pandas as pd
119
import pandas._testing as tm
1210

1311
from pandas.io.feather_format import read_feather, to_feather # isort:skip
1412

15-
pytestmark = [
16-
pytest.mark.filterwarnings(
17-
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
18-
),
19-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
20-
]
13+
pytestmark = pytest.mark.filterwarnings(
14+
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
15+
)
16+
2117

2218
pa = pytest.importorskip("pyarrow")
2319

@@ -150,8 +146,8 @@ def test_path_pathlib(self):
150146
def test_passthrough_keywords(self):
151147
df = pd.DataFrame(
152148
1.1 * np.arange(120).reshape((30, 4)),
153-
columns=pd.Index(list("ABCD"), dtype=object),
154-
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
149+
columns=pd.Index(list("ABCD")),
150+
index=pd.Index([f"i-{i}" for i in range(30)]),
155151
).reset_index()
156152
self.check_round_trip(df, write_kwargs={"version": 1})
157153

@@ -165,7 +161,9 @@ def test_http_path(self, feather_file, httpserver):
165161
res = read_feather(httpserver.url)
166162
tm.assert_frame_equal(expected, res)
167163

168-
def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
164+
def test_read_feather_dtype_backend(
165+
self, string_storage, dtype_backend, using_infer_string
166+
):
169167
# GH#50765
170168
df = pd.DataFrame(
171169
{
@@ -187,7 +185,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
187185

188186
if dtype_backend == "pyarrow":
189187
pa = pytest.importorskip("pyarrow")
190-
string_dtype = pd.ArrowDtype(pa.string())
188+
if using_infer_string:
189+
string_dtype = pd.ArrowDtype(pa.large_string())
190+
else:
191+
string_dtype = pd.ArrowDtype(pa.string())
191192
else:
192193
string_dtype = pd.StringDtype(string_storage)
193194

@@ -214,6 +215,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
214215
}
215216
)
216217

218+
if using_infer_string:
219+
expected.columns = expected.columns.astype(
220+
pd.StringDtype(string_storage, na_value=np.nan)
221+
)
217222
tm.assert_frame_equal(result, expected)
218223

219224
def test_int_columns_and_index(self):

pandas/tests/io/test_fsspec.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def test_excel_options(fsspectest):
176176
assert fsspectest.test[0] == "read"
177177

178178

179-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
179+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
180180
def test_to_parquet_new_file(cleared_fs, df1):
181181
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
182182
pytest.importorskip("fastparquet")
@@ -205,7 +205,7 @@ def test_arrowparquet_options(fsspectest):
205205
assert fsspectest.test[0] == "parquet_read"
206206

207207

208-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
208+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
209209
def test_fastparquet_options(fsspectest):
210210
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
211211
pytest.importorskip("fastparquet")
@@ -263,7 +263,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so):
263263
)
264264

265265

266-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
266+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
267267
@pytest.mark.single_cpu
268268
def test_s3_parquet(s3_public_bucket, s3so, df1):
269269
pytest.importorskip("fastparquet")

pandas/tests/io/test_gcs.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ def test_to_csv_compression_encoding_gcs(
208208
tm.assert_frame_equal(df, read_df)
209209

210210

211-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
211+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
212212
def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
213213
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
214214
pytest.importorskip("fastparquet")

pandas/tests/io/test_orc.py

+14-11
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
import numpy as np
1010
import pytest
1111

12-
from pandas._config import using_string_dtype
13-
1412
import pandas as pd
1513
from pandas import read_orc
1614
import pandas._testing as tm
@@ -20,20 +18,17 @@
2018

2119
import pyarrow as pa
2220

23-
pytestmark = [
24-
pytest.mark.filterwarnings(
25-
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
26-
),
27-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
28-
]
21+
pytestmark = pytest.mark.filterwarnings(
22+
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
23+
)
2924

3025

3126
@pytest.fixture
3227
def dirpath(datapath):
3328
return datapath("io", "data", "orc")
3429

3530

36-
def test_orc_reader_empty(dirpath):
31+
def test_orc_reader_empty(dirpath, using_infer_string):
3732
columns = [
3833
"boolean1",
3934
"byte1",
@@ -54,11 +49,12 @@ def test_orc_reader_empty(dirpath):
5449
"float32",
5550
"float64",
5651
"object",
57-
"object",
52+
"str" if using_infer_string else "object",
5853
]
5954
expected = pd.DataFrame(index=pd.RangeIndex(0))
6055
for colname, dtype in zip(columns, dtypes):
6156
expected[colname] = pd.Series(dtype=dtype)
57+
expected.columns = expected.columns.astype("str")
6258

6359
inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
6460
got = read_orc(inputfile, columns=columns)
@@ -305,7 +301,7 @@ def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
305301
df.to_orc()
306302

307303

308-
def test_orc_dtype_backend_pyarrow():
304+
def test_orc_dtype_backend_pyarrow(using_infer_string):
309305
pytest.importorskip("pyarrow")
310306
df = pd.DataFrame(
311307
{
@@ -338,6 +334,13 @@ def test_orc_dtype_backend_pyarrow():
338334
for col in df.columns
339335
}
340336
)
337+
if using_infer_string:
338+
# ORC does not preserve distinction between string and large string
339+
# -> the default large string comes back as string
340+
string_dtype = pd.ArrowDtype(pa.string())
341+
expected["string"] = expected["string"].astype(string_dtype)
342+
expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype)
343+
expected["string_with_none"] = expected["string_with_none"].astype(string_dtype)
341344

342345
tm.assert_frame_equal(result, expected)
343346

pandas/tests/io/test_parquet.py

+42-21
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@
5252
pytest.mark.filterwarnings(
5353
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
5454
),
55-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
5655
]
5756

5857

@@ -61,10 +60,17 @@
6160
params=[
6261
pytest.param(
6362
"fastparquet",
64-
marks=pytest.mark.skipif(
65-
not _HAVE_FASTPARQUET,
66-
reason="fastparquet is not installed",
67-
),
63+
marks=[
64+
pytest.mark.skipif(
65+
not _HAVE_FASTPARQUET,
66+
reason="fastparquet is not installed",
67+
),
68+
pytest.mark.xfail(
69+
using_string_dtype(),
70+
reason="TODO(infer_string) fastparquet",
71+
strict=False,
72+
),
73+
],
6874
),
6975
pytest.param(
7076
"pyarrow",
@@ -86,15 +92,22 @@ def pa():
8692

8793

8894
@pytest.fixture
89-
def fp():
95+
def fp(request):
9096
if not _HAVE_FASTPARQUET:
9197
pytest.skip("fastparquet is not installed")
98+
if using_string_dtype():
99+
request.applymarker(
100+
pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False)
101+
)
92102
return "fastparquet"
93103

94104

95105
@pytest.fixture
96106
def df_compat():
97-
return pd.DataFrame({"A": [1, 2, 3], "B": "foo"})
107+
# TODO(infer_string) should this give str columns?
108+
return pd.DataFrame(
109+
{"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object)
110+
)
98111

99112

100113
@pytest.fixture
@@ -366,16 +379,6 @@ def check_external_error_on_write(self, df, engine, exc):
366379
with tm.external_error_raised(exc):
367380
to_parquet(df, path, engine, compression=None)
368381

369-
@pytest.mark.network
370-
@pytest.mark.single_cpu
371-
def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
372-
if engine != "auto":
373-
pytest.importorskip(engine)
374-
with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
375-
httpserver.serve_content(content=f.read())
376-
df = read_parquet(httpserver.url)
377-
tm.assert_frame_equal(df, df_compat)
378-
379382

380383
class TestBasic(Base):
381384
def test_error(self, engine):
@@ -673,6 +676,16 @@ def test_read_empty_array(self, pa, dtype):
673676
df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected
674677
)
675678

679+
@pytest.mark.network
680+
@pytest.mark.single_cpu
681+
def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
682+
if engine != "auto":
683+
pytest.importorskip(engine)
684+
with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
685+
httpserver.serve_content(content=f.read())
686+
df = read_parquet(httpserver.url, engine=engine)
687+
tm.assert_frame_equal(df, df_compat)
688+
676689

677690
class TestParquetPyArrow(Base):
678691
@pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip")
@@ -906,7 +919,7 @@ def test_write_with_schema(self, pa):
906919
out_df = df.astype(bool)
907920
check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df)
908921

909-
def test_additional_extension_arrays(self, pa):
922+
def test_additional_extension_arrays(self, pa, using_infer_string):
910923
# test additional ExtensionArrays that are supported through the
911924
# __arrow_array__ protocol
912925
pytest.importorskip("pyarrow")
@@ -917,17 +930,25 @@ def test_additional_extension_arrays(self, pa):
917930
"c": pd.Series(["a", None, "c"], dtype="string"),
918931
}
919932
)
920-
check_round_trip(df, pa)
933+
if using_infer_string:
934+
check_round_trip(df, pa, expected=df.astype({"c": "str"}))
935+
else:
936+
check_round_trip(df, pa)
921937

922938
df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
923939
check_round_trip(df, pa)
924940

925-
def test_pyarrow_backed_string_array(self, pa, string_storage):
941+
def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string):
926942
# test ArrowStringArray supported through the __arrow_array__ protocol
927943
pytest.importorskip("pyarrow")
928944
df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
929945
with pd.option_context("string_storage", string_storage):
930-
check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]"))
946+
if using_infer_string:
947+
expected = df.astype("str")
948+
expected.columns = expected.columns.astype("str")
949+
else:
950+
expected = df.astype(f"string[{string_storage}]")
951+
check_round_trip(df, pa, expected=expected)
931952

932953
def test_additional_extension_types(self, pa):
933954
# test additional ExtensionArrays that are supported through the

0 commit comments

Comments (0)