TST (string dtype): clean up construction of expected string arrays (pandas-dev#59481)

jorisvandenbossche · WillAyd · commit 7d74bdb41afc · 2024-08-22T19:32:17.000-04:00
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -30,10 +30,6 @@
     read_csv,
 )
 import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    StringArray,
-)
 
 if is_platform_windows():
     pytestmark = pytest.mark.single_cpu
@@ -663,41 +659,31 @@ def test_dtype_backend_and_dtype(self, read_ext):
     @pytest.mark.xfail(
         using_string_dtype(), reason="infer_string takes precedence", strict=False
     )
-    def test_dtype_backend_string(self, read_ext, string_storage):
+    def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel):
         # GH#36712
         if read_ext in (".xlsb", ".xls"):
             pytest.skip(f"No engine for filetype: '{read_ext}'")
 
-        pa = pytest.importorskip("pyarrow")
+        df = DataFrame(
+            {
+                "a": np.array(["a", "b"], dtype=np.object_),
+                "b": np.array(["x", pd.NA], dtype=np.object_),
+            }
+        )
+        df.to_excel(tmp_excel, sheet_name="test", index=False)
 
         with pd.option_context("mode.string_storage", string_storage):
-            df = DataFrame(
-                {
-                    "a": np.array(["a", "b"], dtype=np.object_),
-                    "b": np.array(["x", pd.NA], dtype=np.object_),
-                }
+            result = pd.read_excel(
+                tmp_excel, sheet_name="test", dtype_backend="numpy_nullable"
             )
-            with tm.ensure_clean(read_ext) as file_path:
-                df.to_excel(file_path, sheet_name="test", index=False)
-                result = pd.read_excel(
-                    file_path, sheet_name="test", dtype_backend="numpy_nullable"
-                )
 
-            if string_storage == "python":
-                expected = DataFrame(
-                    {
-                        "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
-                        "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
-                    }
-                )
-            else:
-                expected = DataFrame(
-                    {
-                        "a": ArrowStringArray(pa.array(["a", "b"])),
-                        "b": ArrowStringArray(pa.array(["x", None])),
-                    }
-                )
-            tm.assert_frame_equal(result, expected)
+        expected = DataFrame(
+            {
+                "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)),
+                "b": Series(["x", None], dtype=pd.StringDtype(string_storage)),
+            }
+        )
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)])
     def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -31,11 +31,6 @@
     read_json,
 )
 import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    StringArray,
-)
-from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
 
 from pandas.io.json import ujson_dumps
 
@@ -2037,14 +2032,10 @@ def test_json_uint64(self):
         assert result == expected
 
     @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-    @pytest.mark.parametrize(
-        "orient", ["split", "records", "values", "index", "columns"]
-    )
     def test_read_json_dtype_backend(
         self, string_storage, dtype_backend, orient, using_infer_string
     ):
         # GH#50750
-        pa = pytest.importorskip("pyarrow")
         df = DataFrame(
             {
                 "a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2058,30 +2049,18 @@ def test_read_json_dtype_backend(
             }
         )
 
-        if using_infer_string:
-            string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
-            string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
-        elif string_storage == "python":
-            string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
-            string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
-
-        elif dtype_backend == "pyarrow":
-            pa = pytest.importorskip("pyarrow")
-            from pandas.arrays import ArrowExtensionArray
-
-            string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
-            string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
-
-        else:
-            string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
-            string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
-
         out = df.to_json(orient=orient)
         with pd.option_context("mode.string_storage", string_storage):
             result = read_json(
                 StringIO(out), dtype_backend=dtype_backend, orient=orient
             )
 
+        if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            string_dtype = pd.ArrowDtype(pa.string())
+        else:
+            string_dtype = pd.StringDtype(string_storage)
+
         expected = DataFrame(
             {
                 "a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2090,12 +2069,13 @@ def test_read_json_dtype_backend(
                 "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
                 "e": Series([True, False, NA], dtype="boolean"),
                 "f": Series([True, False, True], dtype="boolean"),
-                "g": string_array,
-                "h": string_array_na,
+                "g": Series(["a", "b", "c"], dtype=string_dtype),
+                "h": Series(["a", "b", None], dtype=string_dtype),
             }
         )
 
         if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
             from pandas.arrays import ArrowExtensionArray
 
             expected = DataFrame(
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -18,11 +18,7 @@
     Timestamp,
 )
 import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    IntegerArray,
-    StringArray,
-)
+from pandas.core.arrays import IntegerArray
 
 pytestmark = pytest.mark.filterwarnings(
     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
@@ -465,8 +461,6 @@ def test_dtype_backend_and_dtype(all_parsers):
 
 def test_dtype_backend_string(all_parsers, string_storage):
     # GH#36712
-    pa = pytest.importorskip("pyarrow")
-
     with pd.option_context("mode.string_storage", string_storage):
         parser = all_parsers
 
@@ -476,21 +470,13 @@ def test_dtype_backend_string(all_parsers, string_storage):
 """
         result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")
 
-        if string_storage == "python":
-            expected = DataFrame(
-                {
-                    "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
-                    "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
-                }
-            )
-        else:
-            expected = DataFrame(
-                {
-                    "a": ArrowStringArray(pa.array(["a", "b"])),
-                    "b": ArrowStringArray(pa.array(["x", None])),
-                }
-            )
-        tm.assert_frame_equal(result, expected)
+    expected = DataFrame(
+        {
+            "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
+            "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
 
 
 def test_dtype_backend_ea_dtype_specified(all_parsers):
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
@@ -14,8 +14,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import EmptyDataError
 
 import pandas as pd
@@ -24,10 +22,6 @@
     DatetimeIndex,
 )
 import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    StringArray,
-)
 
 from pandas.io.common import urlopen
 from pandas.io.parsers import (
@@ -968,39 +962,30 @@ def test_widths_and_usecols():
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_dtype_backend(string_storage, dtype_backend):
     # GH#50289
-    if string_storage == "python":
-        arr = StringArray(np.array(["a", "b"], dtype=np.object_))
-        arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
-    elif dtype_backend == "pyarrow":
-        pa = pytest.importorskip("pyarrow")
-        from pandas.arrays import ArrowExtensionArray
-
-        arr = ArrowExtensionArray(pa.array(["a", "b"]))
-        arr_na = ArrowExtensionArray(pa.array([None, "a"]))
-    else:
-        pa = pytest.importorskip("pyarrow")
-        arr = ArrowStringArray(pa.array(["a", "b"]))
-        arr_na = ArrowStringArray(pa.array([None, "a"]))
-
     data = """a  b    c      d  e     f  g    h  i
 1  2.5  True  a
 3  4.5  False b  True  6  7.5  a"""
     with pd.option_context("mode.string_storage", string_storage):
         result = read_fwf(StringIO(data), dtype_backend=dtype_backend)
 
+    if dtype_backend == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+        string_dtype = pd.ArrowDtype(pa.string())
+    else:
+        string_dtype = pd.StringDtype(string_storage)
+
     expected = DataFrame(
         {
             "a": pd.Series([1, 3], dtype="Int64"),
             "b": pd.Series([2.5, 4.5], dtype="Float64"),
             "c": pd.Series([True, False], dtype="boolean"),
-            "d": arr,
+            "d": pd.Series(["a", "b"], dtype=string_dtype),
             "e": pd.Series([pd.NA, True], dtype="boolean"),
             "f": pd.Series([pd.NA, 6], dtype="Int64"),
             "g": pd.Series([pd.NA, 7.5], dtype="Float64"),
-            "h": arr_na,
+            "h": pd.Series([None, "a"], dtype=string_dtype),
             "i": pd.Series([pd.NA, pd.NA], dtype="Int64"),
         }
     )
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
@@ -19,10 +19,6 @@
     read_clipboard,
 )
 import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    StringArray,
-)
 
 from pandas.io.clipboard import (
     CheckedCall,
@@ -358,23 +354,15 @@ def test_read_clipboard_dtype_backend(
         self, clipboard, string_storage, dtype_backend, engine
     ):
         # GH#50502
-        if string_storage == "pyarrow" or dtype_backend == "pyarrow":
-            pa = pytest.importorskip("pyarrow")
-
-        if string_storage == "python":
-            string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
-            string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
-
-        elif dtype_backend == "pyarrow" and engine != "c":
+        if dtype_backend == "pyarrow":
             pa = pytest.importorskip("pyarrow")
-            from pandas.arrays import ArrowExtensionArray
-
-            string_array = ArrowExtensionArray(pa.array(["x", "y"]))
-            string_array_na = ArrowExtensionArray(pa.array(["x", None]))
-
+            if engine == "c" and string_storage == "pyarrow":
+                # TODO avoid this exception?
+                string_dtype = pd.ArrowDtype(pa.large_string())
+            else:
+                string_dtype = pd.ArrowDtype(pa.string())
         else:
-            string_array = ArrowStringArray(pa.array(["x", "y"]))
-            string_array_na = ArrowStringArray(pa.array(["x", None]))
+            string_dtype = pd.StringDtype(string_storage)
 
         text = """a,b,c,d,e,f,g,h,i
 x,1,4.0,x,2,4.0,,True,False
@@ -386,10 +374,10 @@ def test_read_clipboard_dtype_backend(
 
         expected = DataFrame(
             {
-                "a": string_array,
+                "a": Series(["x", "y"], dtype=string_dtype),
                 "b": Series([1, 2], dtype="Int64"),
                 "c": Series([4.0, 5.0], dtype="Float64"),
-                "d": string_array_na,
+                "d": Series(["x", None], dtype=string_dtype),
                 "e": Series([2, NA], dtype="Int64"),
                 "f": Series([4.0, NA], dtype="Float64"),
                 "g": Series([NA, NA], dtype="Int64"),
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
@@ -6,10 +6,6 @@
 
 import pandas as pd
 import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    StringArray,
-)
 
 from pandas.io.feather_format import read_feather, to_feather  # isort:skip
 
@@ -188,25 +184,17 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
             }
         )
 
-        if string_storage == "python":
-            string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
-            string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
-
-        elif dtype_backend == "pyarrow":
-            from pandas.arrays import ArrowExtensionArray
-
-            string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
-            string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
-
-        else:
-            string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
-            string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
-
         with tm.ensure_clean() as path:
             to_feather(df, path)
             with pd.option_context("mode.string_storage", string_storage):
                 result = read_feather(path, dtype_backend=dtype_backend)
 
+        if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            string_dtype = pd.ArrowDtype(pa.string())
+        else:
+            string_dtype = pd.StringDtype(string_storage)
+
         expected = pd.DataFrame(
             {
                 "a": pd.Series([1, np.nan, 3], dtype="Int64"),
@@ -215,8 +203,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
                 "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
                 "e": pd.Series([True, False, pd.NA], dtype="boolean"),
                 "f": pd.Series([True, False, True], dtype="boolean"),
-                "g": string_array,
-                "h": string_array_na,
+                "g": pd.Series(["a", "b", "c"], dtype=string_dtype),
+                "h": pd.Series(["a", "b", None], dtype=string_dtype),
             }
         )
 
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py