Adjust tests in strings folder for new string option (#56159)

phofl · web-flow · commit a61c556f6abd · 2023-12-09T11:24:30.000-08:00
* Adjust tests in strings folder for new string option

* BUG: translate losing object dtype with new string dtype

* Fix

* BUG: Index.str.cat casting result always to object

* Update accessor.py

* Fix further bugs

* Fix

* Fix tests

* Update accessor.py
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -918,7 +918,13 @@ def split(
         if is_re(pat):
             regex = True
         result = self._data.array._str_split(pat, n, expand, regex)
-        return self._wrap_result(result, returns_string=expand, expand=expand)
+        if self._data.dtype == "category":
+            dtype = self._data.dtype.categories.dtype
+        else:
+            dtype = object if self._data.dtype == object else None
+        return self._wrap_result(
+            result, expand=expand, returns_string=expand, dtype=dtype
+        )
 
     @Appender(
         _shared_docs["str_split"]
@@ -936,7 +942,10 @@ def split(
     @forbid_nonstring_types(["bytes"])
     def rsplit(self, pat=None, *, n=-1, expand: bool = False):
         result = self._data.array._str_rsplit(pat, n=n)
-        return self._wrap_result(result, expand=expand, returns_string=expand)
+        dtype = object if self._data.dtype == object else None
+        return self._wrap_result(
+            result, expand=expand, returns_string=expand, dtype=dtype
+        )
 
     _shared_docs[
         "str_partition"
@@ -1032,7 +1041,13 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False):
     @forbid_nonstring_types(["bytes"])
     def partition(self, sep: str = " ", expand: bool = True):
         result = self._data.array._str_partition(sep, expand)
-        return self._wrap_result(result, expand=expand, returns_string=expand)
+        if self._data.dtype == "category":
+            dtype = self._data.dtype.categories.dtype
+        else:
+            dtype = object if self._data.dtype == object else None
+        return self._wrap_result(
+            result, expand=expand, returns_string=expand, dtype=dtype
+        )
 
     @Appender(
         _shared_docs["str_partition"]
@@ -1046,7 +1061,13 @@ def partition(self, sep: str = " ", expand: bool = True):
     @forbid_nonstring_types(["bytes"])
     def rpartition(self, sep: str = " ", expand: bool = True):
         result = self._data.array._str_rpartition(sep, expand)
-        return self._wrap_result(result, expand=expand, returns_string=expand)
+        if self._data.dtype == "category":
+            dtype = self._data.dtype.categories.dtype
+        else:
+            dtype = object if self._data.dtype == object else None
+        return self._wrap_result(
+            result, expand=expand, returns_string=expand, dtype=dtype
+        )
 
     def get(self, i):
         """
@@ -2752,7 +2773,7 @@ def extract(
         else:
             name = _get_single_group_name(regex)
             result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
-        return self._wrap_result(result, name=name)
+        return self._wrap_result(result, name=name, dtype=result_dtype)
 
     @forbid_nonstring_types(["bytes"])
     def extractall(self, pat, flags: int = 0) -> DataFrame:
@@ -3492,7 +3513,7 @@ def str_extractall(arr, pat, flags: int = 0) -> DataFrame:
         raise ValueError("pattern contains no capture groups")
 
     if isinstance(arr, ABCIndex):
-        arr = arr.to_series().reset_index(drop=True)
+        arr = arr.to_series().reset_index(drop=True).astype(arr.dtype)
 
     columns = _get_group_names(regex)
     match_list = []
diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py
@@ -8,6 +8,7 @@
     MultiIndex,
     Series,
     _testing as tm,
+    option_context,
 )
 from pandas.core.strings.accessor import StringMethods
 
@@ -163,7 +164,8 @@ def test_api_per_method(
 
     if inferred_dtype in allowed_types:
         # xref GH 23555, GH 23556
-        method(*args, **kwargs)  # works!
+        with option_context("future.no_silent_downcasting", True):
+            method(*args, **kwargs)  # works!
     else:
         # GH 23011, GH 23163
         msg = (
diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py
@@ -21,7 +21,8 @@ def test_title_mixed_object():
     s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0])
     result = s.str.title()
     expected = Series(
-        ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan]
+        ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan],
+        dtype=object,
     )
     tm.assert_almost_equal(result, expected)
 
@@ -41,11 +42,15 @@ def test_lower_upper_mixed_object():
     s = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])
 
     result = s.str.upper()
-    expected = Series(["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan])
+    expected = Series(
+        ["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan], dtype=object
+    )
     tm.assert_series_equal(result, expected)
 
     result = s.str.lower()
-    expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan])
+    expected = Series(
+        ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
+    )
     tm.assert_series_equal(result, expected)
 
 
@@ -71,7 +76,8 @@ def test_capitalize_mixed_object():
     s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0])
     result = s.str.capitalize()
     expected = Series(
-        ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan]
+        ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan],
+        dtype=object,
     )
     tm.assert_series_equal(result, expected)
 
@@ -87,7 +93,8 @@ def test_swapcase_mixed_object():
     s = Series(["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0])
     result = s.str.swapcase()
     expected = Series(
-        ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan]
+        ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan],
+        dtype=object,
     )
     tm.assert_series_equal(result, expected)
 
@@ -138,19 +145,22 @@ def test_pad_mixed_object():
 
     result = s.str.pad(5, side="left")
     expected = Series(
-        ["    a", np.nan, "    b", np.nan, np.nan, "   ee", None, np.nan, np.nan]
+        ["    a", np.nan, "    b", np.nan, np.nan, "   ee", None, np.nan, np.nan],
+        dtype=object,
     )
     tm.assert_series_equal(result, expected)
 
     result = s.str.pad(5, side="right")
     expected = Series(
-        ["a    ", np.nan, "b    ", np.nan, np.nan, "ee   ", None, np.nan, np.nan]
+        ["a    ", np.nan, "b    ", np.nan, np.nan, "ee   ", None, np.nan, np.nan],
+        dtype=object,
     )
     tm.assert_series_equal(result, expected)
 
     result = s.str.pad(5, side="both")
     expected = Series(
-        ["  a  ", np.nan, "  b  ", np.nan, np.nan, "  ee ", None, np.nan, np.nan]
+        ["  a  ", np.nan, "  b  ", np.nan, np.nan, "  ee ", None, np.nan, np.nan],
+        dtype=object,
     )
     tm.assert_series_equal(result, expected)
 
@@ -238,7 +248,8 @@ def test_center_ljust_rjust_mixed_object():
             None,
             np.nan,
             np.nan,
-        ]
+        ],
+        dtype=object,
     )
     tm.assert_series_equal(result, expected)
 
@@ -255,7 +266,8 @@ def test_center_ljust_rjust_mixed_object():
             None,
             np.nan,
             np.nan,
-        ]
+        ],
+        dtype=object,
     )
     tm.assert_series_equal(result, expected)
 
@@ -272,7 +284,8 @@ def test_center_ljust_rjust_mixed_object():
             None,
             np.nan,
             np.nan,
-        ]
+        ],
+        dtype=object,
     )
     tm.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py
@@ -47,13 +47,16 @@ def test_extract_expand_False_mixed_object():
     # two groups
     result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
     er = [np.nan, np.nan]  # empty row
-    expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er])
+    expected = DataFrame(
+        [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object
+    )
     tm.assert_frame_equal(result, expected)
 
     # single group
     result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False)
     expected = Series(
-        ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan]
+        ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan],
+        dtype=object,
     )
     tm.assert_series_equal(result, expected)
 
@@ -238,7 +241,9 @@ def test_extract_expand_True_mixed_object():
     )
 
     result = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
-    expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er])
+    expected = DataFrame(
+        [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object
+    )
     tm.assert_frame_equal(result, expected)
 
 
@@ -603,8 +608,8 @@ def test_extractall_stringindex(any_string_dtype):
     # index.name doesn't affect to the result
     if any_string_dtype == "object":
         for idx in [
-            Index(["a1a2", "b1", "c1"]),
-            Index(["a1a2", "b1", "c1"], name="xxx"),
+            Index(["a1a2", "b1", "c1"], dtype=object),
+            Index(["a1a2", "b1", "c1"], name="xxx", dtype=object),
         ]:
             result = idx.str.extractall(r"[ab](?P<digit>\d)")
             tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
@@ -242,7 +242,7 @@ def test_contains_nan(any_string_dtype):
 
 
 @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
-@pytest.mark.parametrize("dtype", [None, "category"])
+@pytest.mark.parametrize("dtype", ["object", "category"])
 @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
 @pytest.mark.parametrize("na", [True, False])
 def test_startswith(pat, dtype, null_value, na):
@@ -254,10 +254,10 @@ def test_startswith(pat, dtype, null_value, na):
 
     result = values.str.startswith(pat)
     exp = Series([False, np.nan, True, False, False, np.nan, True])
-    if dtype is None and null_value is pd.NA:
+    if dtype == "object" and null_value is pd.NA:
         # GH#18463
         exp = exp.fillna(null_value)
-    elif dtype is None and null_value is None:
+    elif dtype == "object" and null_value is None:
         exp[exp.isna()] = None
     tm.assert_series_equal(result, exp)
 
@@ -300,7 +300,7 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na):
 
 
 @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
-@pytest.mark.parametrize("dtype", [None, "category"])
+@pytest.mark.parametrize("dtype", ["object", "category"])
 @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
 @pytest.mark.parametrize("na", [True, False])
 def test_endswith(pat, dtype, null_value, na):
@@ -312,10 +312,10 @@ def test_endswith(pat, dtype, null_value, na):
 
     result = values.str.endswith(pat)
     exp = Series([False, np.nan, False, False, True, np.nan, True])
-    if dtype is None and null_value is pd.NA:
+    if dtype == "object" and null_value is pd.NA:
         # GH#18463
-        exp = exp.fillna(pd.NA)
-    elif dtype is None and null_value is None:
+        exp = exp.fillna(null_value)
+    elif dtype == "object" and null_value is None:
         exp[exp.isna()] = None
     tm.assert_series_equal(result, exp)
 
@@ -382,7 +382,9 @@ def test_replace_mixed_object():
         ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
     )
     result = Series(ser).str.replace("BAD[_]*", "", regex=True)
-    expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan])
+    expected = Series(
+        ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
+    )
     tm.assert_series_equal(result, expected)
 
 
@@ -469,7 +471,9 @@ def test_replace_compiled_regex_mixed_object():
         ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
     )
     result = Series(ser).str.replace(pat, "", regex=True)
-    expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan])
+    expected = Series(
+        ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
+    )
     tm.assert_series_equal(result, expected)
 
 
@@ -913,7 +917,7 @@ def test_translate_mixed_object():
     # Series with non-string values
     s = Series(["a", "b", "c", 1.2])
     table = str.maketrans("abc", "cde")
-    expected = Series(["c", "d", "e", np.nan])
+    expected = Series(["c", "d", "e", np.nan], dtype=object)
     result = s.str.translate(table)
     tm.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
@@ -681,22 +681,24 @@ def test_partition_sep_kwarg(any_string_dtype, method):
 def test_get():
     ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
     result = ser.str.split("_").str.get(1)
-    expected = Series(["b", "d", np.nan, "g"])
+    expected = Series(["b", "d", np.nan, "g"], dtype=object)
     tm.assert_series_equal(result, expected)
 
 
 def test_get_mixed_object():
     ser = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0])
     result = ser.str.split("_").str.get(1)
-    expected = Series(["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan])
+    expected = Series(
+        ["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan], dtype=object
+    )
     tm.assert_series_equal(result, expected)
 
 
 @pytest.mark.parametrize("idx", [2, -3])
 def test_get_bounds(idx):
     ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"])
     result = ser.str.split("_").str.get(idx)
-    expected = Series(["3", "8", np.nan])
+    expected = Series(["3", "8", np.nan], dtype=object)
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
@@ -8,6 +8,7 @@
     DataFrame,
     Series,
     _testing as tm,
+    option_context,
 )
 
 
@@ -56,7 +57,8 @@ def test_string_array(nullable_string_dtype, any_string_method):
         columns = expected.select_dtypes(include="object").columns
         assert all(result[columns].dtypes == nullable_string_dtype)
         result[columns] = result[columns].astype(object)
-        expected[columns] = expected[columns].fillna(NA)  # GH#18463
+        with option_context("future.no_silent_downcasting", True):
+            expected[columns] = expected[columns].fillna(NA)  # GH#18463
 
     tm.assert_equal(result, expected)
 
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py