pandas-dev · simonjayhawkins · Nov 18, 2024 · Nov 18, 2024 · Nov 18, 2024 · Nov 18, 2024
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1435,6 +1435,10 @@ def any_string_dtype(request):
         return pd.StringDtype(storage, na_value)
 
 
+# Second copy of any_string_dtype; requesting both gives the cartesian product.
+any_string_dtype2 = any_string_dtype
+
+
 @pytest.fixture(params=tm.DATETIME64_DTYPES)
 def datetime64_dtype(request):
     """

diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
@@ -13,6 +13,10 @@
 from pandas._libs import missing as libmissing
 from pandas._libs.sparse import IntIndex
 
+from pandas.core.dtypes.cast import (
+    find_common_type,
+    infer_dtype_from_scalar,
+)
 from pandas.core.dtypes.common import (
     is_integer_dtype,
     is_list_like,
@@ -567,7 +571,13 @@ def from_dummies(
             )
         else:
             data_slice = data_to_decode.loc[:, prefix_slice]
-        cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype)
+        dtype = data.columns.dtype
+        if default_category:
+            default_category_dtype = infer_dtype_from_scalar(default_category[prefix])[
+                0
+            ]
+            dtype = find_common_type([dtype, default_category_dtype])
+        cats_array = data._constructor_sliced(cats, dtype=dtype)
         # get indices of True entries along axis=1
         true_values = data_slice.idxmax(axis=1)
         indexer = data_slice.columns.get_indexer_for(true_values)

diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Series,
@@ -330,14 +328,10 @@ def test_no_prefix_string_cats_contains_get_dummies_NaN_column():
         ),
     ],
 )
-def test_no_prefix_string_cats_default_category(
-    default_category, expected, using_infer_string
-):
+def test_no_prefix_string_cats_default_category(default_category, expected):
     dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
     result = from_dummies(dummies, default_category=default_category)
     expected = DataFrame(expected)
-    if using_infer_string:
-        expected[""] = expected[""].astype("str")
     tm.assert_frame_equal(result, expected)
 
 
@@ -364,7 +358,6 @@ def test_with_prefix_contains_get_dummies_NaN_column():
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "default_category, expected",
     [
@@ -390,7 +383,7 @@ def test_with_prefix_contains_get_dummies_NaN_column():
         ),
         pytest.param(
             {"col2": None, "col1": False},
-            {"col1": ["a", "b", False], "col2": [None, "a", "c"]},
+            {"col1": ["a", "b", False], "col2": Series([None, "a", "c"], dtype=object)},
             id="default_category is a dict with bool and None values",
         ),
         pytest.param(

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -9,8 +9,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat.numpy import np_version_gte1p25
 
 import pandas as pd
@@ -2664,46 +2662,46 @@ def test_pivot_columns_not_given(self):
         with pytest.raises(TypeError, match="missing 1 required keyword-only argument"):
             df.pivot()
 
-    @pytest.mark.xfail(
-        using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
-    )
     def test_pivot_columns_is_none(self):
         # GH#48293
-        df = DataFrame({None: [1], "b": 2, "c": 3})
+        df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object"))
         result = df.pivot(columns=None)
         expected = DataFrame({("b", 1): [2], ("c", 1): 3})
+        expected.columns = expected.columns.set_levels(
+            expected.columns.levels[0].astype(object), level=0
+        )
         tm.assert_frame_equal(result, expected)
 
         result = df.pivot(columns=None, index="b")
         expected = DataFrame({("c", 1): 3}, index=Index([2], name="b"))
+        expected.columns = expected.columns.set_levels(
+            expected.columns.levels[0].astype(object), level=0
+        )
         tm.assert_frame_equal(result, expected)
 
         result = df.pivot(columns=None, index="b", values="c")
         expected = DataFrame({1: 3}, index=Index([2], name="b"))
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(
-        using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
-    )
     def test_pivot_index_is_none(self):
         # GH#48293
-        df = DataFrame({None: [1], "b": 2, "c": 3})
+        df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object"))
 
         result = df.pivot(columns="b", index=None)
         expected = DataFrame({("c", 2): 3}, index=[1])
+        expected.columns = expected.columns.set_levels(
+            expected.columns.levels[0].astype(object), level=0
+        )
         expected.columns.names = [None, "b"]
         tm.assert_frame_equal(result, expected)
 
         result = df.pivot(columns="b", index=None, values="c")
         expected = DataFrame(3, index=[1], columns=Index([2], name="b"))
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(
-        using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
-    )
     def test_pivot_values_is_none(self):
         # GH#48293
-        df = DataFrame({None: [1], "b": 2, "c": 3})
+        df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object"))
 
         result = df.pivot(columns="b", index="c", values=None)
         expected = DataFrame(

diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
@@ -293,23 +293,12 @@ def test_startswith_endswith_validate_na(any_string_dtype):
         dtype=any_string_dtype,
     )
 
-    dtype = ser.dtype
-    if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"):
-        msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            ser.str.startswith("kapow", na="baz")
-        msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            ser.str.endswith("bar", na="baz")
-    else:
-        # TODO(infer_string): don't surface pyarrow errors
-        import pyarrow as pa
-
-        msg = "Could not convert 'baz' with type str: tried to convert to boolean"
-        with pytest.raises(pa.lib.ArrowInvalid, match=msg):
-            ser.str.startswith("kapow", na="baz")
-        with pytest.raises(pa.lib.ArrowInvalid, match=msg):
-            ser.str.endswith("kapow", na="baz")
+    msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        ser.str.startswith("kapow", na="baz")
+    msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        ser.str.endswith("bar", na="baz")
 
 
 @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])

diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 from pandas import (
@@ -98,30 +96,77 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype):
 
 
 # GH#47872
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_get_dummies_with_str_dtype(any_string_dtype):
+@pytest.mark.parametrize("use_string_repr", [True, False])
+def test_get_dummies_with_any_string_dtype(
+    request, any_string_dtype, any_string_dtype2, use_string_repr, using_infer_string
+):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=str)
-    expected = DataFrame(
-        [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]],
-        columns=list("abc"),
-        dtype=str,
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-# GH#47872
-@td.skip_if_no("pyarrow")
-def test_get_dummies_with_pa_str_dtype(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype="str[pyarrow]")
-    expected = DataFrame(
-        [
-            ["true", "true", "false"],
-            ["true", "false", "true"],
-            ["false", "false", "false"],
-        ],
-        columns=list("abc"),
-        dtype="str[pyarrow]",
-    )
+    test_ids = request.node.callspec.id.split("-")
+    series_dtype_id = test_ids[0][7:]
+    expected_dtype_id = test_ids[1][7:]
+    if expected_dtype_id == "object":
+        if "pyarrow" in series_dtype_id:
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=("pyarrow.lib.ArrowTypeError: Expected integer, got bool"),
+                    strict=True,
+                )
+            )
+        expected = DataFrame(
+            [
+                [True, True, False],
+                [True, False, True],
+                [False, False, False],
+            ],
+            columns=list("abc"),
+            dtype=np.bool_,
+        )
+    elif expected_dtype_id == "str[pyarrow]" and use_string_repr:
+        # data type 'str[pyarrow]' uses pandas.ArrowDtype instead
+        expected = DataFrame(
+            [
+                ["true", "true", "false"],
+                ["true", "false", "true"],
+                ["false", "false", "false"],
+            ],
+            columns=list("abc"),
+            dtype="str[pyarrow]",
+        )
+    elif expected_dtype_id == "str[python]" and use_string_repr:
+        # data type 'str[python]' not understood, so pass the builtin str instead
+        expected_dtype_id = str
+        if using_infer_string:
+            expected = DataFrame(
+                [
+                    ["True", "True", "False"],
+                    ["True", "False", "True"],
+                    ["False", "False", "False"],
+                ],
+                columns=list("abc"),
+                dtype=expected_dtype_id,
+            )
+        else:
+            expected = DataFrame(
+                [
+                    ["T", "T", "F"],
+                    ["T", "F", "T"],
+                    ["F", "F", "F"],
+                ],
+                columns=list("abc"),
+                dtype=expected_dtype_id,
+            )
+    else:
+        expected = DataFrame(
+            [
+                ["True", "True", "False"],
+                ["True", "False", "True"],
+                ["False", "False", "False"],
+            ],
+            columns=list("abc"),
+            dtype=any_string_dtype2,
+        )
+    if use_string_repr:
+        result = s.str.get_dummies("|", dtype=expected_dtype_id)
+    else:
+        result = s.str.get_dummies("|", dtype=any_string_dtype2)
     tm.assert_frame_equal(result, expected)