From e6f9527097471a5dc82dc7afb0ff6ca841c8fcd3 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Wed, 21 Aug 2024 14:26:54 -0400
Subject: [PATCH 01/22] Add prefix, prefix_sep, dummy_na, and dtype args to
 StringMethods get_dummies()

---
 pandas/core/arrays/arrow/array.py        | 23 +++++++-
 pandas/core/arrays/string_arrow.py       | 14 +++--
 pandas/core/strings/accessor.py          | 71 +++++++++++++++++++++++-
 pandas/core/strings/base.py              |  5 +-
 pandas/core/strings/object_array.py      | 12 +++-
 pandas/tests/strings/test_get_dummies.py | 56 +++++++++++++++++++
 6 files changed, 170 insertions(+), 11 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 3d55513ab914c..a374eb69b8528 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2533,20 +2533,39 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self:
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))
 
-    def _str_get_dummies(self, sep: str = "|"):
+    def _str_get_dummies(
+        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None
+    ):
+        if dtype is None:
+            dtype = np.bool_
         split = pc.split_pattern(self._pa_array, sep)
         flattened_values = pc.list_flatten(split)
+        if dummy_na:
+            nan_mask = self._pa_array.is_null()
+            flattened_values = flattened_values.fill_null(pa.NA)
         uniques = flattened_values.unique()
         uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques))
+        if dummy_na:
+            if "__nan__" not in uniques_sorted.to_pylist():
+                uniques_sorted = pa.concat_arrays(
+                    [uniques_sorted, pa.array(["__nan__"], type=uniques_sorted.type)]
+                )
         lengths = pc.list_value_length(split).fill_null(0).to_numpy()
         n_rows = len(self)
         n_cols = len(uniques)
         indices = pc.index_in(flattened_values, uniques_sorted).to_numpy()
         indices = indices + np.arange(n_rows).repeat(lengths) * n_cols
-        dummies = np.zeros(n_rows * n_cols, dtype=np.bool_)
+        dummies = np.zeros(n_rows * n_cols, dtype=dtype)
         dummies[indices] = True
         dummies = dummies.reshape((n_rows, n_cols))
+        if dummy_na:
+            nan_column = nan_mask.to_numpy().reshape(-1, 1)
+            dummies = np.hstack([dummies, nan_column])
         result = type(self)(pa.array(list(dummies)))
+        if dummy_na:
+            uniques_sorted = pa.array(
+                ["NaN" if x == "__nan__" else x for x in uniques_sorted.to_pylist()]
+            )
         return result, uniques_sorted.to_pylist()
 
     def _str_index(self, sub: str, start: int = 0, end: int | None = None) -> Self:
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index f2fd9d5d6610f..5fd5748bf5dc7 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -550,12 +550,18 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
             return super()._str_find(sub, start, end)
         return self._convert_int_dtype(result)
 
-    def _str_get_dummies(self, sep: str = "|"):
-        dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)
+    def _str_get_dummies(
+        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None
+    ):
+        if dtype is None:
+            dtype = np.int64
+        dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(
+            sep, dummy_na, dtype
+        )
         if len(labels) == 0:
-            return np.empty(shape=(0, 0), dtype=np.int64), labels
+            return np.empty(shape=(0, 0), dtype=dtype), labels
         dummies = np.vstack(dummies_pa.to_numpy())
-        return dummies.astype(np.int64, copy=False), labels
+        return dummies.astype(dtype, copy=False), labels
 
     def _convert_int_dtype(self, result):
         return Int64Dtype().__from_arrow__(result)
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 7494a43caf004..a0e37ab597796 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -51,9 +51,12 @@
 if TYPE_CHECKING:
     from collections.abc import (
         Hashable,
+        Iterable,
         Iterator,
     )
 
+    from pandas._typing import NpDtype
+
     from pandas import (
         DataFrame,
         Index,
@@ -2357,7 +2360,14 @@ def wrap(
         return self._wrap_result(result)
 
     @forbid_nonstring_types(["bytes"])
-    def get_dummies(self, sep: str = "|"):
+    def get_dummies(
+        self,
+        sep: str = "|",
+        prefix: str | Iterable[str] | dict[str, str] | None = None,
+        prefix_sep: str = "_",
+        dummy_na: bool = False,
+        dtype: NpDtype | None = None,
+    ):
         """
         Return DataFrame of dummy/indicator variables for Series.
 
@@ -2368,6 +2378,15 @@ def get_dummies(self, sep: str = "|"):
         ----------
         sep : str, default "|"
             String to split on.
+        prefix : str, list of str, or dict of str, default None
+            String to append DataFrame column names.
+            Pass a list with length equal to the number of columns
+            when calling get_dummies on a DataFrame. Alternatively, `prefix`
+            can be a dictionary mapping column names to prefixes.
+        prefix_sep : str, default '_'
+            If appending prefix, separator/delimiter to use.
+        dummy_na : bool, default False
+            Add a column to indicate NaNs, if False NaNs are ignored.
 
         Returns
         -------
@@ -2392,10 +2411,58 @@ def get_dummies(self, sep: str = "|"):
         0  1  1  0
         1  0  0  0
         2  1  0  1
+
+        >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dummy_na=True)
+           a  b  c  NaN
+        0  1  1  0    0
+        1  0  0  0    1
+        2  1  0  1    0
+
+        >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(prefix="prefix")
+              prefix_a  prefix_b  prefix_c
+        0          1         1         0
+        1          0         0         0
+        2          1         0         1
+
+        >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(
+        ...     prefix={"a": "alpha", "b": "beta", "c": "gamma"}
+        ... )
+              alpha_a  beta_b  gamma_c
+        0          1       1       0
+        1          0       0       0
+        2          1       0       1
+
+        >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool)
+                a      b      c
+        0   True   True    False
+        1   False  False   False
+        2   True   False   True
         """
         # we need to cast to Series of strings as only that has all
         # methods available for making the dummies...
-        result, name = self._data.array._str_get_dummies(sep)
+        result, name = self._data.array._str_get_dummies(sep, dummy_na, dtype)
+        name = [np.nan if x == "NaN" else x for x in name]
+        if isinstance(prefix, str):
+            name = [f"{prefix}{prefix_sep}{col}" for col in name]
+        elif isinstance(prefix, dict):
+            if len(prefix) != len(name):
+                len_msg = (
+                    f"Length of 'prefix' ({len(prefix)}) did not match the "
+                    "length of the columns being encoded "
+                    f"({len(name)})."
+                )
+                raise ValueError(len_msg)
+            name = [f"{prefix[col]}{prefix_sep}{col}" for col in name]
+        elif isinstance(prefix, list):
+            if len(prefix) != len(name):
+                len_msg = (
+                    f"Length of 'prefix' ({len(prefix)}) did not match the "
+                    "length of the columns being encoded "
+                    f"({len(name)})."
+                )
+                raise ValueError(len_msg)
+            name = [f"{prefix[i]}{prefix_sep}{col}" for i, col in enumerate(name)]
+
         return self._wrap_result(
             result,
             name=name,
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
index c1f94abff428a..84580ba9c5972 100644
--- a/pandas/core/strings/base.py
+++ b/pandas/core/strings/base.py
@@ -14,6 +14,7 @@
     import re
 
     from pandas._typing import (
+        NpDtype,
         Scalar,
         Self,
     )
@@ -161,7 +162,9 @@ def _str_wrap(self, width: int, **kwargs):
         pass
 
     @abc.abstractmethod
-    def _str_get_dummies(self, sep: str = "|"):
+    def _str_get_dummies(
+        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None
+    ):
         pass
 
     @abc.abstractmethod
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index bdcf55e61d2d1..05102325c2aaf 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -372,9 +372,13 @@ def _str_wrap(self, width: int, **kwargs):
         tw = textwrap.TextWrapper(**kwargs)
         return self._str_map(lambda s: "\n".join(tw.wrap(s)))
 
-    def _str_get_dummies(self, sep: str = "|"):
+    def _str_get_dummies(
+        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None
+    ):
         from pandas import Series
 
+        if dtype is None:
+            dtype = np.int64
         arr = Series(self).fillna("")
         try:
             arr = sep + arr + sep
@@ -386,7 +390,7 @@ def _str_get_dummies(self, sep: str = "|"):
             tags.update(ts)
         tags2 = sorted(tags - {""})
 
-        dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)
+        dummies = np.empty((len(arr), len(tags2)), dtype=dtype)
 
         def _isin(test_elements: str, element: str) -> bool:
             return element in test_elements
@@ -396,6 +400,10 @@ def _isin(test_elements: str, element: str) -> bool:
             dummies[:, i] = lib.map_infer(
                 arr.to_numpy(), functools.partial(_isin, element=pat)
             )
+        if dummy_na:
+            nan_col = Series(self).isna().astype(dtype).to_numpy()
+            dummies = np.column_stack((dummies, nan_col))
+            tags2.append(np.nan)
         return dummies, tags2
 
     def _str_upper(self):
diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 31386e4e342ae..59afc3ca7dc05 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -51,3 +51,59 @@ def test_get_dummies_with_name_dummy_index():
         [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name")
     )
     tm.assert_index_equal(result, expected)
+
+
+def test_get_dummies_with_prefix(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", prefix="prefix")
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["prefix_a", "prefix_b", "prefix_c"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_dummies_with_prefix_sep(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", prefix=None, prefix_sep="__")
+    expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=["a", "b", "c"])
+    tm.assert_frame_equal(result, expected)
+
+    result = s.str.get_dummies(sep="|", prefix="col", prefix_sep="__")
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["col__a", "col__b", "col__c"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_dummies_with_dummy_na(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", dummy_na=True)
+    expected = DataFrame(
+        [[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 0, 1]],
+        columns=["a", "b", "c", np.nan],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_dummies_with_dtype(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", dtype=bool)
+    expected = DataFrame(
+        [[True, True, False], [True, False, True], [False, False, False]],
+        columns=["a", "b", "c"],
+    )
+    tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == bool).all()
+
+
+def test_get_dummies_with_prefix_dict(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    prefix = {"a": "alpha", "b": "beta", "c": "gamma"}
+    result = s.str.get_dummies(sep="|", prefix=prefix)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["alpha_a", "beta_b", "gamma_c"],
+    )
+    tm.assert_frame_equal(result, expected)

From dafb61d16cb46457e9caa2ec72825dc52919dd10 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Wed, 21 Aug 2024 14:43:09 -0400
Subject: [PATCH 02/22] Fix import issue

---
 pandas/core/arrays/string_arrow.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 5fd5748bf5dc7..ab0768d988a5c 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -59,6 +59,7 @@
         ArrayLike,
         AxisInt,
         Dtype,
+        NpDtype,
         Scalar,
         Self,
         npt,

From bb79ef2bb97ec636679a6944209743bba3e3747e Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Wed, 21 Aug 2024 16:46:42 -0400
Subject: [PATCH 03/22] Fix typing of dtype

---
 pandas/core/strings/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
index 84580ba9c5972..87aa7c26f1304 100644
--- a/pandas/core/strings/base.py
+++ b/pandas/core/strings/base.py
@@ -163,7 +163,7 @@ def _str_wrap(self, width: int, **kwargs):
 
     @abc.abstractmethod
     def _str_get_dummies(
-        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None
+        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None
     ):
         pass
 

From 24be84f44d18af6c7d13ed16de830852c42c00eb Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Wed, 21 Aug 2024 17:08:05 -0400
Subject: [PATCH 04/22] Fix NaN type issue

---
 pandas/core/strings/object_array.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 05102325c2aaf..c5f3768ddf4ce 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -403,7 +403,7 @@ def _isin(test_elements: str, element: str) -> bool:
         if dummy_na:
             nan_col = Series(self).isna().astype(dtype).to_numpy()
             dummies = np.column_stack((dummies, nan_col))
-            tags2.append(np.nan)
+            tags2.append("NaN")
         return dummies, tags2
 
     def _str_upper(self):

From 09b2fad68410dcc486aa2e3ee95a9a0e420f03d1 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Wed, 21 Aug 2024 17:20:43 -0400
Subject: [PATCH 05/22] Support categorical string backend

---
 pandas/core/arrays/categorical.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 64e5eec43a5c1..9b4447016280c 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2744,11 +2744,15 @@ def _str_map(
         result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
         return take_nd(result, codes, fill_value=na_value)
 
-    def _str_get_dummies(self, sep: str = "|"):
+    def _str_get_dummies(
+        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None
+    ):
         # sep may not be in categories. Just bail on this.
         from pandas.core.arrays import NumpyExtensionArray
 
-        return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)
+        return NumpyExtensionArray(self.astype(str))._str_get_dummies(
+            sep, dummy_na, dtype
+        )
 
     # ------------------------------------------------------------------------
     # GroupBy Methods

From 50ed90c571aa569dd622b36c07a1025aeb793908 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Wed, 21 Aug 2024 17:46:12 -0400
Subject: [PATCH 06/22] Fix dtype type hints

---
 pandas/core/arrays/arrow/array.py  | 2 +-
 pandas/core/arrays/categorical.py  | 2 +-
 pandas/core/arrays/string_arrow.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index a374eb69b8528..c17729590c183 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2534,7 +2534,7 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self:
         return type(self)(pa.chunked_array(result))
 
     def _str_get_dummies(
-        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None
+        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None
     ):
         if dtype is None:
             dtype = np.bool_
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 9b4447016280c..b25db2affd451 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2745,7 +2745,7 @@ def _str_map(
         return take_nd(result, codes, fill_value=na_value)
 
     def _str_get_dummies(
-        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None
+        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None
     ):
         # sep may not be in categories. Just bail on this.
         from pandas.core.arrays import NumpyExtensionArray
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index ab0768d988a5c..5df2edc6c5018 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -552,7 +552,7 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
         return self._convert_int_dtype(result)
 
     def _str_get_dummies(
-        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None
+        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None
     ):
         if dtype is None:
             dtype = np.int64

From 9e95485d1fe3d65bc9fe158db9517134b728d616 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Wed, 21 Aug 2024 18:09:30 -0400
Subject: [PATCH 07/22] Add dtype to get_dummies docstring

---
 pandas/core/strings/accessor.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index a0e37ab597796..c25684e55e12b 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -2387,6 +2387,8 @@ def get_dummies(
             If appending prefix, separator/delimiter to use.
         dummy_na : bool, default False
             Add a column to indicate NaNs, if False NaNs are ignored.
+        dtype: dtype, default int64
+            Data type for new columns. Only a single dtype is allowed.
 
         Returns
         -------

From 9a47768b02cf4d622c04704be39cb937da13d759 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Wed, 21 Aug 2024 18:47:53 -0400
Subject: [PATCH 08/22] Fix get_dummies dtype docstring

---
 pandas/core/strings/accessor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index c25684e55e12b..b7630c0f1fe64 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -2387,7 +2387,7 @@ def get_dummies(
             If appending prefix, separator/delimiter to use.
         dummy_na : bool, default False
             Add a column to indicate NaNs, if False NaNs are ignored.
-        dtype: dtype, default int64
+        dtype : dtype, default np.int64
             Data type for new columns. Only a single dtype is allowed.
 
         Returns

From 9702bf7e80d3f4deeef1ab866edfdaeb254d0fd7 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Tue, 3 Sep 2024 15:52:45 -0400
Subject: [PATCH 09/22] Remove changes for unnecessary args

---
 pandas/core/arrays/arrow/array.py        |  19 +----
 pandas/core/arrays/categorical.py        |   8 +-
 pandas/core/arrays/string_arrow.py       |   6 +-
 pandas/core/strings/accessor.py          |  57 +------------
 pandas/core/strings/base.py              |   4 +-
 pandas/core/strings/object_array.py      |   8 +-
 pandas/tests/strings/test_get_dummies.py | 102 +++++++++++++----------
 7 files changed, 67 insertions(+), 137 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index c17729590c183..55b5ee319e5f6 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2533,23 +2533,13 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self:
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))
 
-    def _str_get_dummies(
-        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None
-    ):
+    def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         if dtype is None:
             dtype = np.bool_
         split = pc.split_pattern(self._pa_array, sep)
         flattened_values = pc.list_flatten(split)
-        if dummy_na:
-            nan_mask = self._pa_array.is_null()
-            flattened_values = flattened_values.fill_null(pa.NA)
         uniques = flattened_values.unique()
         uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques))
-        if dummy_na:
-            if "__nan__" not in uniques_sorted.to_pylist():
-                uniques_sorted = pa.concat_arrays(
-                    [uniques_sorted, pa.array(["__nan__"], type=uniques_sorted.type)]
-                )
         lengths = pc.list_value_length(split).fill_null(0).to_numpy()
         n_rows = len(self)
         n_cols = len(uniques)
@@ -2558,14 +2548,7 @@ def _str_get_dummies(
         dummies = np.zeros(n_rows * n_cols, dtype=dtype)
         dummies[indices] = True
         dummies = dummies.reshape((n_rows, n_cols))
-        if dummy_na:
-            nan_column = nan_mask.to_numpy().reshape(-1, 1)
-            dummies = np.hstack([dummies, nan_column])
         result = type(self)(pa.array(list(dummies)))
-        if dummy_na:
-            uniques_sorted = pa.array(
-                ["NaN" if x == "__nan__" else x for x in uniques_sorted.to_pylist()]
-            )
         return result, uniques_sorted.to_pylist()
 
     def _str_index(self, sub: str, start: int = 0, end: int | None = None) -> Self:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index b25db2affd451..c609eab256205 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2744,15 +2744,11 @@ def _str_map(
         result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
         return take_nd(result, codes, fill_value=na_value)
 
-    def _str_get_dummies(
-        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None
-    ):
+    def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         # sep may not be in categories. Just bail on this.
         from pandas.core.arrays import NumpyExtensionArray
 
-        return NumpyExtensionArray(self.astype(str))._str_get_dummies(
-            sep, dummy_na, dtype
-        )
+        return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep, dtype)
 
     # ------------------------------------------------------------------------
     # GroupBy Methods
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 5df2edc6c5018..67c23a2b8847c 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -551,13 +551,11 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
             return super()._str_find(sub, start, end)
         return self._convert_int_dtype(result)
 
-    def _str_get_dummies(
-        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None
-    ):
+    def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         if dtype is None:
             dtype = np.int64
         dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(
-            sep, dummy_na, dtype
+            sep, dtype
         )
         if len(labels) == 0:
             return np.empty(shape=(0, 0), dtype=dtype), labels
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index b7630c0f1fe64..18a684278c528 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -51,7 +51,6 @@
 if TYPE_CHECKING:
     from collections.abc import (
         Hashable,
-        Iterable,
         Iterator,
     )
 
@@ -2363,9 +2362,6 @@ def wrap(
     def get_dummies(
         self,
         sep: str = "|",
-        prefix: str | Iterable[str] | dict[str, str] | None = None,
-        prefix_sep: str = "_",
-        dummy_na: bool = False,
         dtype: NpDtype | None = None,
     ):
         """
@@ -2378,15 +2374,6 @@ def get_dummies(
         ----------
         sep : str, default "|"
             String to split on.
-        prefix : str, list of str, or dict of str, default None
-            String to append DataFrame column names.
-            Pass a list with length equal to the number of columns
-            when calling get_dummies on a DataFrame. Alternatively, `prefix`
-            can be a dictionary mapping column names to prefixes.
-        prefix_sep : str, default '_'
-            If appending prefix, separator/delimiter to use.
-        dummy_na : bool, default False
-            Add a column to indicate NaNs, if False NaNs are ignored.
         dtype : dtype, default np.int64
             Data type for new columns. Only a single dtype is allowed.
 
@@ -2414,26 +2401,6 @@ def get_dummies(
         1  0  0  0
         2  1  0  1
 
-        >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dummy_na=True)
-           a  b  c  NaN
-        0  1  1  0    0
-        1  0  0  0    1
-        2  1  0  1    0
-
-        >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(prefix="prefix")
-              prefix_a  prefix_b  prefix_c
-        0          1         1         0
-        1          0         0         0
-        2          1         0         1
-
-        >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(
-        ...     prefix={"a": "alpha", "b": "beta", "c": "gamma"}
-        ... )
-              alpha_a  beta_b  gamma_c
-        0          1       1       0
-        1          0       0       0
-        2          1       0       1
-
         >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool)
                 a      b      c
         0   True   True    False
@@ -2442,29 +2409,7 @@ def get_dummies(
         """
         # we need to cast to Series of strings as only that has all
         # methods available for making the dummies...
-        result, name = self._data.array._str_get_dummies(sep, dummy_na, dtype)
-        name = [np.nan if x == "NaN" else x for x in name]
-        if isinstance(prefix, str):
-            name = [f"{prefix}{prefix_sep}{col}" for col in name]
-        elif isinstance(prefix, dict):
-            if len(prefix) != len(name):
-                len_msg = (
-                    f"Length of 'prefix' ({len(prefix)}) did not match the "
-                    "length of the columns being encoded "
-                    f"({len(name)})."
-                )
-                raise ValueError(len_msg)
-            name = [f"{prefix[col]}{prefix_sep}{col}" for col in name]
-        elif isinstance(prefix, list):
-            if len(prefix) != len(name):
-                len_msg = (
-                    f"Length of 'prefix' ({len(prefix)}) did not match the "
-                    "length of the columns being encoded "
-                    f"({len(name)})."
-                )
-                raise ValueError(len_msg)
-            name = [f"{prefix[i]}{prefix_sep}{col}" for i, col in enumerate(name)]
-
+        result, name = self._data.array._str_get_dummies(sep, dtype)
         return self._wrap_result(
             result,
             name=name,
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
index 87aa7c26f1304..4053e787c824f 100644
--- a/pandas/core/strings/base.py
+++ b/pandas/core/strings/base.py
@@ -162,9 +162,7 @@ def _str_wrap(self, width: int, **kwargs):
         pass
 
     @abc.abstractmethod
-    def _str_get_dummies(
-        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None
-    ):
+    def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         pass
 
     @abc.abstractmethod
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index c5f3768ddf4ce..c9a6518e42ca0 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -372,9 +372,7 @@ def _str_wrap(self, width: int, **kwargs):
         tw = textwrap.TextWrapper(**kwargs)
         return self._str_map(lambda s: "\n".join(tw.wrap(s)))
 
-    def _str_get_dummies(
-        self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None
-    ):
+    def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         from pandas import Series
 
         if dtype is None:
@@ -400,10 +398,6 @@ def _isin(test_elements: str, element: str) -> bool:
             dummies[:, i] = lib.map_infer(
                 arr.to_numpy(), functools.partial(_isin, element=pat)
             )
-        if dummy_na:
-            nan_col = Series(self).isna().astype(dtype).to_numpy()
-            dummies = np.column_stack((dummies, nan_col))
-            tags2.append("NaN")
         return dummies, tags2
 
     def _str_upper(self):
diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 59afc3ca7dc05..50859710f4a45 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -32,78 +32,94 @@ def test_get_dummies_index():
     tm.assert_index_equal(result, expected)
 
 
-def test_get_dummies_with_name_dummy(any_string_dtype):
-    # GH 12180
-    # Dummies named 'name' should work as expected
-    s = Series(["a", "b,name", "b"], dtype=any_string_dtype)
-    result = s.str.get_dummies(",")
-    expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"])
+def test_get_dummies_int8_dtype():
+    s = Series(["1|2", "1|3", np.nan], dtype="string")
+    result = s.str.get_dummies("|", dtype=np.int8)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("123"), dtype=np.int8
+    )
     tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == np.int8).all()
 
 
-def test_get_dummies_with_name_dummy_index():
-    # GH 12180
-    # Dummies named 'name' should work as expected
-    idx = Index(["a|b", "name|c", "b|name"])
-    result = idx.str.get_dummies("|")
+def test_get_dummies_uint8_dtype():
+    s = Series(["a|b", "a|c", np.nan], dtype="string")
+    result = s.str.get_dummies("|", dtype=np.uint8)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.uint8
+    )
+    tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == np.uint8).all()
 
-    expected = MultiIndex.from_tuples(
-        [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name")
+
+def test_get_dummies_int16_dtype():
+    s = Series(["a|b", "a|c", np.nan], dtype="string")
+    result = s.str.get_dummies("|", dtype=np.int16)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.int16
     )
-    tm.assert_index_equal(result, expected)
+    tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == np.int16).all()
 
 
-def test_get_dummies_with_prefix(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies(sep="|", prefix="prefix")
+def test_get_dummies_uint16_dtype():
+    s = Series(["a|b", "a|c", np.nan], dtype="string")
+    result = s.str.get_dummies("|", dtype=np.uint16)
     expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=["prefix_a", "prefix_b", "prefix_c"],
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.uint16
     )
     tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == np.uint16).all()
 
 
-def test_get_dummies_with_prefix_sep(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies(sep="|", prefix=None, prefix_sep="__")
-    expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=["a", "b", "c"])
+def test_get_dummies_int32_dtype():
+    s = Series(["x|y", "x|z", np.nan], dtype="string")
+    result = s.str.get_dummies("|", dtype=np.int32)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("xyz"), dtype=np.int32
+    )
     tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == np.int32).all()
+
 
-    result = s.str.get_dummies(sep="|", prefix="col", prefix_sep="__")
+def test_get_dummies_uint32_dtype():
+    s = Series(["x|y", "x|z", np.nan], dtype="string")
+    result = s.str.get_dummies("|", dtype=np.uint32)
     expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=["col__a", "col__b", "col__c"],
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("xyz"), dtype=np.uint32
     )
     tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == np.uint32).all()
 
 
-def test_get_dummies_with_dummy_na(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies(sep="|", dummy_na=True)
+def test_get_dummies_int64_dtype():
+    s = Series(["foo|bar", "foo|baz", np.nan], dtype="string")
+    result = s.str.get_dummies("|", dtype=np.int64)
     expected = DataFrame(
-        [[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 0, 1]],
-        columns=["a", "b", "c", np.nan],
+        [[1, 0, 1], [0, 1, 1], [0, 0, 0]], columns=["bar", "baz", "foo"], dtype=np.int64
     )
     tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == np.int64).all()
 
 
-def test_get_dummies_with_dtype(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies(sep="|", dtype=bool)
+def test_get_dummies_uint64_dtype():
+    s = Series(["foo|bar", "foo|baz", np.nan], dtype="string")
+    result = s.str.get_dummies("|", dtype=np.uint64)
     expected = DataFrame(
-        [[True, True, False], [True, False, True], [False, False, False]],
-        columns=["a", "b", "c"],
+        [[1, 0, 1], [0, 1, 1], [0, 0, 0]],
+        columns=["bar", "baz", "foo"],
+        dtype=np.uint64,
     )
     tm.assert_frame_equal(result, expected)
-    assert (result.dtypes == bool).all()
+    assert (result.dtypes == np.uint64).all()
 
 
-def test_get_dummies_with_prefix_dict(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    prefix = {"a": "alpha", "b": "beta", "c": "gamma"}
-    result = s.str.get_dummies(sep="|", prefix=prefix)
+def test_get_dummies_bool_dtype():
+    s = Series(["a|b", "a|c", np.nan], dtype="string")
+    result = s.str.get_dummies("|", dtype=bool)
     expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=["alpha_a", "beta_b", "gamma_c"],
+        [[True, True, False], [True, False, True], [False, False, False]],
+        columns=["a", "b", "c"],
     )
     tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == bool).all()

From 163fe09b5b5871d030e4b4552338f8cac5e06c40 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Thu, 5 Sep 2024 00:07:26 -0400
Subject: [PATCH 10/22] parametrize dtype tests

---
 pandas/tests/strings/test_get_dummies.py | 97 +++---------------------
 1 file changed, 9 insertions(+), 88 deletions(-)

diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 50859710f4a45..4f75ff05bf0b2 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 
 from pandas import (
     DataFrame,
@@ -32,94 +33,14 @@ def test_get_dummies_index():
     tm.assert_index_equal(result, expected)
 
 
-def test_get_dummies_int8_dtype():
-    s = Series(["1|2", "1|3", np.nan], dtype="string")
-    result = s.str.get_dummies("|", dtype=np.int8)
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("123"), dtype=np.int8
-    )
-    tm.assert_frame_equal(result, expected)
-    assert (result.dtypes == np.int8).all()
-
-
-def test_get_dummies_uint8_dtype():
-    s = Series(["a|b", "a|c", np.nan], dtype="string")
-    result = s.str.get_dummies("|", dtype=np.uint8)
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.uint8
-    )
-    tm.assert_frame_equal(result, expected)
-    assert (result.dtypes == np.uint8).all()
-
-
-def test_get_dummies_int16_dtype():
-    s = Series(["a|b", "a|c", np.nan], dtype="string")
-    result = s.str.get_dummies("|", dtype=np.int16)
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.int16
-    )
-    tm.assert_frame_equal(result, expected)
-    assert (result.dtypes == np.int16).all()
-
-
-def test_get_dummies_uint16_dtype():
-    s = Series(["a|b", "a|c", np.nan], dtype="string")
-    result = s.str.get_dummies("|", dtype=np.uint16)
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.uint16
-    )
-    tm.assert_frame_equal(result, expected)
-    assert (result.dtypes == np.uint16).all()
-
-
-def test_get_dummies_int32_dtype():
-    s = Series(["x|y", "x|z", np.nan], dtype="string")
-    result = s.str.get_dummies("|", dtype=np.int32)
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("xyz"), dtype=np.int32
-    )
-    tm.assert_frame_equal(result, expected)
-    assert (result.dtypes == np.int32).all()
-
-
-def test_get_dummies_uint32_dtype():
-    s = Series(["x|y", "x|z", np.nan], dtype="string")
-    result = s.str.get_dummies("|", dtype=np.uint32)
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("xyz"), dtype=np.uint32
-    )
-    tm.assert_frame_equal(result, expected)
-    assert (result.dtypes == np.uint32).all()
-
-
-def test_get_dummies_int64_dtype():
-    s = Series(["foo|bar", "foo|baz", np.nan], dtype="string")
-    result = s.str.get_dummies("|", dtype=np.int64)
-    expected = DataFrame(
-        [[1, 0, 1], [0, 1, 1], [0, 0, 0]], columns=["bar", "baz", "foo"], dtype=np.int64
-    )
-    tm.assert_frame_equal(result, expected)
-    assert (result.dtypes == np.int64).all()
-
-
-def test_get_dummies_uint64_dtype():
-    s = Series(["foo|bar", "foo|baz", np.nan], dtype="string")
-    result = s.str.get_dummies("|", dtype=np.uint64)
-    expected = DataFrame(
-        [[1, 0, 1], [0, 1, 1], [0, 0, 0]],
-        columns=["bar", "baz", "foo"],
-        dtype=np.uint64,
-    )
-    tm.assert_frame_equal(result, expected)
-    assert (result.dtypes == np.uint64).all()
-
-
-def test_get_dummies_bool_dtype():
-    s = Series(["a|b", "a|c", np.nan], dtype="string")
-    result = s.str.get_dummies("|", dtype=bool)
+@pytest.mark.parametrize(
+    "dtype",
+    [np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64, bool],
+)
+def test_get_dummies_with_dtype(any_string_dtype, dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=dtype)
     expected = DataFrame(
-        [[True, True, False], [True, False, True], [False, False, False]],
-        columns=["a", "b", "c"],
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype
     )
     tm.assert_frame_equal(result, expected)
-    assert (result.dtypes == bool).all()

From d68bece25840dc7f88d83c9287f123368d673105 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Thu, 5 Sep 2024 11:32:26 -0400
Subject: [PATCH 11/22] support pyarrow and nullable dtypes

---
 pandas/core/arrays/arrow/array.py        |  9 ++++++++-
 pandas/core/arrays/string_arrow.py       |  8 +++++++-
 pandas/core/strings/accessor.py          | 15 +++++++++++++++
 pandas/core/strings/object_array.py      |  9 ++++++++-
 pandas/tests/strings/test_get_dummies.py | 22 +++++++++++++++++++++-
 5 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 11495bb110b77..c6ae1fa3311b1 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -41,6 +41,7 @@
     is_list_like,
     is_numeric_dtype,
     is_scalar,
+    pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas.core.dtypes.missing import isna
@@ -2552,7 +2553,13 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         n_cols = len(uniques)
         indices = pc.index_in(flattened_values, uniques_sorted).to_numpy()
         indices = indices + np.arange(n_rows).repeat(lengths) * n_cols
-        dummies = np.zeros(n_rows * n_cols, dtype=dtype)
+        _dtype = pandas_dtype(dtype)
+        dummy_dtype: NpDtype
+        if isinstance(_dtype, np.dtype):
+            dummy_dtype = _dtype
+        else:
+            dummy_dtype = np.bool_
+        dummies = np.zeros(n_rows * n_cols, dtype=dummy_dtype)
         dummies[indices] = True
         dummies = dummies.reshape((n_rows, n_cols))
         result = type(self)(pa.array(list(dummies)))
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 8eb11f187fcb7..7e59523810f22 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -471,7 +471,13 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         if len(labels) == 0:
             return np.empty(shape=(0, 0), dtype=dtype), labels
         dummies = np.vstack(dummies_pa.to_numpy())
-        return dummies.astype(dtype, copy=False), labels
+        _dtype = pandas_dtype(dtype)
+        dummy_dtype: NpDtype
+        if isinstance(_dtype, np.dtype):
+            dummy_dtype = _dtype
+        else:
+            dummy_dtype = np.bool_
+        return dummies.astype(dummy_dtype, copy=False), labels
 
     def _convert_int_result(self, result):
         if self.dtype.na_value is np.nan:
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index f440cdf8ee74b..f78c6b93bee71 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -26,6 +26,7 @@
 from pandas.core.dtypes.common import (
     ensure_object,
     is_bool_dtype,
+    is_extension_array_dtype,
     is_integer,
     is_list_like,
     is_object_dtype,
@@ -2481,9 +2482,23 @@ def get_dummies(
         1   False  False   False
         2   True   False   True
         """
+        from pandas.core.frame import DataFrame
+
         # we need to cast to Series of strings as only that has all
         # methods available for making the dummies...
         result, name = self._data.array._str_get_dummies(sep, dtype)
+        if is_extension_array_dtype(dtype):
+            return self._wrap_result(
+                DataFrame(result, columns=name, dtype=dtype),
+                name=name,
+                returns_string=False,
+            )
+        if isinstance(dtype, ArrowDtype):
+            return self._wrap_result(
+                DataFrame(result, columns=name, dtype=dtype),
+                name=name,
+                returns_string=False,
+            )
         return self._wrap_result(
             result,
             name=name,
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 4ea0ff3744172..4a229390713bc 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -18,6 +18,7 @@
 import pandas._libs.ops as libops
 from pandas.util._exceptions import find_stack_level
 
+from pandas.core.dtypes.common import pandas_dtype
 from pandas.core.dtypes.missing import isna
 
 from pandas.core.strings.base import BaseStringArrayMethods
@@ -414,7 +415,13 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
             tags.update(ts)
         tags2 = sorted(tags - {""})
 
-        dummies = np.empty((len(arr), len(tags2)), dtype=dtype)
+        _dtype = pandas_dtype(dtype)
+        dummy_dtype: NpDtype
+        if isinstance(_dtype, np.dtype):
+            dummy_dtype = _dtype
+        else:
+            dummy_dtype = np.bool_
+        dummies = np.empty((len(arr), len(tags2)), dtype=dummy_dtype)
 
         def _isin(test_elements: str, element: str) -> bool:
             return element in test_elements
diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 4f75ff05bf0b2..2dae9e4ed6033 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -1,7 +1,9 @@
 import numpy as np
+import pyarrow as pa
 import pytest
 
 from pandas import (
+    ArrowDtype,
     DataFrame,
     Index,
     MultiIndex,
@@ -35,7 +37,25 @@ def test_get_dummies_index():
 
 @pytest.mark.parametrize(
     "dtype",
-    [np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64, bool],
+    [
+        np.uint8,
+        np.int16,
+        np.uint16,
+        np.int32,
+        np.uint32,
+        np.int64,
+        np.uint64,
+        bool,
+        ArrowDtype(pa.int8()),
+        ArrowDtype(pa.int16()),
+        ArrowDtype(pa.int32()),
+        ArrowDtype(pa.int64()),
+        "Int8",
+        "Int16",
+        "Int32",
+        "Int64",
+        "boolean",
+    ],
 )
 def test_get_dummies_with_dtype(any_string_dtype, dtype):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)

From 0fd24012d7fbd76bb910441c69269138c67c4f4c Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Thu, 5 Sep 2024 11:41:18 -0400
Subject: [PATCH 12/22] fix pyarrow import error

---
 pandas/tests/strings/test_get_dummies.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 2dae9e4ed6033..975e8955df529 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -1,5 +1,4 @@
 import numpy as np
-import pyarrow as pa
 import pytest
 
 from pandas import (
@@ -11,6 +10,11 @@
     _testing as tm,
 )
 
+try:
+    import pyarrow as pa
+except ImportError:
+    pa = None
+
 
 def test_get_dummies(any_string_dtype):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)

From 920c865c355d8a1bdcb2ac5990c8aac6267f0874 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Thu, 5 Sep 2024 11:49:29 -0400
Subject: [PATCH 13/22] skip pyarrow tests when not present

---
 pandas/tests/strings/test_get_dummies.py | 30 ++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 975e8955df529..6813f55e95156 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 from pandas import (
     ArrowDtype,
     DataFrame,
@@ -50,10 +52,6 @@ def test_get_dummies_index():
         np.int64,
         np.uint64,
         bool,
-        ArrowDtype(pa.int8()),
-        ArrowDtype(pa.int16()),
-        ArrowDtype(pa.int32()),
-        ArrowDtype(pa.int64()),
         "Int8",
         "Int16",
         "Int32",
@@ -68,3 +66,27 @@ def test_get_dummies_with_dtype(any_string_dtype, dtype):
         [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype
     )
     tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        ArrowDtype(pa.int8()),
+        ArrowDtype(pa.uint8()),
+        ArrowDtype(pa.int16()),
+        ArrowDtype(pa.uint16()),
+        ArrowDtype(pa.int32()),
+        ArrowDtype(pa.uint32()),
+        ArrowDtype(pa.int64()),
+        ArrowDtype(pa.uint64()),
+        ArrowDtype(pa.bool_()),
+    ],
+)
+def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=dtype)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype
+    )
+    tm.assert_frame_equal(result, expected)

From 800f787182f4a237a0c6a0a2c4a4cb7d5dde63eb Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Thu, 5 Sep 2024 12:20:12 -0400
Subject: [PATCH 14/22] split pyarrow tests

---
 pandas/tests/strings/test_get_dummies.py | 118 +++++++++++++++++++----
 1 file changed, 101 insertions(+), 17 deletions(-)

diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 6813f55e95156..76af39dfc1ea8 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -69,24 +69,108 @@ def test_get_dummies_with_dtype(any_string_dtype, dtype):
 
 
 @td.skip_if_no("pyarrow")
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        ArrowDtype(pa.int8()),
-        ArrowDtype(pa.uint8()),
-        ArrowDtype(pa.int16()),
-        ArrowDtype(pa.uint16()),
-        ArrowDtype(pa.int32()),
-        ArrowDtype(pa.uint32()),
-        ArrowDtype(pa.int64()),
-        ArrowDtype(pa.uint64()),
-        ArrowDtype(pa.bool_()),
-    ],
-)
-def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype):
+def test_get_dummies_with_pyarrow_dtype_int8(any_string_dtype):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=dtype)
+    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int8()))
     expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=list("abc"),
+        dtype=ArrowDtype(pa.int8()),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_get_dummies_with_pyarrow_dtype_uint8(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint8()))
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=list("abc"),
+        dtype=ArrowDtype(pa.uint8()),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_get_dummies_with_pyarrow_dtype_int16(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int16()))
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=list("abc"),
+        dtype=ArrowDtype(pa.int16()),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_get_dummies_with_pyarrow_dtype_uint16(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint16()))
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=list("abc"),
+        dtype=ArrowDtype(pa.uint16()),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_get_dummies_with_pyarrow_dtype_int32(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int32()))
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=list("abc"),
+        dtype=ArrowDtype(pa.int32()),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_get_dummies_with_pyarrow_dtype_uint32(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint32()))
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=list("abc"),
+        dtype=ArrowDtype(pa.uint32()),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_get_dummies_with_pyarrow_dtype_int64(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int64()))
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=list("abc"),
+        dtype=ArrowDtype(pa.int64()),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_get_dummies_with_pyarrow_dtype_uint64(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint64()))
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=list("abc"),
+        dtype=ArrowDtype(pa.uint64()),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_get_dummies_with_pyarrow_dtype_bool(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.bool_()))
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=list("abc"),
+        dtype=ArrowDtype(pa.bool_()),
     )
     tm.assert_frame_equal(result, expected)

From 6cbc3e8008228d67071c088ee2c10dc791f7a724 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Sat, 7 Sep 2024 15:02:14 -0400
Subject: [PATCH 15/22] parametrize pyarrow tests

---
 pandas/tests/strings/test_get_dummies.py | 117 ++++-------------------
 1 file changed, 17 insertions(+), 100 deletions(-)

diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 76af39dfc1ea8..2aa5d568176d4 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -4,7 +4,6 @@
 import pandas.util._test_decorators as td
 
 from pandas import (
-    ArrowDtype,
     DataFrame,
     Index,
     MultiIndex,
@@ -69,108 +68,26 @@ def test_get_dummies_with_dtype(any_string_dtype, dtype):
 
 
 @td.skip_if_no("pyarrow")
-def test_get_dummies_with_pyarrow_dtype_int8(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int8()))
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=list("abc"),
-        dtype=ArrowDtype(pa.int8()),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@td.skip_if_no("pyarrow")
-def test_get_dummies_with_pyarrow_dtype_uint8(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint8()))
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=list("abc"),
-        dtype=ArrowDtype(pa.uint8()),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@td.skip_if_no("pyarrow")
-def test_get_dummies_with_pyarrow_dtype_int16(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int16()))
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=list("abc"),
-        dtype=ArrowDtype(pa.int16()),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@td.skip_if_no("pyarrow")
-def test_get_dummies_with_pyarrow_dtype_uint16(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint16()))
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=list("abc"),
-        dtype=ArrowDtype(pa.uint16()),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@td.skip_if_no("pyarrow")
-def test_get_dummies_with_pyarrow_dtype_int32(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int32()))
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=list("abc"),
-        dtype=ArrowDtype(pa.int32()),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@td.skip_if_no("pyarrow")
-def test_get_dummies_with_pyarrow_dtype_uint32(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint32()))
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=list("abc"),
-        dtype=ArrowDtype(pa.uint32()),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@td.skip_if_no("pyarrow")
-def test_get_dummies_with_pyarrow_dtype_int64(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int64()))
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=list("abc"),
-        dtype=ArrowDtype(pa.int64()),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@td.skip_if_no("pyarrow")
-def test_get_dummies_with_pyarrow_dtype_uint64(any_string_dtype):
-    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint64()))
-    expected = DataFrame(
-        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
-        columns=list("abc"),
-        dtype=ArrowDtype(pa.uint64()),
-    )
-    tm.assert_frame_equal(result, expected)
-
-
-@td.skip_if_no("pyarrow")
-def test_get_dummies_with_pyarrow_dtype_bool(any_string_dtype):
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "int8[pyarrow]",
+        "uint8[pyarrow]",
+        "int16[pyarrow]",
+        "uint16[pyarrow]",
+        "int32[pyarrow]",
+        "uint32[pyarrow]",
+        "int64[pyarrow]",
+        "uint64[pyarrow]",
+        "bool[pyarrow]",
+    ],
+)
+def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=ArrowDtype(pa.bool_()))
+    result = s.str.get_dummies("|", dtype=dtype)
     expected = DataFrame(
         [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
         columns=list("abc"),
-        dtype=ArrowDtype(pa.bool_()),
+        dtype=dtype,
     )
     tm.assert_frame_equal(result, expected)

From 532e139a847557a2959092b1f58200a85bb25834 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Sat, 7 Sep 2024 15:03:50 -0400
Subject: [PATCH 16/22] change var name to dummies_dtype

---
 pandas/core/arrays/arrow/array.py   | 8 ++++----
 pandas/core/arrays/string_arrow.py  | 8 ++++----
 pandas/core/strings/object_array.py | 8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 93fba2fe18a76..bb7f8c7351975 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2527,12 +2527,12 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         indices = pc.index_in(flattened_values, uniques_sorted).to_numpy()
         indices = indices + np.arange(n_rows).repeat(lengths) * n_cols
         _dtype = pandas_dtype(dtype)
-        dummy_dtype: NpDtype
+        dummies_dtype: NpDtype
         if isinstance(_dtype, np.dtype):
-            dummy_dtype = _dtype
+            dummies_dtype = _dtype
         else:
-            dummy_dtype = np.bool_
-        dummies = np.zeros(n_rows * n_cols, dtype=dummy_dtype)
+            dummies_dtype = np.bool_
+        dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype)
         dummies[indices] = True
         dummies = dummies.reshape((n_rows, n_cols))
         result = type(self)(pa.array(list(dummies)))
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index e2beb0efdbee2..3ac2bbea1f3ff 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -446,12 +446,12 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
             return np.empty(shape=(0, 0), dtype=dtype), labels
         dummies = np.vstack(dummies_pa.to_numpy())
         _dtype = pandas_dtype(dtype)
-        dummy_dtype: NpDtype
+        dummies_dtype: NpDtype
         if isinstance(_dtype, np.dtype):
-            dummy_dtype = _dtype
+            dummies_dtype = _dtype
         else:
-            dummy_dtype = np.bool_
-        return dummies.astype(dummy_dtype, copy=False), labels
+            dummies_dtype = np.bool_
+        return dummies.astype(dummies_dtype, copy=False), labels
 
     def _convert_int_result(self, result):
         if self.dtype.na_value is np.nan:
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 4a229390713bc..6211c7b528db9 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -416,12 +416,12 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         tags2 = sorted(tags - {""})
 
         _dtype = pandas_dtype(dtype)
-        dummy_dtype: NpDtype
+        dummies_dtype: NpDtype
         if isinstance(_dtype, np.dtype):
-            dummy_dtype = _dtype
+            dummies_dtype = _dtype
         else:
-            dummy_dtype = np.bool_
-        dummies = np.empty((len(arr), len(tags2)), dtype=dummy_dtype)
+            dummies_dtype = np.bool_
+        dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype)
 
         def _isin(test_elements: str, element: str) -> bool:
             return element in test_elements

From cd5c2ab178af5d2ed50db8bb207f082692756e78 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Sat, 7 Sep 2024 15:57:24 -0400
Subject: [PATCH 17/22] fill str-dtype dummies with False in arrow _str_get_dummies

---
 pandas/core/arrays/arrow/array.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index bb7f8c7351975..7b690e0068ea3 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2533,6 +2533,8 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         else:
             dummies_dtype = np.bool_
         dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype)
+        if dtype == str:
+            dummies[:] = False
         dummies[indices] = True
         dummies = dummies.reshape((n_rows, n_cols))
         result = type(self)(pa.array(list(dummies)))

From 822b3f4f38d16cd683b0a9bf5b06b00331946ca6 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Sat, 7 Sep 2024 15:58:41 -0400
Subject: [PATCH 18/22] consolidate conditionals

---
 pandas/core/strings/accessor.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index f78c6b93bee71..6d10365a1b968 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -2487,13 +2487,7 @@ def get_dummies(
         # we need to cast to Series of strings as only that has all
         # methods available for making the dummies...
         result, name = self._data.array._str_get_dummies(sep, dtype)
-        if is_extension_array_dtype(dtype):
-            return self._wrap_result(
-                DataFrame(result, columns=name, dtype=dtype),
-                name=name,
-                returns_string=False,
-            )
-        if isinstance(dtype, ArrowDtype):
+        if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype):
             return self._wrap_result(
                 DataFrame(result, columns=name, dtype=dtype),
                 name=name,

From ba05a8de6dbad1b1a47f4d1f1994eb0adff641bc Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Sat, 7 Sep 2024 15:59:13 -0400
Subject: [PATCH 19/22] add tests for str and pyarrow strings

---
 pandas/tests/strings/test_get_dummies.py | 30 ++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 2aa5d568176d4..be72442d80320 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -40,6 +40,7 @@ def test_get_dummies_index():
     tm.assert_index_equal(result, expected)
 
 
+# GH#47872
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -67,6 +68,7 @@ def test_get_dummies_with_dtype(any_string_dtype, dtype):
     tm.assert_frame_equal(result, expected)
 
 
+# GH#47872
 @td.skip_if_no("pyarrow")
 @pytest.mark.parametrize(
     "dtype",
@@ -91,3 +93,31 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype):
         dtype=dtype,
     )
     tm.assert_frame_equal(result, expected)
+
+
+# GH#47872
+def test_get_dummies_with_str_dtype(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=str)
+    expected = DataFrame(
+        [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]],
+        columns=list("abc"),
+        dtype=str,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+# GH#47872
+def test_get_dummies_with_pa_str_dtype(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype="str[pyarrow]")
+    expected = DataFrame(
+        [
+            ["true", "true", "false"],
+            ["true", "false", "true"],
+            ["false", "false", "false"],
+        ],
+        columns=list("abc"),
+        dtype="str[pyarrow]",
+    )
+    tm.assert_frame_equal(result, expected)

From 37dddb895c336f601483737eafd0a0c0bb096894 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Sat, 7 Sep 2024 18:45:23 -0400
Subject: [PATCH 20/22] skip pyarrow string tests if not present

---
 pandas/tests/strings/test_get_dummies.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index be72442d80320..0656f505dc745 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -108,6 +108,7 @@ def test_get_dummies_with_str_dtype(any_string_dtype):
 
 
 # GH#47872
+@td.skip_if_no("pyarrow")
 def test_get_dummies_with_pa_str_dtype(any_string_dtype):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
     result = s.str.get_dummies("|", dtype="str[pyarrow]")

From 6fbe183c7adf374d5e15399b3c4c1b48759a6540 Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Mon, 9 Sep 2024 16:43:39 -0400
Subject: [PATCH 21/22] add info to whatsnew doc

---
 doc/source/whatsnew/v3.0.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index cd353b60d1a6e..3fb0e52b024bb 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -54,6 +54,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :func:`str.get_dummies` now accepts a  ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
 - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
 - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)

From 87a1ee8229242c6233905d1b5dc973503329f75a Mon Sep 17 00:00:00 2001
From: Aaron Chu-Carroll <aaron.chucarroll@gmail.com>
Date: Mon, 9 Sep 2024 16:45:19 -0400
Subject: [PATCH 22/22] change func to meth in doc info

---
 doc/source/whatsnew/v3.0.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 3fb0e52b024bb..8362a430f37fe 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -54,8 +54,8 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
-- :func:`str.get_dummies` now accepts a  ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
+- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
 - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
 - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)