Skip to content

ENH: Series.str.get_dummies() defers to pd.get_dummies() #59554

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
9 changes: 0 additions & 9 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2744,15 +2744,6 @@ def _str_map(
result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
return take_nd(result, codes, fill_value=na_value)

def _str_get_dummies(self, sep: str = "|"):
# sep may not be in categories. Just bail on this.
from pandas.core.arrays import NumpyExtensionArray

return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)

# ------------------------------------------------------------------------
# GroupBy Methods

def _groupby_op(
self,
*,
Expand Down
110 changes: 103 additions & 7 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2356,8 +2356,22 @@ def wrap(
)
return self._wrap_result(result)

from collections.abc import Iterable
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from pandas._typing import NpDtype

@forbid_nonstring_types(["bytes"])
def get_dummies(
    self,
    sep: str = "|",
    prefix: str | Iterable[str] | dict[str, str] | None = None,
    prefix_sep: str | None = "_",
    dummy_na: bool = False,
    sparse: bool = False,
    dtype: NpDtype | None = np.int64,
):
    """
    Return DataFrame of dummy/indicator variables for Series.

    Each string in the Series/Index is split by ``sep``; every distinct
    token becomes a column and cell ``(i, j)`` indicates whether token
    ``j`` occurs in element ``i``.

    Parameters
    ----------
    sep : str, default "|"
        String to split on.
    prefix : str, list of str, or dict of str, default None
        String to prepend to the dummy column names. A list must supply
        one prefix per encoded column (in column order); a dict maps each
        column name to its prefix.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False NaNs are ignored.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    dtype : dtype, default numpy.int64
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame or MultiIndex
        Dummy variables corresponding to values of the Series, or a
        MultiIndex when the accessor is backed by an Index.

    Raises
    ------
    ValueError
        If ``prefix`` is list-like or a dict whose length does not match
        the number of encoded columns.
    """
    from pandas import (
        MultiIndex,
        Series,
    )
    from pandas.core.reshape.encoding import get_dummies

    input_series = (
        Series(self._data) if isinstance(self._data, ABCIndex) else self._data
    )
    if isinstance(self._data.dtype, ArrowDtype):
        import pyarrow as pa

        # pyarrow-backed data always encodes to pyarrow booleans,
        # overriding any user-supplied dtype.
        dtype = ArrowDtype(pa.bool_())

    # Split every element into tokens; stack() yields one row per token,
    # keyed by a (original_position, token_position) MultiIndex.
    string_series = input_series.apply(lambda x: str(x) if not isna(x) else x)
    split_series = string_series.str.split(sep, expand=True).stack()
    # Drop "None" placeholder tokens and keep at most one missing token per
    # original row, so dummy_na produces a single NaN indicator per row.
    valid_split_series = split_series[
        (split_series.astype(str) != "None")
        & ~(
            split_series.index.get_level_values(0).duplicated(keep="first")
            & split_series.isna()
        )
    ]

    dummy_df = get_dummies(
        valid_split_series, None, None, dummy_na, None, sparse, False, dtype
    )
    # Collapse the per-token rows back to one row per original element.
    grouped_dummies = dummy_df.groupby(level=0)
    result_df = grouped_dummies.any() if dtype == bool else grouped_dummies.sum()

    if prefix is not None:
        columns = list(result_df.columns)
        if isinstance(prefix, str):
            result_df.columns = [f"{prefix}{prefix_sep}{col}" for col in columns]
        else:
            # dict or list-like: exactly one prefix per encoded column.
            if len(prefix) != len(columns):
                raise ValueError(
                    f"Length of 'prefix' ({len(prefix)}) did not match the "
                    "length of the columns being encoded "
                    f"({len(columns)})."
                )
            if isinstance(prefix, dict):
                result_df.columns = [
                    f"{prefix[col]}{prefix_sep}{col}" for col in columns
                ]
            else:
                result_df.columns = [
                    f"{pre}{prefix_sep}{col}" for pre, col in zip(prefix, columns)
                ]

    if isinstance(self._data, ABCIndex):
        return MultiIndex.from_frame(result_df)

    result_df.attrs = self._data.attrs
    if dtype is not None and not sparse:
        return result_df.astype(dtype)
    return result_df

@forbid_nonstring_types(["bytes"])
def translate(self, table):
Expand Down
4 changes: 0 additions & 4 deletions pandas/core/strings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,6 @@ def _str_translate(self, table):
def _str_wrap(self, width: int, **kwargs):
pass

@abc.abstractmethod
def _str_get_dummies(self, sep: str = "|"):
    # Abstract hook: concrete string-array backends implement this to
    # split each element on ``sep`` and return dummy/indicator data.
    pass

@abc.abstractmethod
def _str_isalnum(self):
pass
Expand Down
27 changes: 0 additions & 27 deletions pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import functools
import re
import textwrap
from typing import (
Expand Down Expand Up @@ -372,32 +371,6 @@ def _str_wrap(self, width: int, **kwargs):
tw = textwrap.TextWrapper(**kwargs)
return self._str_map(lambda s: "\n".join(tw.wrap(s)))

def _str_get_dummies(self, sep: str = "|"):
from pandas import Series

arr = Series(self).fillna("")
try:
arr = sep + arr + sep
except (TypeError, NotImplementedError):
arr = sep + arr.astype(str) + sep

tags: set[str] = set()
for ts in Series(arr, copy=False).str.split(sep):
tags.update(ts)
tags2 = sorted(tags - {""})

dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)

def _isin(test_elements: str, element: str) -> bool:
return element in test_elements

for i, t in enumerate(tags2):
pat = sep + t + sep
dummies[:, i] = lib.map_infer(
arr.to_numpy(), functools.partial(_isin, element=pat)
)
return dummies, tags2

def _str_upper(self):
    """Uppercase every element via the shared element-wise mapper."""

    def _upper(value):
        return value.upper()

    return self._str_map(_upper)

Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/strings/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,10 @@
)
)
ids, _, _ = zip(*_any_string_method) # use method name as fixture-id
missing_methods = {f for f in dir(StringMethods) if not f.startswith("_")} - set(ids)
# Names that show up in ``dir(StringMethods)`` but are not string methods:
# they leak into the class namespace via imports made at class scope in
# the accessor module, so exclude them from the coverage check below.
# NOTE(review): hoisting those imports to module top level would make this
# allow-list unnecessary — confirm.
NON_METHODS = {"TYPE_CHECKING", "Iterable"}
missing_methods = (
    {f for f in dir(StringMethods) if not f.startswith("_")} - set(ids) - NON_METHODS
)

# test that the above list captures all methods of StringMethods
assert not missing_methods
Expand Down
69 changes: 69 additions & 0 deletions pandas/tests/strings/test_get_dummies.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Index,
MultiIndex,
Series,
SparseDtype,
_testing as tm,
)

Expand Down Expand Up @@ -51,3 +52,71 @@ def test_get_dummies_with_name_dummy_index():
[(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name")
)
tm.assert_index_equal(result, expected)


def test_get_dummies_with_prefix(any_string_dtype):
    # GH#59554: a string prefix is prepended to every dummy column name.
    ser = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    expected = DataFrame(
        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
        columns=["prefix_a", "prefix_b", "prefix_c"],
    )
    result = ser.str.get_dummies(sep="|", prefix="prefix")
    tm.assert_frame_equal(result, expected)


def test_get_dummies_with_prefix_sep(any_string_dtype):
    # prefix_sep is a no-op without a prefix, and is used as the joiner
    # between prefix and token once a prefix is supplied.
    ser = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)

    no_prefix = ser.str.get_dummies(sep="|", prefix=None, prefix_sep="__")
    tm.assert_frame_equal(
        no_prefix,
        DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=["a", "b", "c"]),
    )

    with_prefix = ser.str.get_dummies(sep="|", prefix="col", prefix_sep="__")
    tm.assert_frame_equal(
        with_prefix,
        DataFrame(
            [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
            columns=["col__a", "col__b", "col__c"],
        ),
    )


def test_get_dummies_with_dummy_na(any_string_dtype):
    # dummy_na=True adds a trailing indicator column (labelled NaN) that is
    # 1 only for rows whose original value was missing.
    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    result = s.str.get_dummies(sep="|", dummy_na=True)
    expected = DataFrame(
        [[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 0, 1]],
        columns=["a", "b", "c", np.nan],
    )
    tm.assert_frame_equal(result, expected)


def test_get_dummies_with_sparse(any_string_dtype):
    # sparse=True must yield SparseDtype columns with the same 0/1 values
    # as the dense result.
    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    result = s.str.get_dummies(sep="|", sparse=True)
    expected = DataFrame(
        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
        columns=["a", "b", "c"],
        dtype="Sparse[int]",
    )
    tm.assert_frame_equal(result, expected)
    # Every column, not just some, should be sparse-backed.
    assert all(isinstance(dtype, SparseDtype) for dtype in result.dtypes)


def test_get_dummies_with_dtype(any_string_dtype):
    # Requesting dtype=bool must yield boolean indicator columns.
    ser = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    result = ser.str.get_dummies(sep="|", dtype=bool)
    expected = DataFrame(
        [[True, True, False], [True, False, True], [False, False, False]],
        columns=["a", "b", "c"],
    )
    tm.assert_frame_equal(result, expected)
    assert all(dt == bool for dt in result.dtypes)


def test_get_dummies_with_prefix_dict(any_string_dtype):
    # A dict prefix maps each encoded column name to its own prefix.
    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    prefix = {"a": "alpha", "b": "beta", "c": "gamma"}
    result = s.str.get_dummies(sep="|", prefix=prefix)
    expected = DataFrame(
        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
        columns=["alpha_a", "beta_b", "gamma_c"],
    )
    tm.assert_frame_equal(result, expected)
Loading