Skip to content

ENH: Series.str.get_dummies() defers to pd.get_dummies() #59554

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
9 changes: 0 additions & 9 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2744,15 +2744,6 @@ def _str_map(
result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
return take_nd(result, codes, fill_value=na_value)

def _str_get_dummies(self, sep: str = "|"):
# sep may not be in categories. Just bail on this.
from pandas.core.arrays import NumpyExtensionArray

return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)

# ------------------------------------------------------------------------
# GroupBy Methods

def _groupby_op(
self,
*,
Expand Down
110 changes: 103 additions & 7 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2356,8 +2356,22 @@ def wrap(
)
return self._wrap_result(result)

from collections.abc import Iterable
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from pandas._typing import NpDtype

@forbid_nonstring_types(["bytes"])
def get_dummies(
    self,
    sep: str = "|",
    prefix: str | Iterable[str] | dict[str, str] | None = None,
    prefix_sep: str | None = "_",
    dummy_na: bool = False,
    sparse: bool = False,
    dtype: NpDtype | None = np.int64,
):
    """
    Return DataFrame of dummy/indicator variables for Series.

    Each string in the Series/Index is split by ``sep``; every distinct
    token becomes a column and cell ``(i, j)`` indicates whether token
    ``j`` occurs in element ``i``.

    Parameters
    ----------
    sep : str, default "|"
        String to split on.
    prefix : str, list of str, or dict of str, default None
        String to prepend to the dummy column names. A list must supply
        one prefix per encoded column (in column order); a dict maps each
        column name to its prefix.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False NaNs are ignored.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    dtype : dtype, default numpy.int64
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame or MultiIndex
        Dummy variables corresponding to values of the Series, or a
        MultiIndex when the accessor is backed by an Index.

    Raises
    ------
    ValueError
        If ``prefix`` is list-like or a dict whose length does not match
        the number of encoded columns.
    """
    from pandas import (
        MultiIndex,
        Series,
    )
    from pandas.core.reshape.encoding import get_dummies

    input_series = (
        Series(self._data) if isinstance(self._data, ABCIndex) else self._data
    )
    if isinstance(self._data.dtype, ArrowDtype):
        import pyarrow as pa

        # pyarrow-backed data always encodes to pyarrow booleans,
        # overriding any user-supplied dtype.
        dtype = ArrowDtype(pa.bool_())

    # Split every element into tokens; stack() yields one row per token,
    # keyed by a (original_position, token_position) MultiIndex.
    string_series = input_series.apply(lambda x: str(x) if not isna(x) else x)
    split_series = string_series.str.split(sep, expand=True).stack()
    # Drop "None" placeholder tokens and keep at most one missing token per
    # original row, so dummy_na produces a single NaN indicator per row.
    valid_split_series = split_series[
        (split_series.astype(str) != "None")
        & ~(
            split_series.index.get_level_values(0).duplicated(keep="first")
            & split_series.isna()
        )
    ]

    dummy_df = get_dummies(
        valid_split_series, None, None, dummy_na, None, sparse, False, dtype
    )
    # Collapse the per-token rows back to one row per original element.
    grouped_dummies = dummy_df.groupby(level=0)
    result_df = grouped_dummies.any() if dtype == bool else grouped_dummies.sum()

    if prefix is not None:
        columns = list(result_df.columns)
        if isinstance(prefix, str):
            result_df.columns = [f"{prefix}{prefix_sep}{col}" for col in columns]
        else:
            # dict or list-like: exactly one prefix per encoded column.
            if len(prefix) != len(columns):
                raise ValueError(
                    f"Length of 'prefix' ({len(prefix)}) did not match the "
                    "length of the columns being encoded "
                    f"({len(columns)})."
                )
            if isinstance(prefix, dict):
                result_df.columns = [
                    f"{prefix[col]}{prefix_sep}{col}" for col in columns
                ]
            else:
                result_df.columns = [
                    f"{pre}{prefix_sep}{col}" for pre, col in zip(prefix, columns)
                ]

    if isinstance(self._data, ABCIndex):
        return MultiIndex.from_frame(result_df)

    result_df.attrs = self._data.attrs
    if dtype is not None and not sparse:
        return result_df.astype(dtype)
    return result_df

@forbid_nonstring_types(["bytes"])
def translate(self, table):
Expand Down
4 changes: 0 additions & 4 deletions pandas/core/strings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,6 @@ def _str_translate(self, table):
def _str_wrap(self, width: int, **kwargs):
pass

@abc.abstractmethod
def _str_get_dummies(self, sep: str = "|"):
    # Abstract hook: concrete string-array backends implement this to
    # split each element on ``sep`` and return dummy/indicator data.
    pass

@abc.abstractmethod
def _str_isalnum(self):
pass
Expand Down
27 changes: 0 additions & 27 deletions pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import functools
import re
import textwrap
from typing import (
Expand Down Expand Up @@ -372,32 +371,6 @@ def _str_wrap(self, width: int, **kwargs):
tw = textwrap.TextWrapper(**kwargs)
return self._str_map(lambda s: "\n".join(tw.wrap(s)))

def _str_get_dummies(self, sep: str = "|"):
from pandas import Series

arr = Series(self).fillna("")
try:
arr = sep + arr + sep
except (TypeError, NotImplementedError):
arr = sep + arr.astype(str) + sep

tags: set[str] = set()
for ts in Series(arr, copy=False).str.split(sep):
tags.update(ts)
tags2 = sorted(tags - {""})

dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)

def _isin(test_elements: str, element: str) -> bool:
return element in test_elements

for i, t in enumerate(tags2):
pat = sep + t + sep
dummies[:, i] = lib.map_infer(
arr.to_numpy(), functools.partial(_isin, element=pat)
)
return dummies, tags2

def _str_upper(self):
    """Uppercase every element via the shared element-wise mapper."""

    def _upper(value):
        return value.upper()

    return self._str_map(_upper)

Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/strings/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,10 @@
)
)
ids, _, _ = zip(*_any_string_method) # use method name as fixture-id
missing_methods = {f for f in dir(StringMethods) if not f.startswith("_")} - set(ids)
# Names that show up in ``dir(StringMethods)`` but are not string methods:
# they leak into the class namespace via imports made at class scope in
# the accessor module, so exclude them from the coverage check below.
# NOTE(review): hoisting those imports to module top level would make this
# allow-list unnecessary — confirm.
NON_METHODS = {"TYPE_CHECKING", "Iterable"}
missing_methods = (
    {f for f in dir(StringMethods) if not f.startswith("_")} - set(ids) - NON_METHODS
)

# test that the above list captures all methods of StringMethods
assert not missing_methods
Expand Down
69 changes: 69 additions & 0 deletions pandas/tests/strings/test_get_dummies.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Index,
MultiIndex,
Series,
SparseDtype,
_testing as tm,
)

Expand Down Expand Up @@ -51,3 +52,71 @@ def test_get_dummies_with_name_dummy_index():
[(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name")
)
tm.assert_index_equal(result, expected)


def test_get_dummies_with_prefix(any_string_dtype):
    # GH#59554: a string prefix is prepended to every dummy column name.
    ser = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    expected = DataFrame(
        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
        columns=["prefix_a", "prefix_b", "prefix_c"],
    )
    result = ser.str.get_dummies(sep="|", prefix="prefix")
    tm.assert_frame_equal(result, expected)


def test_get_dummies_with_prefix_sep(any_string_dtype):
    # prefix_sep is a no-op without a prefix, and is used as the joiner
    # between prefix and token once a prefix is supplied.
    ser = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)

    no_prefix = ser.str.get_dummies(sep="|", prefix=None, prefix_sep="__")
    tm.assert_frame_equal(
        no_prefix,
        DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=["a", "b", "c"]),
    )

    with_prefix = ser.str.get_dummies(sep="|", prefix="col", prefix_sep="__")
    tm.assert_frame_equal(
        with_prefix,
        DataFrame(
            [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
            columns=["col__a", "col__b", "col__c"],
        ),
    )


def test_get_dummies_with_dummy_na(any_string_dtype):
    # dummy_na=True adds a trailing indicator column (labelled NaN) that is
    # 1 only for rows whose original value was missing.
    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    result = s.str.get_dummies(sep="|", dummy_na=True)
    expected = DataFrame(
        [[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 0, 1]],
        columns=["a", "b", "c", np.nan],
    )
    tm.assert_frame_equal(result, expected)


def test_get_dummies_with_sparse(any_string_dtype):
    # sparse=True must yield SparseDtype columns with the same 0/1 values
    # as the dense result.
    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    result = s.str.get_dummies(sep="|", sparse=True)
    expected = DataFrame(
        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
        columns=["a", "b", "c"],
        dtype="Sparse[int]",
    )
    tm.assert_frame_equal(result, expected)
    # Every column, not just some, should be sparse-backed.
    assert all(isinstance(dtype, SparseDtype) for dtype in result.dtypes)


def test_get_dummies_with_dtype(any_string_dtype):
    # Requesting dtype=bool must yield boolean indicator columns.
    ser = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    result = ser.str.get_dummies(sep="|", dtype=bool)
    expected = DataFrame(
        [[True, True, False], [True, False, True], [False, False, False]],
        columns=["a", "b", "c"],
    )
    tm.assert_frame_equal(result, expected)
    assert all(dt == bool for dt in result.dtypes)


def test_get_dummies_with_prefix_dict(any_string_dtype):
    # A dict prefix maps each encoded column name to its own prefix.
    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    prefix = {"a": "alpha", "b": "beta", "c": "gamma"}
    result = s.str.get_dummies(sep="|", prefix=prefix)
    expected = DataFrame(
        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
        columns=["alpha_a", "beta_b", "gamma_c"],
    )
    tm.assert_frame_equal(result, expected)
Loading