Skip to content

Commit 4c3b968

Browse files
ENH: Series.str.get_dummies() raise on string type (#59786)
1 parent 1d33e4c commit 4c3b968

File tree

5 files changed, with 11 additions (+11) and 38 deletions (-38).

doc/source/whatsnew/v3.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ Other enhancements
6565
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
6666
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
6767
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
68+
- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
6869
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
69-
- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
7070
- Implemented :meth:`Series.str.isascii` and :meth:`Index.str.isascii` (:issue:`59091`)
7171
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
7272
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)

pandas/core/arrays/arrow/array.py

-2
Original file line numberDiff line numberDiff line change
@@ -2531,8 +2531,6 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
25312531
else:
25322532
dummies_dtype = np.bool_
25332533
dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype)
2534-
if dtype == str:
2535-
dummies[:] = False
25362534
dummies[indices] = True
25372535
dummies = dummies.reshape((n_rows, n_cols))
25382536
result = type(self)(pa.array(list(dummies)))

pandas/core/strings/accessor.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
is_extension_array_dtype,
3030
is_integer,
3131
is_list_like,
32+
is_numeric_dtype,
3233
is_object_dtype,
3334
is_re,
3435
)
@@ -2524,10 +2525,12 @@ def get_dummies(
25242525
"""
25252526
from pandas.core.frame import DataFrame
25262527

2528+
if dtype is not None and not (is_numeric_dtype(dtype) or is_bool_dtype(dtype)):
2529+
raise ValueError("Only numeric or boolean dtypes are supported for 'dtype'")
25272530
# we need to cast to Series of strings as only that has all
25282531
# methods available for making the dummies...
25292532
result, name = self._data.array._str_get_dummies(sep, dtype)
2530-
if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype):
2533+
if is_extension_array_dtype(dtype):
25312534
return self._wrap_result(
25322535
DataFrame(result, columns=name, dtype=dtype),
25332536
name=name,

pandas/core/strings/object_array.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
434434
dummies_dtype = _dtype
435435
else:
436436
dummies_dtype = np.bool_
437-
dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype)
437+
dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype, order="F")
438438

439439
def _isin(test_elements: str, element: str) -> bool:
440440
return element in test_elements

pandas/tests/strings/test_get_dummies.py

+5-33
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,16 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
import pandas.util._test_decorators as td
75

86
from pandas import (
9-
ArrowDtype,
107
DataFrame,
118
Index,
129
MultiIndex,
1310
Series,
1411
_testing as tm,
1512
)
1613

17-
try:
18-
import pyarrow as pa
19-
except ImportError:
20-
pa = None
21-
2214

2315
def test_get_dummies(any_string_dtype):
2416
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
@@ -99,32 +91,12 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype):
9991

10092

10193
# GH#47872
102-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
10394
def test_get_dummies_with_str_dtype(any_string_dtype):
10495
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
105-
result = s.str.get_dummies("|", dtype=str)
106-
expected = DataFrame(
107-
[["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]],
108-
columns=list("abc"),
109-
dtype=str,
110-
)
111-
tm.assert_frame_equal(result, expected)
112-
11396

114-
# GH#47872
115-
@td.skip_if_no("pyarrow")
116-
def test_get_dummies_with_pa_str_dtype(any_string_dtype):
117-
import pyarrow as pa
97+
msg = "Only numeric or boolean dtypes are supported for 'dtype'"
98+
with pytest.raises(ValueError, match=msg):
99+
s.str.get_dummies("|", dtype=str)
118100

119-
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
120-
result = s.str.get_dummies("|", dtype=ArrowDtype(pa.string()))
121-
expected = DataFrame(
122-
[
123-
["true", "true", "false"],
124-
["true", "false", "true"],
125-
["false", "false", "false"],
126-
],
127-
columns=list("abc"),
128-
dtype=ArrowDtype(pa.string()),
129-
)
130-
tm.assert_frame_equal(result, expected)
101+
with pytest.raises(ValueError, match=msg):
102+
s.str.get_dummies("|", dtype="datetime64[ns]")

0 commit comments

Comments
 (0)