diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 517ac7a4b44b9..1d8d0f6a74cb1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -65,8 +65,8 @@ Other enhancements - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) +- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) -- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5c32b05868383..e7f6b911f2fb1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2531,8 +2531,6 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): else: dummies_dtype = np.bool_ dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) - if dtype == str: - dummies[:] = False dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index d3ccd11281a77..5b35b5e393012 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -29,6 +29,7 @@ is_extension_array_dtype, is_integer, is_list_like, + is_numeric_dtype, is_object_dtype, is_re, ) @@ -2524,10 +2525,12 @@ def get_dummies( """ from pandas.core.frame import DataFrame + if dtype is not None and not (is_numeric_dtype(dtype) or is_bool_dtype(dtype)): + raise ValueError("Only numeric or boolean dtypes are supported for 'dtype'") # we need to cast to Series of strings as only that has all # methods available for making the dummies... result, name = self._data.array._str_get_dummies(sep, dtype) - if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype): + if is_extension_array_dtype(dtype): return self._wrap_result( DataFrame(result, columns=name, dtype=dtype), name=name, diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index a07ab9534f491..0adb7b51cf2b7 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -434,7 +434,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): dummies_dtype = _dtype else: dummies_dtype = np.bool_ - dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype) + dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype, order="F") def _isin(test_elements: str, element: str) -> bool: return element in test_elements diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 541b0ea150ba6..16e10c6fcdccd 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,12 +1,9 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, @@ -14,11 +11,6 @@ _testing as tm, ) -try: - import pyarrow as pa -except ImportError: - pa = None - def test_get_dummies(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) @@ -99,32 +91,12 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): # GH#47872 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_dummies_with_str_dtype(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=str) - expected = DataFrame( - [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], - columns=list("abc"), - dtype=str, - ) - tm.assert_frame_equal(result, expected) - -# GH#47872 -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pa_str_dtype(any_string_dtype): - import pyarrow as pa + msg = "Only numeric or boolean dtypes are supported for 'dtype'" + with pytest.raises(ValueError, match=msg): + s.str.get_dummies("|", dtype=str) - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=ArrowDtype(pa.string())) - expected = DataFrame( - [ - ["true", "true", "false"], - ["true", "false", "true"], - ["false", "false", "false"], - ], - columns=list("abc"), - dtype=ArrowDtype(pa.string()), - ) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match=msg): + s.str.get_dummies("|", dtype="datetime64[ns]")