From e6f9527097471a5dc82dc7afb0ff6ca841c8fcd3 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Wed, 21 Aug 2024 14:26:54 -0400 Subject: [PATCH 01/22] Add prefix, prefix_sep, dummy_na, and dtype args to StringMethods get_dummies() --- pandas/core/arrays/arrow/array.py | 23 +++++++- pandas/core/arrays/string_arrow.py | 14 +++-- pandas/core/strings/accessor.py | 71 +++++++++++++++++++++++- pandas/core/strings/base.py | 5 +- pandas/core/strings/object_array.py | 12 +++- pandas/tests/strings/test_get_dummies.py | 56 +++++++++++++++++++ 6 files changed, 170 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3d55513ab914c..a374eb69b8528 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2533,20 +2533,39 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies( + self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None + ): + if dtype is None: + dtype = np.bool_ split = pc.split_pattern(self._pa_array, sep) flattened_values = pc.list_flatten(split) + if dummy_na: + nan_mask = self._pa_array.is_null() + flattened_values = flattened_values.fill_null(pa.NA) uniques = flattened_values.unique() uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques)) + if dummy_na: + if "__nan__" not in uniques_sorted.to_pylist(): + uniques_sorted = pa.concat_arrays( + [uniques_sorted, pa.array(["__nan__"], type=uniques_sorted.type)] + ) lengths = pc.list_value_length(split).fill_null(0).to_numpy() n_rows = len(self) n_cols = len(uniques) indices = pc.index_in(flattened_values, uniques_sorted).to_numpy() indices = indices + np.arange(n_rows).repeat(lengths) * n_cols - dummies = np.zeros(n_rows * n_cols, dtype=np.bool_) + dummies = np.zeros(n_rows * n_cols, dtype=dtype) dummies[indices] = True dummies 
= dummies.reshape((n_rows, n_cols)) + if dummy_na: + nan_column = nan_mask.to_numpy().reshape(-1, 1) + dummies = np.hstack([dummies, nan_column]) result = type(self)(pa.array(list(dummies))) + if dummy_na: + uniques_sorted = pa.array( + ["NaN" if x == "__nan__" else x for x in uniques_sorted.to_pylist()] + ) return result, uniques_sorted.to_pylist() def _str_index(self, sub: str, start: int = 0, end: int | None = None) -> Self: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f2fd9d5d6610f..5fd5748bf5dc7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -550,12 +550,18 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return super()._str_find(sub, start, end) return self._convert_int_dtype(result) - def _str_get_dummies(self, sep: str = "|"): - dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) + def _str_get_dummies( + self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None + ): + if dtype is None: + dtype = np.int64 + dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies( + sep, dummy_na, dtype + ) if len(labels) == 0: - return np.empty(shape=(0, 0), dtype=np.int64), labels + return np.empty(shape=(0, 0), dtype=dtype), labels dummies = np.vstack(dummies_pa.to_numpy()) - return dummies.astype(np.int64, copy=False), labels + return dummies.astype(dtype, copy=False), labels def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7494a43caf004..a0e37ab597796 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -51,9 +51,12 @@ if TYPE_CHECKING: from collections.abc import ( Hashable, + Iterable, Iterator, ) + from pandas._typing import NpDtype + from pandas import ( DataFrame, Index, @@ -2357,7 +2360,14 @@ def wrap( return self._wrap_result(result) 
@forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep: str = "|"): + def get_dummies( + self, + sep: str = "|", + prefix: str | Iterable[str] | dict[str, str] | None = None, + prefix_sep: str = "_", + dummy_na: bool = False, + dtype: NpDtype | None = None, + ): """ Return DataFrame of dummy/indicator variables for Series. @@ -2368,6 +2378,15 @@ def get_dummies(self, sep: str = "|"): ---------- sep : str, default "|" String to split on. + prefix : str, list of str, or dict of str, default None + String to append DataFrame column names. + Pass a list with length equal to the number of columns + when calling get_dummies on a DataFrame. Alternatively, `prefix` + can be a dictionary mapping column names to prefixes. + prefix_sep : str, default '_' + If appending prefix, separator/delimiter to use. + dummy_na : bool, default False + Add a column to indicate NaNs, if False NaNs are ignored. Returns ------- @@ -2392,10 +2411,58 @@ def get_dummies(self, sep: str = "|"): 0 1 1 0 1 0 0 0 2 1 0 1 + + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dummy_na=True) + a b c NaN + 0 1 1 0 0 + 1 0 0 0 1 + 2 1 0 1 0 + + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(prefix="prefix") + prefix_a prefix_b prefix_c + 0 1 1 0 + 1 0 0 0 + 2 1 0 1 + + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies( + ... prefix={"a": "alpha", "b": "beta", "c": "gamma"} + ... ) + alpha_a beta_b gamma_c + 0 1 1 0 + 1 0 0 0 + 2 1 0 1 + + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool) + a b c + 0 True True False + 1 False False False + 2 True False True """ # we need to cast to Series of strings as only that has all # methods available for making the dummies... 
- result, name = self._data.array._str_get_dummies(sep) + result, name = self._data.array._str_get_dummies(sep, dummy_na, dtype) + name = [np.nan if x == "NaN" else x for x in name] + if isinstance(prefix, str): + name = [f"{prefix}{prefix_sep}{col}" for col in name] + elif isinstance(prefix, dict): + if len(prefix) != len(name): + len_msg = ( + f"Length of 'prefix' ({len(prefix)}) did not match the " + "length of the columns being encoded " + f"({len(name)})." + ) + raise ValueError(len_msg) + name = [f"{prefix[col]}{prefix_sep}{col}" for col in name] + elif isinstance(prefix, list): + if len(prefix) != len(name): + len_msg = ( + f"Length of 'prefix' ({len(prefix)}) did not match the " + "length of the columns being encoded " + f"({len(name)})." + ) + raise ValueError(len_msg) + name = [f"{prefix[i]}{prefix_sep}{col}" for i, col in enumerate(name)] + return self._wrap_result( result, name=name, diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index c1f94abff428a..84580ba9c5972 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -14,6 +14,7 @@ import re from pandas._typing import ( + NpDtype, Scalar, Self, ) @@ -161,7 +162,9 @@ def _str_wrap(self, width: int, **kwargs): pass @abc.abstractmethod - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies( + self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None + ): pass @abc.abstractmethod diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index bdcf55e61d2d1..05102325c2aaf 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -372,9 +372,13 @@ def _str_wrap(self, width: int, **kwargs): tw = textwrap.TextWrapper(**kwargs) return self._str_map(lambda s: "\n".join(tw.wrap(s))) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies( + self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None + ): from pandas import Series + if dtype is None: + 
dtype = np.int64 arr = Series(self).fillna("") try: arr = sep + arr + sep @@ -386,7 +390,7 @@ def _str_get_dummies(self, sep: str = "|"): tags.update(ts) tags2 = sorted(tags - {""}) - dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) + dummies = np.empty((len(arr), len(tags2)), dtype=dtype) def _isin(test_elements: str, element: str) -> bool: return element in test_elements @@ -396,6 +400,10 @@ def _isin(test_elements: str, element: str) -> bool: dummies[:, i] = lib.map_infer( arr.to_numpy(), functools.partial(_isin, element=pat) ) + if dummy_na: + nan_col = Series(self).isna().astype(dtype).to_numpy() + dummies = np.column_stack((dummies, nan_col)) + tags2.append(np.nan) return dummies, tags2 def _str_upper(self): diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 31386e4e342ae..59afc3ca7dc05 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -51,3 +51,59 @@ def test_get_dummies_with_name_dummy_index(): [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") ) tm.assert_index_equal(result, expected) + + +def test_get_dummies_with_prefix(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies(sep="|", prefix="prefix") + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=["prefix_a", "prefix_b", "prefix_c"], + ) + tm.assert_frame_equal(result, expected) + + +def test_get_dummies_with_prefix_sep(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies(sep="|", prefix=None, prefix_sep="__") + expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=["a", "b", "c"]) + tm.assert_frame_equal(result, expected) + + result = s.str.get_dummies(sep="|", prefix="col", prefix_sep="__") + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=["col__a", "col__b", "col__c"], + ) + 
tm.assert_frame_equal(result, expected) + + +def test_get_dummies_with_dummy_na(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies(sep="|", dummy_na=True) + expected = DataFrame( + [[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 0, 1]], + columns=["a", "b", "c", np.nan], + ) + tm.assert_frame_equal(result, expected) + + +def test_get_dummies_with_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies(sep="|", dtype=bool) + expected = DataFrame( + [[True, True, False], [True, False, True], [False, False, False]], + columns=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + assert (result.dtypes == bool).all() + + +def test_get_dummies_with_prefix_dict(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + prefix = {"a": "alpha", "b": "beta", "c": "gamma"} + result = s.str.get_dummies(sep="|", prefix=prefix) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=["alpha_a", "beta_b", "gamma_c"], + ) + tm.assert_frame_equal(result, expected) From dafb61d16cb46457e9caa2ec72825dc52919dd10 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Wed, 21 Aug 2024 14:43:09 -0400 Subject: [PATCH 02/22] Fix import issue --- pandas/core/arrays/string_arrow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 5fd5748bf5dc7..ab0768d988a5c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -59,6 +59,7 @@ ArrayLike, AxisInt, Dtype, + NpDtype, Scalar, Self, npt, From bb79ef2bb97ec636679a6944209743bba3e3747e Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Wed, 21 Aug 2024 16:46:42 -0400 Subject: [PATCH 03/22] Fix typing of dtype --- pandas/core/strings/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 
84580ba9c5972..87aa7c26f1304 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -163,7 +163,7 @@ def _str_wrap(self, width: int, **kwargs): @abc.abstractmethod def _str_get_dummies( - self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None + self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None ): pass From 24be84f44d18af6c7d13ed16de830852c42c00eb Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Wed, 21 Aug 2024 17:08:05 -0400 Subject: [PATCH 04/22] Fix NaN type issue --- pandas/core/strings/object_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 05102325c2aaf..c5f3768ddf4ce 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -403,7 +403,7 @@ def _isin(test_elements: str, element: str) -> bool: if dummy_na: nan_col = Series(self).isna().astype(dtype).to_numpy() dummies = np.column_stack((dummies, nan_col)) - tags2.append(np.nan) + tags2.append("NaN") return dummies, tags2 def _str_upper(self): From 09b2fad68410dcc486aa2e3ee95a9a0e420f03d1 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Wed, 21 Aug 2024 17:20:43 -0400 Subject: [PATCH 05/22] Support categorical string backend --- pandas/core/arrays/categorical.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 64e5eec43a5c1..9b4447016280c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2744,11 +2744,15 @@ def _str_map( result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) return take_nd(result, codes, fill_value=na_value) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies( + self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None + ): # sep may not be in categories. Just bail on this. 
from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + return NumpyExtensionArray(self.astype(str))._str_get_dummies( + sep, dummy_na, dtype + ) # ------------------------------------------------------------------------ # GroupBy Methods From 50ed90c571aa569dd622b36c07a1025aeb793908 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Wed, 21 Aug 2024 17:46:12 -0400 Subject: [PATCH 06/22] Fix dtype type hints --- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/string_arrow.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a374eb69b8528..c17729590c183 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2534,7 +2534,7 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self: return type(self)(pa.chunked_array(result)) def _str_get_dummies( - self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None + self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None ): if dtype is None: dtype = np.bool_ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9b4447016280c..b25db2affd451 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2745,7 +2745,7 @@ def _str_map( return take_nd(result, codes, fill_value=na_value) def _str_get_dummies( - self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None + self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None ): # sep may not be in categories. Just bail on this. 
from pandas.core.arrays import NumpyExtensionArray diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ab0768d988a5c..5df2edc6c5018 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -552,7 +552,7 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return self._convert_int_dtype(result) def _str_get_dummies( - self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype = None + self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None ): if dtype is None: dtype = np.int64 From 9e95485d1fe3d65bc9fe158db9517134b728d616 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Wed, 21 Aug 2024 18:09:30 -0400 Subject: [PATCH 07/22] Add dtype to get_dummies docstring --- pandas/core/strings/accessor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index a0e37ab597796..c25684e55e12b 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2387,6 +2387,8 @@ def get_dummies( If appending prefix, separator/delimiter to use. dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. + dtype: dtype, default int64 + Data type for new columns. Only a single dtype is allowed. Returns ------- From 9a47768b02cf4d622c04704be39cb937da13d759 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Wed, 21 Aug 2024 18:47:53 -0400 Subject: [PATCH 08/22] Fix get_dummies dtype docstring --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index c25684e55e12b..b7630c0f1fe64 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2387,7 +2387,7 @@ def get_dummies( If appending prefix, separator/delimiter to use. dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. 
- dtype: dtype, default int64 + dtype : dtype, default np.int64 Data type for new columns. Only a single dtype is allowed. Returns From 9702bf7e80d3f4deeef1ab866edfdaeb254d0fd7 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Tue, 3 Sep 2024 15:52:45 -0400 Subject: [PATCH 09/22] remove changes for unnecessary args --- pandas/core/arrays/arrow/array.py | 19 +---- pandas/core/arrays/categorical.py | 8 +- pandas/core/arrays/string_arrow.py | 6 +- pandas/core/strings/accessor.py | 57 +------------ pandas/core/strings/base.py | 4 +- pandas/core/strings/object_array.py | 8 +- pandas/tests/strings/test_get_dummies.py | 102 +++++++++++++---------- 7 files changed, 67 insertions(+), 137 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c17729590c183..55b5ee319e5f6 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2533,23 +2533,13 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_get_dummies( - self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None - ): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): if dtype is None: dtype = np.bool_ split = pc.split_pattern(self._pa_array, sep) flattened_values = pc.list_flatten(split) - if dummy_na: - nan_mask = self._pa_array.is_null() - flattened_values = flattened_values.fill_null(pa.NA) uniques = flattened_values.unique() uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques)) - if dummy_na: - if "__nan__" not in uniques_sorted.to_pylist(): - uniques_sorted = pa.concat_arrays( - [uniques_sorted, pa.array(["__nan__"], type=uniques_sorted.type)] - ) lengths = pc.list_value_length(split).fill_null(0).to_numpy() n_rows = len(self) n_cols = len(uniques) @@ -2558,14 +2548,7 @@ def _str_get_dummies( dummies = np.zeros(n_rows * n_cols, dtype=dtype) dummies[indices] = True 
dummies = dummies.reshape((n_rows, n_cols)) - if dummy_na: - nan_column = nan_mask.to_numpy().reshape(-1, 1) - dummies = np.hstack([dummies, nan_column]) result = type(self)(pa.array(list(dummies))) - if dummy_na: - uniques_sorted = pa.array( - ["NaN" if x == "__nan__" else x for x in uniques_sorted.to_pylist()] - ) return result, uniques_sorted.to_pylist() def _str_index(self, sub: str, start: int = 0, end: int | None = None) -> Self: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b25db2affd451..c609eab256205 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2744,15 +2744,11 @@ def _str_map( result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) return take_nd(result, codes, fill_value=na_value) - def _str_get_dummies( - self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None - ): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): # sep may not be in categories. Just bail on this. 
from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies( - sep, dummy_na, dtype - ) + return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep, dtype) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 5df2edc6c5018..67c23a2b8847c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -551,13 +551,11 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return super()._str_find(sub, start, end) return self._convert_int_dtype(result) - def _str_get_dummies( - self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None - ): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): if dtype is None: dtype = np.int64 dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies( - sep, dummy_na, dtype + sep, dtype ) if len(labels) == 0: return np.empty(shape=(0, 0), dtype=dtype), labels diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index b7630c0f1fe64..18a684278c528 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -51,7 +51,6 @@ if TYPE_CHECKING: from collections.abc import ( Hashable, - Iterable, Iterator, ) @@ -2363,9 +2362,6 @@ def wrap( def get_dummies( self, sep: str = "|", - prefix: str | Iterable[str] | dict[str, str] | None = None, - prefix_sep: str = "_", - dummy_na: bool = False, dtype: NpDtype | None = None, ): """ @@ -2378,15 +2374,6 @@ def get_dummies( ---------- sep : str, default "|" String to split on. - prefix : str, list of str, or dict of str, default None - String to append DataFrame column names. - Pass a list with length equal to the number of columns - when calling get_dummies on a DataFrame. Alternatively, `prefix` - can be a dictionary mapping column names to prefixes. 
- prefix_sep : str, default '_' - If appending prefix, separator/delimiter to use. - dummy_na : bool, default False - Add a column to indicate NaNs, if False NaNs are ignored. dtype : dtype, default np.int64 Data type for new columns. Only a single dtype is allowed. @@ -2414,26 +2401,6 @@ def get_dummies( 1 0 0 0 2 1 0 1 - >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dummy_na=True) - a b c NaN - 0 1 1 0 0 - 1 0 0 0 1 - 2 1 0 1 0 - - >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(prefix="prefix") - prefix_a prefix_b prefix_c - 0 1 1 0 - 1 0 0 0 - 2 1 0 1 - - >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies( - ... prefix={"a": "alpha", "b": "beta", "c": "gamma"} - ... ) - alpha_a beta_b gamma_c - 0 1 1 0 - 1 0 0 0 - 2 1 0 1 - >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool) a b c 0 True True False @@ -2442,29 +2409,7 @@ def get_dummies( """ # we need to cast to Series of strings as only that has all # methods available for making the dummies... - result, name = self._data.array._str_get_dummies(sep, dummy_na, dtype) - name = [np.nan if x == "NaN" else x for x in name] - if isinstance(prefix, str): - name = [f"{prefix}{prefix_sep}{col}" for col in name] - elif isinstance(prefix, dict): - if len(prefix) != len(name): - len_msg = ( - f"Length of 'prefix' ({len(prefix)}) did not match the " - "length of the columns being encoded " - f"({len(name)})." - ) - raise ValueError(len_msg) - name = [f"{prefix[col]}{prefix_sep}{col}" for col in name] - elif isinstance(prefix, list): - if len(prefix) != len(name): - len_msg = ( - f"Length of 'prefix' ({len(prefix)}) did not match the " - "length of the columns being encoded " - f"({len(name)})." 
- ) - raise ValueError(len_msg) - name = [f"{prefix[i]}{prefix_sep}{col}" for i, col in enumerate(name)] - + result, name = self._data.array._str_get_dummies(sep, dtype) return self._wrap_result( result, name=name, diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 87aa7c26f1304..4053e787c824f 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -162,9 +162,7 @@ def _str_wrap(self, width: int, **kwargs): pass @abc.abstractmethod - def _str_get_dummies( - self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None - ): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): pass @abc.abstractmethod diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index c5f3768ddf4ce..c9a6518e42ca0 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -372,9 +372,7 @@ def _str_wrap(self, width: int, **kwargs): tw = textwrap.TextWrapper(**kwargs) return self._str_map(lambda s: "\n".join(tw.wrap(s))) - def _str_get_dummies( - self, sep: str = "|", dummy_na: bool = False, dtype: NpDtype | None = None - ): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): from pandas import Series if dtype is None: @@ -400,10 +398,6 @@ def _isin(test_elements: str, element: str) -> bool: dummies[:, i] = lib.map_infer( arr.to_numpy(), functools.partial(_isin, element=pat) ) - if dummy_na: - nan_col = Series(self).isna().astype(dtype).to_numpy() - dummies = np.column_stack((dummies, nan_col)) - tags2.append("NaN") return dummies, tags2 def _str_upper(self): diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 59afc3ca7dc05..50859710f4a45 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -32,78 +32,94 @@ def test_get_dummies_index(): tm.assert_index_equal(result, expected) -def 
test_get_dummies_with_name_dummy(any_string_dtype): - # GH 12180 - # Dummies named 'name' should work as expected - s = Series(["a", "b,name", "b"], dtype=any_string_dtype) - result = s.str.get_dummies(",") - expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"]) +def test_get_dummies_int8_dtype(): + s = Series(["1|2", "1|3", np.nan], dtype="string") + result = s.str.get_dummies("|", dtype=np.int8) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("123"), dtype=np.int8 + ) tm.assert_frame_equal(result, expected) + assert (result.dtypes == np.int8).all() -def test_get_dummies_with_name_dummy_index(): - # GH 12180 - # Dummies named 'name' should work as expected - idx = Index(["a|b", "name|c", "b|name"]) - result = idx.str.get_dummies("|") +def test_get_dummies_uint8_dtype(): + s = Series(["a|b", "a|c", np.nan], dtype="string") + result = s.str.get_dummies("|", dtype=np.uint8) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.uint8 + ) + tm.assert_frame_equal(result, expected) + assert (result.dtypes == np.uint8).all() - expected = MultiIndex.from_tuples( - [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") + +def test_get_dummies_int16_dtype(): + s = Series(["a|b", "a|c", np.nan], dtype="string") + result = s.str.get_dummies("|", dtype=np.int16) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.int16 ) - tm.assert_index_equal(result, expected) + tm.assert_frame_equal(result, expected) + assert (result.dtypes == np.int16).all() -def test_get_dummies_with_prefix(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies(sep="|", prefix="prefix") +def test_get_dummies_uint16_dtype(): + s = Series(["a|b", "a|c", np.nan], dtype="string") + result = s.str.get_dummies("|", dtype=np.uint16) expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - 
columns=["prefix_a", "prefix_b", "prefix_c"], + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.uint16 ) tm.assert_frame_equal(result, expected) + assert (result.dtypes == np.uint16).all() -def test_get_dummies_with_prefix_sep(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies(sep="|", prefix=None, prefix_sep="__") - expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=["a", "b", "c"]) +def test_get_dummies_int32_dtype(): + s = Series(["x|y", "x|z", np.nan], dtype="string") + result = s.str.get_dummies("|", dtype=np.int32) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("xyz"), dtype=np.int32 + ) tm.assert_frame_equal(result, expected) + assert (result.dtypes == np.int32).all() + - result = s.str.get_dummies(sep="|", prefix="col", prefix_sep="__") +def test_get_dummies_uint32_dtype(): + s = Series(["x|y", "x|z", np.nan], dtype="string") + result = s.str.get_dummies("|", dtype=np.uint32) expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=["col__a", "col__b", "col__c"], + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("xyz"), dtype=np.uint32 ) tm.assert_frame_equal(result, expected) + assert (result.dtypes == np.uint32).all() -def test_get_dummies_with_dummy_na(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies(sep="|", dummy_na=True) +def test_get_dummies_int64_dtype(): + s = Series(["foo|bar", "foo|baz", np.nan], dtype="string") + result = s.str.get_dummies("|", dtype=np.int64) expected = DataFrame( - [[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 0, 1]], - columns=["a", "b", "c", np.nan], + [[1, 0, 1], [0, 1, 1], [0, 0, 0]], columns=["bar", "baz", "foo"], dtype=np.int64 ) tm.assert_frame_equal(result, expected) + assert (result.dtypes == np.int64).all() -def test_get_dummies_with_dtype(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = 
s.str.get_dummies(sep="|", dtype=bool) +def test_get_dummies_uint64_dtype(): + s = Series(["foo|bar", "foo|baz", np.nan], dtype="string") + result = s.str.get_dummies("|", dtype=np.uint64) expected = DataFrame( - [[True, True, False], [True, False, True], [False, False, False]], - columns=["a", "b", "c"], + [[1, 0, 1], [0, 1, 1], [0, 0, 0]], + columns=["bar", "baz", "foo"], + dtype=np.uint64, ) tm.assert_frame_equal(result, expected) - assert (result.dtypes == bool).all() + assert (result.dtypes == np.uint64).all() -def test_get_dummies_with_prefix_dict(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - prefix = {"a": "alpha", "b": "beta", "c": "gamma"} - result = s.str.get_dummies(sep="|", prefix=prefix) +def test_get_dummies_bool_dtype(): + s = Series(["a|b", "a|c", np.nan], dtype="string") + result = s.str.get_dummies("|", dtype=bool) expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=["alpha_a", "beta_b", "gamma_c"], + [[True, True, False], [True, False, True], [False, False, False]], + columns=["a", "b", "c"], ) tm.assert_frame_equal(result, expected) + assert (result.dtypes == bool).all() From 163fe09b5b5871d030e4b4552338f8cac5e06c40 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Thu, 5 Sep 2024 00:07:26 -0400 Subject: [PATCH 10/22] parametrize dtype tests --- pandas/tests/strings/test_get_dummies.py | 97 +++--------------------- 1 file changed, 9 insertions(+), 88 deletions(-) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 50859710f4a45..4f75ff05bf0b2 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( DataFrame, @@ -32,94 +33,14 @@ def test_get_dummies_index(): tm.assert_index_equal(result, expected) -def test_get_dummies_int8_dtype(): - s = Series(["1|2", "1|3", np.nan], dtype="string") - result = s.str.get_dummies("|", 
dtype=np.int8) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("123"), dtype=np.int8 - ) - tm.assert_frame_equal(result, expected) - assert (result.dtypes == np.int8).all() - - -def test_get_dummies_uint8_dtype(): - s = Series(["a|b", "a|c", np.nan], dtype="string") - result = s.str.get_dummies("|", dtype=np.uint8) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.uint8 - ) - tm.assert_frame_equal(result, expected) - assert (result.dtypes == np.uint8).all() - - -def test_get_dummies_int16_dtype(): - s = Series(["a|b", "a|c", np.nan], dtype="string") - result = s.str.get_dummies("|", dtype=np.int16) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.int16 - ) - tm.assert_frame_equal(result, expected) - assert (result.dtypes == np.int16).all() - - -def test_get_dummies_uint16_dtype(): - s = Series(["a|b", "a|c", np.nan], dtype="string") - result = s.str.get_dummies("|", dtype=np.uint16) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=np.uint16 - ) - tm.assert_frame_equal(result, expected) - assert (result.dtypes == np.uint16).all() - - -def test_get_dummies_int32_dtype(): - s = Series(["x|y", "x|z", np.nan], dtype="string") - result = s.str.get_dummies("|", dtype=np.int32) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("xyz"), dtype=np.int32 - ) - tm.assert_frame_equal(result, expected) - assert (result.dtypes == np.int32).all() - - -def test_get_dummies_uint32_dtype(): - s = Series(["x|y", "x|z", np.nan], dtype="string") - result = s.str.get_dummies("|", dtype=np.uint32) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("xyz"), dtype=np.uint32 - ) - tm.assert_frame_equal(result, expected) - assert (result.dtypes == np.uint32).all() - - -def test_get_dummies_int64_dtype(): - s = Series(["foo|bar", "foo|baz", np.nan], dtype="string") - result = s.str.get_dummies("|", 
dtype=np.int64) - expected = DataFrame( - [[1, 0, 1], [0, 1, 1], [0, 0, 0]], columns=["bar", "baz", "foo"], dtype=np.int64 - ) - tm.assert_frame_equal(result, expected) - assert (result.dtypes == np.int64).all() - - -def test_get_dummies_uint64_dtype(): - s = Series(["foo|bar", "foo|baz", np.nan], dtype="string") - result = s.str.get_dummies("|", dtype=np.uint64) - expected = DataFrame( - [[1, 0, 1], [0, 1, 1], [0, 0, 0]], - columns=["bar", "baz", "foo"], - dtype=np.uint64, - ) - tm.assert_frame_equal(result, expected) - assert (result.dtypes == np.uint64).all() - - -def test_get_dummies_bool_dtype(): - s = Series(["a|b", "a|c", np.nan], dtype="string") - result = s.str.get_dummies("|", dtype=bool) +@pytest.mark.parametrize( + "dtype", + [np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64, bool], +) +def test_get_dummies_with_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) expected = DataFrame( - [[True, True, False], [True, False, True], [False, False, False]], - columns=["a", "b", "c"], + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype ) tm.assert_frame_equal(result, expected) - assert (result.dtypes == bool).all() From d68bece25840dc7f88d83c9287f123368d673105 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Thu, 5 Sep 2024 11:32:26 -0400 Subject: [PATCH 11/22] support pyarrow and nullable dtypes --- pandas/core/arrays/arrow/array.py | 9 ++++++++- pandas/core/arrays/string_arrow.py | 8 +++++++- pandas/core/strings/accessor.py | 15 +++++++++++++++ pandas/core/strings/object_array.py | 9 ++++++++- pandas/tests/strings/test_get_dummies.py | 22 +++++++++++++++++++++- 5 files changed, 59 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 11495bb110b77..c6ae1fa3311b1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 
+41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -2552,7 +2553,13 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): n_cols = len(uniques) indices = pc.index_in(flattened_values, uniques_sorted).to_numpy() indices = indices + np.arange(n_rows).repeat(lengths) * n_cols - dummies = np.zeros(n_rows * n_cols, dtype=dtype) + _dtype = pandas_dtype(dtype) + dummy_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummy_dtype = _dtype + else: + dummy_dtype = np.bool_ + dummies = np.zeros(n_rows * n_cols, dtype=dummy_dtype) dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8eb11f187fcb7..7e59523810f22 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -471,7 +471,13 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): if len(labels) == 0: return np.empty(shape=(0, 0), dtype=dtype), labels dummies = np.vstack(dummies_pa.to_numpy()) - return dummies.astype(dtype, copy=False), labels + _dtype = pandas_dtype(dtype) + dummy_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummy_dtype = _dtype + else: + dummy_dtype = np.bool_ + return dummies.astype(dummy_dtype, copy=False), labels def _convert_int_result(self, result): if self.dtype.na_value is np.nan: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index f440cdf8ee74b..f78c6b93bee71 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, + is_extension_array_dtype, is_integer, is_list_like, is_object_dtype, @@ -2481,9 +2482,23 @@ def get_dummies( 1 False False False 2 True False True """ + from pandas.core.frame import 
DataFrame + # we need to cast to Series of strings as only that has all # methods available for making the dummies... result, name = self._data.array._str_get_dummies(sep, dtype) + if is_extension_array_dtype(dtype): + return self._wrap_result( + DataFrame(result, columns=name, dtype=dtype), + name=name, + returns_string=False, + ) + if isinstance(dtype, ArrowDtype): + return self._wrap_result( + DataFrame(result, columns=name, dtype=dtype), + name=name, + returns_string=False, + ) return self._wrap_result( result, name=name, diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 4ea0ff3744172..4a229390713bc 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -18,6 +18,7 @@ import pandas._libs.ops as libops from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.missing import isna from pandas.core.strings.base import BaseStringArrayMethods @@ -414,7 +415,13 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): tags.update(ts) tags2 = sorted(tags - {""}) - dummies = np.empty((len(arr), len(tags2)), dtype=dtype) + _dtype = pandas_dtype(dtype) + dummy_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummy_dtype = _dtype + else: + dummy_dtype = np.bool_ + dummies = np.empty((len(arr), len(tags2)), dtype=dummy_dtype) def _isin(test_elements: str, element: str) -> bool: return element in test_elements diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 4f75ff05bf0b2..2dae9e4ed6033 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,7 +1,9 @@ import numpy as np +import pyarrow as pa import pytest from pandas import ( + ArrowDtype, DataFrame, Index, MultiIndex, @@ -35,7 +37,25 @@ def test_get_dummies_index(): @pytest.mark.parametrize( "dtype", - [np.uint8, np.int16, np.uint16, np.int32, 
np.uint32, np.int64, np.uint64, bool], + [ + np.uint8, + np.int16, + np.uint16, + np.int32, + np.uint32, + np.int64, + np.uint64, + bool, + ArrowDtype(pa.int8()), + ArrowDtype(pa.int16()), + ArrowDtype(pa.int32()), + ArrowDtype(pa.int64()), + "Int8", + "Int16", + "Int32", + "Int64", + "boolean", + ], ) def test_get_dummies_with_dtype(any_string_dtype, dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) From 0fd24012d7fbd76bb910441c69269138c67c4f4c Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Thu, 5 Sep 2024 11:41:18 -0400 Subject: [PATCH 12/22] fix pyarrow import error --- pandas/tests/strings/test_get_dummies.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 2dae9e4ed6033..975e8955df529 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,5 +1,4 @@ import numpy as np -import pyarrow as pa import pytest from pandas import ( @@ -11,6 +10,11 @@ _testing as tm, ) +try: + import pyarrow as pa +except ImportError: + pa = None + def test_get_dummies(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) From 920c865c355d8a1bdcb2ac5990c8aac6267f0874 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Thu, 5 Sep 2024 11:49:29 -0400 Subject: [PATCH 13/22] skip pyarrow tests when not present --- pandas/tests/strings/test_get_dummies.py | 30 ++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 975e8955df529..6813f55e95156 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( ArrowDtype, DataFrame, @@ -50,10 +52,6 @@ def test_get_dummies_index(): np.int64, np.uint64, bool, - 
ArrowDtype(pa.int8()), - ArrowDtype(pa.int16()), - ArrowDtype(pa.int32()), - ArrowDtype(pa.int64()), "Int8", "Int16", "Int32", @@ -68,3 +66,27 @@ def test_get_dummies_with_dtype(any_string_dtype, dtype): [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype ) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize( + "dtype", + [ + ArrowDtype(pa.int8()), + ArrowDtype(pa.uint8()), + ArrowDtype(pa.int16()), + ArrowDtype(pa.uint16()), + ArrowDtype(pa.int32()), + ArrowDtype(pa.uint32()), + ArrowDtype(pa.int64()), + ArrowDtype(pa.uint64()), + ArrowDtype(pa.bool_()), + ], +) +def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype + ) + tm.assert_frame_equal(result, expected) From 800f787182f4a237a0c6a0a2c4a4cb7d5dde63eb Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Thu, 5 Sep 2024 12:20:12 -0400 Subject: [PATCH 14/22] split pyarrow tests --- pandas/tests/strings/test_get_dummies.py | 118 +++++++++++++++++++---- 1 file changed, 101 insertions(+), 17 deletions(-) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 6813f55e95156..76af39dfc1ea8 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -69,24 +69,108 @@ def test_get_dummies_with_dtype(any_string_dtype, dtype): @td.skip_if_no("pyarrow") -@pytest.mark.parametrize( - "dtype", - [ - ArrowDtype(pa.int8()), - ArrowDtype(pa.uint8()), - ArrowDtype(pa.int16()), - ArrowDtype(pa.uint16()), - ArrowDtype(pa.int32()), - ArrowDtype(pa.uint32()), - ArrowDtype(pa.int64()), - ArrowDtype(pa.uint64()), - ArrowDtype(pa.bool_()), - ], -) -def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): +def 
test_get_dummies_with_pyarrow_dtype_int8(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=dtype) + result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int8())) expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=ArrowDtype(pa.int8()), + ) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pyarrow_dtype_uint8(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint8())) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=ArrowDtype(pa.uint8()), + ) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pyarrow_dtype_int16(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int16())) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=ArrowDtype(pa.int16()), + ) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pyarrow_dtype_uint16(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint16())) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=ArrowDtype(pa.uint16()), + ) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pyarrow_dtype_int32(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int32())) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=ArrowDtype(pa.int32()), + ) + tm.assert_frame_equal(result, expected) + 
+ +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pyarrow_dtype_uint32(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint32())) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=ArrowDtype(pa.uint32()), + ) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pyarrow_dtype_int64(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int64())) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=ArrowDtype(pa.int64()), + ) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pyarrow_dtype_uint64(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint64())) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=ArrowDtype(pa.uint64()), + ) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pyarrow_dtype_bool(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=ArrowDtype(pa.bool_())) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=ArrowDtype(pa.bool_()), ) tm.assert_frame_equal(result, expected) From 6cbc3e8008228d67071c088ee2c10dc791f7a724 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Sat, 7 Sep 2024 15:02:14 -0400 Subject: [PATCH 15/22] parametrize pyarrow tests --- pandas/tests/strings/test_get_dummies.py | 117 ++++------------------- 1 file changed, 17 insertions(+), 100 deletions(-) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 76af39dfc1ea8..2aa5d568176d4 100644 --- 
a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -4,7 +4,6 @@ import pandas.util._test_decorators as td from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, @@ -69,108 +68,26 @@ def test_get_dummies_with_dtype(any_string_dtype, dtype): @td.skip_if_no("pyarrow") -def test_get_dummies_with_pyarrow_dtype_int8(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int8())) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=list("abc"), - dtype=ArrowDtype(pa.int8()), - ) - tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pyarrow_dtype_uint8(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint8())) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=list("abc"), - dtype=ArrowDtype(pa.uint8()), - ) - tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pyarrow_dtype_int16(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int16())) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=list("abc"), - dtype=ArrowDtype(pa.int16()), - ) - tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pyarrow_dtype_uint16(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint16())) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=list("abc"), - dtype=ArrowDtype(pa.uint16()), - ) - tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pyarrow_dtype_int32(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = 
s.str.get_dummies("|", dtype=ArrowDtype(pa.int32())) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=list("abc"), - dtype=ArrowDtype(pa.int32()), - ) - tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pyarrow_dtype_uint32(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint32())) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=list("abc"), - dtype=ArrowDtype(pa.uint32()), - ) - tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pyarrow_dtype_int64(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=ArrowDtype(pa.int64())) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=list("abc"), - dtype=ArrowDtype(pa.int64()), - ) - tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pyarrow_dtype_uint64(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=ArrowDtype(pa.uint64())) - expected = DataFrame( - [[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=list("abc"), - dtype=ArrowDtype(pa.uint64()), - ) - tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pyarrow_dtype_bool(any_string_dtype): +@pytest.mark.parametrize( + "dtype", + [ + "int8[pyarrow]", + "uint8[pyarrow]", + "int16[pyarrow]", + "uint16[pyarrow]", + "int32[pyarrow]", + "uint32[pyarrow]", + "int64[pyarrow]", + "uint64[pyarrow]", + "bool[pyarrow]", + ], +) +def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=ArrowDtype(pa.bool_())) + result = s.str.get_dummies("|", dtype=dtype) expected = DataFrame( [[1, 1, 0], [1, 0, 1], [0, 0, 0]], 
columns=list("abc"), - dtype=ArrowDtype(pa.bool_()), + dtype=dtype, ) tm.assert_frame_equal(result, expected) From 532e139a847557a2959092b1f58200a85bb25834 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Sat, 7 Sep 2024 15:03:50 -0400 Subject: [PATCH 16/22] change var name to dummies_dtype --- pandas/core/arrays/arrow/array.py | 8 ++++---- pandas/core/arrays/string_arrow.py | 8 ++++---- pandas/core/strings/object_array.py | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 93fba2fe18a76..bb7f8c7351975 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2527,12 +2527,12 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): indices = pc.index_in(flattened_values, uniques_sorted).to_numpy() indices = indices + np.arange(n_rows).repeat(lengths) * n_cols _dtype = pandas_dtype(dtype) - dummy_dtype: NpDtype + dummies_dtype: NpDtype if isinstance(_dtype, np.dtype): - dummy_dtype = _dtype + dummies_dtype = _dtype else: - dummy_dtype = np.bool_ - dummies = np.zeros(n_rows * n_cols, dtype=dummy_dtype) + dummies_dtype = np.bool_ + dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e2beb0efdbee2..3ac2bbea1f3ff 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -446,12 +446,12 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): return np.empty(shape=(0, 0), dtype=dtype), labels dummies = np.vstack(dummies_pa.to_numpy()) _dtype = pandas_dtype(dtype) - dummy_dtype: NpDtype + dummies_dtype: NpDtype if isinstance(_dtype, np.dtype): - dummy_dtype = _dtype + dummies_dtype = _dtype else: - dummy_dtype = np.bool_ - return dummies.astype(dummy_dtype, 
copy=False), labels + dummies_dtype = np.bool_ + return dummies.astype(dummies_dtype, copy=False), labels def _convert_int_result(self, result): if self.dtype.na_value is np.nan: diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 4a229390713bc..6211c7b528db9 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -416,12 +416,12 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): tags2 = sorted(tags - {""}) _dtype = pandas_dtype(dtype) - dummy_dtype: NpDtype + dummies_dtype: NpDtype if isinstance(_dtype, np.dtype): - dummy_dtype = _dtype + dummies_dtype = _dtype else: - dummy_dtype = np.bool_ - dummies = np.empty((len(arr), len(tags2)), dtype=dummy_dtype) + dummies_dtype = np.bool_ + dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype) def _isin(test_elements: str, element: str) -> bool: return element in test_elements From cd5c2ab178af5d2ed50db8bb207f082692756e78 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Sat, 7 Sep 2024 15:57:24 -0400 Subject: [PATCH 17/22] fix string issue --- pandas/core/arrays/arrow/array.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index bb7f8c7351975..7b690e0068ea3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2533,6 +2533,8 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): else: dummies_dtype = np.bool_ dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) + if dtype == str: + dummies[:] = False dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) From 822b3f4f38d16cd683b0a9bf5b06b00331946ca6 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Sat, 7 Sep 2024 15:58:41 -0400 Subject: [PATCH 18/22] consolidate conditionals --- pandas/core/strings/accessor.py | 8 +------- 1 file changed, 1 insertion(+), 
7 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index f78c6b93bee71..6d10365a1b968 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2487,13 +2487,7 @@ def get_dummies( # we need to cast to Series of strings as only that has all # methods available for making the dummies... result, name = self._data.array._str_get_dummies(sep, dtype) - if is_extension_array_dtype(dtype): - return self._wrap_result( - DataFrame(result, columns=name, dtype=dtype), - name=name, - returns_string=False, - ) - if isinstance(dtype, ArrowDtype): + if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype): return self._wrap_result( DataFrame(result, columns=name, dtype=dtype), name=name, From ba05a8de6dbad1b1a47f4d1f1994eb0adff641bc Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Sat, 7 Sep 2024 15:59:13 -0400 Subject: [PATCH 19/22] add tests for str and pyarrow strings --- pandas/tests/strings/test_get_dummies.py | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 2aa5d568176d4..be72442d80320 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -40,6 +40,7 @@ def test_get_dummies_index(): tm.assert_index_equal(result, expected) +# GH#47872 @pytest.mark.parametrize( "dtype", [ @@ -67,6 +68,7 @@ def test_get_dummies_with_dtype(any_string_dtype, dtype): tm.assert_frame_equal(result, expected) +# GH#47872 @td.skip_if_no("pyarrow") @pytest.mark.parametrize( "dtype", @@ -91,3 +93,31 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): dtype=dtype, ) tm.assert_frame_equal(result, expected) + + +# GH#47872 +def test_get_dummies_with_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=str) + expected = DataFrame( + [["T", "T", "F"], ["T", "F", 
"T"], ["F", "F", "F"]], + columns=list("abc"), + dtype=str, + ) + tm.assert_frame_equal(result, expected) + + +# GH#47872 +def test_get_dummies_with_pa_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype="str[pyarrow]") + expected = DataFrame( + [ + ["true", "true", "false"], + ["true", "false", "true"], + ["false", "false", "false"], + ], + columns=list("abc"), + dtype="str[pyarrow]", + ) + tm.assert_frame_equal(result, expected) From 37dddb895c336f601483737eafd0a0c0bb096894 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Sat, 7 Sep 2024 18:45:23 -0400 Subject: [PATCH 20/22] skip pyarrow string tests if not present --- pandas/tests/strings/test_get_dummies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index be72442d80320..0656f505dc745 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -108,6 +108,7 @@ def test_get_dummies_with_str_dtype(any_string_dtype): # GH#47872 +@td.skip_if_no("pyarrow") def test_get_dummies_with_pa_str_dtype(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) result = s.str.get_dummies("|", dtype="str[pyarrow]") From 6fbe183c7adf374d5e15399b3c4c1b48759a6540 Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Mon, 9 Sep 2024 16:43:39 -0400 Subject: [PATCH 21/22] add info to whatsnew doc --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cd353b60d1a6e..3fb0e52b024bb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -54,6 +54,7 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for 
explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :func:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) From 87a1ee8229242c6233905d1b5dc973503329f75a Mon Sep 17 00:00:00 2001 From: Aaron Chu-Carroll Date: Mon, 9 Sep 2024 16:45:19 -0400 Subject: [PATCH 22/22] change func to meth in doc info --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3fb0e52b024bb..8362a430f37fe 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -54,8 +54,8 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) -- :func:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) +- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting
DataFrame (:issue:`47872`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)