-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add dtype argument to StringMethods get_dummies() #59577
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
e6f9527
dafb61d
bb79ef2
24be84f
09b2fad
50ed90c
9e95485
9a47768
0c94bff
9702bf7
8793516
bad1038
163fe09
3d75fdc
d68bece
c2aa7d5
0fd2401
920c865
800f787
d8149e6
6cbc3e8
532e139
cd5c2ab
822b3f4
ba05a8d
37dddb8
6fbe183
87a1ee8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ | |
from pandas.core.dtypes.common import ( | ||
ensure_object, | ||
is_bool_dtype, | ||
is_extension_array_dtype, | ||
is_integer, | ||
is_list_like, | ||
is_object_dtype, | ||
|
@@ -54,6 +55,8 @@ | |
Iterator, | ||
) | ||
|
||
from pandas._typing import NpDtype | ||
|
||
from pandas import ( | ||
DataFrame, | ||
Index, | ||
|
@@ -2431,7 +2434,11 @@ def wrap( | |
return self._wrap_result(result) | ||
|
||
@forbid_nonstring_types(["bytes"]) | ||
def get_dummies(self, sep: str = "|"): | ||
def get_dummies( | ||
self, | ||
sep: str = "|", | ||
dtype: NpDtype | None = None, | ||
): | ||
""" | ||
Return DataFrame of dummy/indicator variables for Series. | ||
|
||
|
@@ -2442,6 +2449,8 @@ def get_dummies(self, sep: str = "|"): | |
---------- | ||
sep : str, default "|" | ||
String to split on. | ||
dtype : dtype, default np.int64 | ||
Data type for new columns. Only a single dtype is allowed. | ||
|
||
Returns | ||
------- | ||
|
@@ -2466,10 +2475,30 @@ def get_dummies(self, sep: str = "|"): | |
0 1 1 0 | ||
1 0 0 0 | ||
2 1 0 1 | ||
|
||
>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool) | ||
a b c | ||
0 True True False | ||
1 False False False | ||
2 True False True | ||
""" | ||
from pandas.core.frame import DataFrame | ||
|
||
# we need to cast to Series of strings as only that has all | ||
# methods available for making the dummies... | ||
result, name = self._data.array._str_get_dummies(sep) | ||
result, name = self._data.array._str_get_dummies(sep, dtype) | ||
if is_extension_array_dtype(dtype): | ||
return self._wrap_result( | ||
DataFrame(result, columns=name, dtype=dtype), | ||
name=name, | ||
returns_string=False, | ||
) | ||
if isinstance(dtype, ArrowDtype): | ||
return self._wrap_result( | ||
DataFrame(result, columns=name, dtype=dtype), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you can use [inline code lost in extraction]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Making this change causes failures because `numpy.ndarray` does not accept non-NumPy dtypes. It doesn't seem like `_wrap_result` handles this case. |
||
name=name, | ||
returns_string=False, | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you consolidate these two using [inline code lost in extraction]? |
||
return self._wrap_result( | ||
result, | ||
name=name, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is on the verge of being (and arguably is) a nitpick, but I think it'd be better to call it `dummies_dtype` throughout, since it is the dtype of the `dummies` array.