-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF: more explicit dtypes in strings.accessor #41727
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
d9629d6
4204274
20e1388
20919a1
469a517
f18ad50
2346970
d6542dd
8ed20e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,10 @@ | |
import numpy as np | ||
|
||
import pandas._libs.lib as lib | ||
from pandas._typing import FrameOrSeriesUnion | ||
from pandas._typing import ( | ||
DtypeObj, | ||
FrameOrSeriesUnion, | ||
) | ||
from pandas.util._decorators import Appender | ||
|
||
from pandas.core.dtypes.common import ( | ||
|
@@ -122,7 +125,7 @@ def _map_and_wrap(name, docstring): | |
@forbid_nonstring_types(["bytes"], name=name) | ||
def wrapper(self): | ||
result = getattr(self._data.array, f"_str_{name}")() | ||
return self._wrap_result(result) | ||
return self._wrap_result(result, returns_bool=True) | ||
|
||
wrapper.__doc__ = docstring | ||
return wrapper | ||
|
@@ -209,8 +212,12 @@ def _validate(data): | |
# see _libs/lib.pyx for list of inferred types | ||
allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] | ||
|
||
values = getattr(data, "values", data) # Series / Index | ||
values = getattr(values, "categories", values) # categorical / normal | ||
# TODO: avoid kludge for tests.extension.test_numpy | ||
from pandas.core.internals.managers import _extract_array | ||
|
||
data = _extract_array(data) | ||
|
||
values = getattr(data, "categories", data) # categorical / normal | ||
|
||
inferred_dtype = lib.infer_dtype(values, skipna=True) | ||
|
||
|
@@ -242,6 +249,7 @@ def _wrap_result( | |
expand: bool | None = None, | ||
fill_value=np.nan, | ||
returns_string=True, | ||
returns_bool: bool = False, | ||
): | ||
from pandas import ( | ||
Index, | ||
|
@@ -319,19 +327,26 @@ def cons_row(x): | |
else: | ||
index = self._orig.index | ||
# This is a mess. | ||
dtype: str | None | ||
if self._is_string and returns_string: | ||
dtype = self._orig.dtype | ||
dtype: DtypeObj | str | None | ||
if self._is_string: | ||
if returns_bool: | ||
dtype = "boolean" | ||
elif returns_string: | ||
dtype = self._orig.dtype | ||
else: | ||
dtype = result.dtype | ||
elif returns_bool: | ||
dtype = result.dtype # i.e. bool | ||
else: | ||
dtype = None | ||
dtype = getattr(result, "dtype", None) | ||
|
||
if expand: | ||
cons = self._orig._constructor_expanddim | ||
result = cons(result, columns=name, index=index, dtype=dtype) | ||
else: | ||
# Must be a Series | ||
cons = self._orig._constructor | ||
result = cons(result, name=name, index=index) | ||
result = cons(result, name=name, index=index, dtype=dtype) | ||
result = result.__finalize__(self._orig, method="str") | ||
if name is not None and result.ndim == 1: | ||
# __finalize__ might copy over the original name, but we may | ||
|
@@ -369,7 +384,7 @@ def _get_series_list(self, others): | |
if isinstance(others, ABCSeries): | ||
return [others] | ||
elif isinstance(others, ABCIndex): | ||
return [Series(others._values, index=idx)] | ||
return [Series(others._values, index=idx, dtype=others.dtype)] | ||
elif isinstance(others, ABCDataFrame): | ||
return [others[x] for x in others] | ||
elif isinstance(others, np.ndarray) and others.ndim == 2: | ||
|
@@ -547,7 +562,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): | |
sep = "" | ||
|
||
if isinstance(self._orig, ABCIndex): | ||
data = Series(self._orig, index=self._orig) | ||
data = Series(self._orig, index=self._orig, dtype=self._orig.dtype) | ||
else: # Series | ||
data = self._orig | ||
|
||
|
@@ -2145,7 +2160,7 @@ def startswith(self, pat, na=None): | |
dtype: bool | ||
""" | ||
result = self._data.array._str_startswith(pat, na=na) | ||
return self._wrap_result(result, returns_string=False) | ||
return self._wrap_result(result, returns_string=False, returns_bool=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we just change _wrap_result so we don't have 2 keywords? maybe rename to returns_dtype and pass a string? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thats tough bc the end-result dtype depends on self._is_string There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking at this again it really doesnt look worth the trouble to try to combine these args There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think either keywords should really be necessary here, the issue is that the arrays passed when expand is True are object arrays of lists/tuples which method returns bool, where the dtype of the result is not bool? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
makes sense, ill give this a try There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. e.g There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the wrapped result would be object in this case? This is for a object dtype array? for StringArray/ArrowString array the return type should always be boolean. object dtype accessors do cast their results, except extract, so if the result is all bools and no nans, the result dtype should be np.bool There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
the result to be wrapped dtype should be np.bool There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these conversions are handled in _str_map in ObjectStringArrayMixin |
||
|
||
@forbid_nonstring_types(["bytes"]) | ||
def endswith(self, pat, na=None): | ||
|
@@ -2202,7 +2217,7 @@ def endswith(self, pat, na=None): | |
dtype: bool | ||
""" | ||
result = self._data.array._str_endswith(pat, na=na) | ||
return self._wrap_result(result, returns_string=False) | ||
return self._wrap_result(result, returns_string=False, returns_bool=True) | ||
|
||
@forbid_nonstring_types(["bytes"]) | ||
def findall(self, pat, flags=0): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
umm cn you avoid this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can revert to use the
getattr
pattern, either way is an anti-pattern