Skip to content

REGR: Different results from DataFrame.apply and str accessor #39188

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Fixed regressions
- Fixed regression in :func:`read_excel` with non-rawbyte file handles (:issue:`38788`)
- Fixed regression in :meth:`Rolling.skew` and :meth:`Rolling.kurt` modifying the object inplace (:issue:`38908`)
- Fixed regression in :meth:`read_csv` and other read functions where the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`)
- Fixed regression in :meth:`DataFrame.apply` with ``axis=1`` when the applied function uses the ``.str`` accessor (:issue:`38979`)
- Fixed regression in :meth:`DataFrame.replace` raising ``ValueError`` when :class:`DataFrame` has dtype ``bytes`` (:issue:`38900`)
- Fixed regression in :meth:`DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`)
- Fixed regression that raised ``AttributeError`` with PyArrow versions [0.16.0, 1.0.0) (:issue:`38801`)
Expand Down
83 changes: 41 additions & 42 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def wrapper(self, *args, **kwargs):
def _map_and_wrap(name, docstring):
@forbid_nonstring_types(["bytes"], name=name)
def wrapper(self):
result = getattr(self._array, f"_str_{name}")()
result = getattr(self._data.array, f"_str_{name}")()
return self._wrap_result(result)

wrapper.__doc__ = docstring
Expand Down Expand Up @@ -149,8 +149,7 @@ def __init__(self, data):
self._inferred_dtype = self._validate(data)
self._is_categorical = is_categorical_dtype(data.dtype)
self._is_string = isinstance(data.dtype, StringDtype)
array = data.array
self._array = array
self._data = data

self._index = self._name = None
if isinstance(data, ABCSeries):
Expand Down Expand Up @@ -214,7 +213,7 @@ def _validate(data):
return inferred_dtype

def __getitem__(self, key):
result = self._array._str_getitem(key)
result = self._data.array._str_getitem(key)
return self._wrap_result(result)

def __iter__(self):
Expand Down Expand Up @@ -739,13 +738,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
@Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
@forbid_nonstring_types(["bytes"])
def split(self, pat=None, n=-1, expand=False):
result = self._array._str_split(pat, n, expand)
result = self._data.array._str_split(pat, n, expand)
return self._wrap_result(result, returns_string=expand, expand=expand)

@Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
@forbid_nonstring_types(["bytes"])
def rsplit(self, pat=None, n=-1, expand=False):
result = self._array._str_rsplit(pat, n=n)
result = self._data.array._str_rsplit(pat, n=n)
return self._wrap_result(result, expand=expand, returns_string=expand)

_shared_docs[
Expand Down Expand Up @@ -841,7 +840,7 @@ def rsplit(self, pat=None, n=-1, expand=False):
)
@forbid_nonstring_types(["bytes"])
def partition(self, sep=" ", expand=True):
result = self._array._str_partition(sep, expand)
result = self._data.array._str_partition(sep, expand)
return self._wrap_result(result, expand=expand, returns_string=expand)

@Appender(
Expand All @@ -855,7 +854,7 @@ def partition(self, sep=" ", expand=True):
)
@forbid_nonstring_types(["bytes"])
def rpartition(self, sep=" ", expand=True):
result = self._array._str_rpartition(sep, expand)
result = self._data.array._str_rpartition(sep, expand)
return self._wrap_result(result, expand=expand, returns_string=expand)

def get(self, i):
Expand Down Expand Up @@ -909,7 +908,7 @@ def get(self, i):
5 None
dtype: object
"""
result = self._array._str_get(i)
result = self._data.array._str_get(i)
return self._wrap_result(result)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -975,7 +974,7 @@ def join(self, sep):
4 NaN
dtype: object
"""
result = self._array._str_join(sep)
result = self._data.array._str_join(sep)
return self._wrap_result(result)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -1103,7 +1102,7 @@ def contains(self, pat, case=True, flags=0, na=None, regex=True):
4 False
dtype: bool
"""
result = self._array._str_contains(pat, case, flags, na, regex)
result = self._data.array._str_contains(pat, case, flags, na, regex)
return self._wrap_result(result, fill_value=na, returns_string=False)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -1135,7 +1134,7 @@ def match(self, pat, case=True, flags=0, na=None):
re.match.
extract : Extract matched groups.
"""
result = self._array._str_match(pat, case=case, flags=flags, na=na)
result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
return self._wrap_result(result, fill_value=na, returns_string=False)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -1168,7 +1167,7 @@ def fullmatch(self, pat, case=True, flags=0, na=None):
matches the regular expression.
extract : Extract matched groups.
"""
result = self._array._str_fullmatch(pat, case=case, flags=flags, na=na)
result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
return self._wrap_result(result, fill_value=na, returns_string=False)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -1304,7 +1303,7 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None):
)
warnings.warn(msg, FutureWarning, stacklevel=3)
regex = True
result = self._array._str_replace(
result = self._data.array._str_replace(
pat, repl, n=n, case=case, flags=flags, regex=regex
)
return self._wrap_result(result)
Expand Down Expand Up @@ -1350,7 +1349,7 @@ def repeat(self, repeats):
2 ccc
dtype: object
"""
result = self._array._str_repeat(repeats)
result = self._data.array._str_repeat(repeats)
return self._wrap_result(result)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -1418,7 +1417,7 @@ def pad(self, width, side="left", fillchar=" "):
msg = f"width must be of integer type, not {type(width).__name__}"
raise TypeError(msg)

result = self._array._str_pad(width, side=side, fillchar=fillchar)
result = self._data.array._str_pad(width, side=side, fillchar=fillchar)
return self._wrap_result(result)

_shared_docs[
Expand Down Expand Up @@ -1592,7 +1591,7 @@ def slice(self, start=None, stop=None, step=None):
2 cm
dtype: object
"""
result = self._array._str_slice(start, stop, step)
result = self._data.array._str_slice(start, stop, step)
return self._wrap_result(result)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -1668,7 +1667,7 @@ def slice_replace(self, start=None, stop=None, repl=None):
4 aXde
dtype: object
"""
result = self._array._str_slice_replace(start, stop, repl)
result = self._data.array._str_slice_replace(start, stop, repl)
return self._wrap_result(result)

def decode(self, encoding, errors="strict"):
Expand All @@ -1694,7 +1693,7 @@ def decode(self, encoding, errors="strict"):
else:
decoder = codecs.getdecoder(encoding)
f = lambda x: decoder(x, errors)[0]
arr = self._array
arr = self._data.array
# assert isinstance(arr, (StringArray,))
result = arr._str_map(f)
return self._wrap_result(result)
Expand All @@ -1715,7 +1714,7 @@ def encode(self, encoding, errors="strict"):
-------
encoded : Series/Index of objects
"""
result = self._array._str_encode(encoding, errors)
result = self._data.array._str_encode(encoding, errors)
return self._wrap_result(result, returns_string=False)

_shared_docs[
Expand Down Expand Up @@ -1793,7 +1792,7 @@ def encode(self, encoding, errors="strict"):
)
@forbid_nonstring_types(["bytes"])
def strip(self, to_strip=None):
result = self._array._str_strip(to_strip)
result = self._data.array._str_strip(to_strip)
return self._wrap_result(result)

@Appender(
Expand All @@ -1802,7 +1801,7 @@ def strip(self, to_strip=None):
)
@forbid_nonstring_types(["bytes"])
def lstrip(self, to_strip=None):
result = self._array._str_lstrip(to_strip)
result = self._data.array._str_lstrip(to_strip)
return self._wrap_result(result)

@Appender(
Expand All @@ -1811,7 +1810,7 @@ def lstrip(self, to_strip=None):
)
@forbid_nonstring_types(["bytes"])
def rstrip(self, to_strip=None):
result = self._array._str_rstrip(to_strip)
result = self._data.array._str_rstrip(to_strip)
return self._wrap_result(result)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -1870,7 +1869,7 @@ def wrap(self, width, **kwargs):
1 another line\nto be\nwrapped
dtype: object
"""
result = self._array._str_wrap(width, **kwargs)
result = self._data.array._str_wrap(width, **kwargs)
return self._wrap_result(result)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -1912,7 +1911,7 @@ def get_dummies(self, sep="|"):
"""
# we need to cast to Series of strings as only that has all
# methods available for making the dummies...
result, name = self._array._str_get_dummies(sep)
result, name = self._data.array._str_get_dummies(sep)
return self._wrap_result(
result,
name=name,
Expand All @@ -1939,7 +1938,7 @@ def translate(self, table):
-------
Series or Index
"""
result = self._array._str_translate(table)
result = self._data.array._str_translate(table)
return self._wrap_result(result)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -2007,7 +2006,7 @@ def count(self, pat, flags=0):
>>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
Int64Index([0, 0, 2, 1], dtype='int64')
"""
result = self._array._str_count(pat, flags)
result = self._data.array._str_count(pat, flags)
return self._wrap_result(result, returns_string=False)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -2064,7 +2063,7 @@ def startswith(self, pat, na=None):
3 False
dtype: bool
"""
result = self._array._str_startswith(pat, na=na)
result = self._data.array._str_startswith(pat, na=na)
return self._wrap_result(result, returns_string=False)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -2121,7 +2120,7 @@ def endswith(self, pat, na=None):
3 False
dtype: bool
"""
result = self._array._str_endswith(pat, na=na)
result = self._data.array._str_endswith(pat, na=na)
return self._wrap_result(result, returns_string=False)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -2214,7 +2213,7 @@ def findall(self, pat, flags=0):
2 [b, b]
dtype: object
"""
result = self._array._str_findall(pat, flags)
result = self._data.array._str_findall(pat, flags)
return self._wrap_result(result, returns_string=False)

@forbid_nonstring_types(["bytes"])
Expand Down Expand Up @@ -2421,7 +2420,7 @@ def find(self, sub, start=0, end=None):
msg = f"expected a string object, not {type(sub).__name__}"
raise TypeError(msg)

result = self._array._str_find(sub, start, end)
result = self._data.array._str_find(sub, start, end)
return self._wrap_result(result, returns_string=False)

@Appender(
Expand All @@ -2438,7 +2437,7 @@ def rfind(self, sub, start=0, end=None):
msg = f"expected a string object, not {type(sub).__name__}"
raise TypeError(msg)

result = self._array._str_rfind(sub, start=start, end=end)
result = self._data.array._str_rfind(sub, start=start, end=end)
return self._wrap_result(result, returns_string=False)

@forbid_nonstring_types(["bytes"])
Expand All @@ -2458,7 +2457,7 @@ def normalize(self, form):
-------
normalized : Series/Index of objects
"""
result = self._array._str_normalize(form)
result = self._data.array._str_normalize(form)
return self._wrap_result(result)

_shared_docs[
Expand Down Expand Up @@ -2505,7 +2504,7 @@ def index(self, sub, start=0, end=None):
msg = f"expected a string object, not {type(sub).__name__}"
raise TypeError(msg)

result = self._array._str_index(sub, start=start, end=end)
result = self._data.array._str_index(sub, start=start, end=end)
return self._wrap_result(result, returns_string=False)

@Appender(
Expand All @@ -2523,7 +2522,7 @@ def rindex(self, sub, start=0, end=None):
msg = f"expected a string object, not {type(sub).__name__}"
raise TypeError(msg)

result = self._array._str_rindex(sub, start=start, end=end)
result = self._data.array._str_rindex(sub, start=start, end=end)
return self._wrap_result(result, returns_string=False)

def len(self):
Expand Down Expand Up @@ -2572,7 +2571,7 @@ def len(self):
5 3.0
dtype: float64
"""
result = self._array._str_len()
result = self._data.array._str_len()
return self._wrap_result(result, returns_string=False)

_shared_docs[
Expand Down Expand Up @@ -2672,37 +2671,37 @@ def len(self):
@Appender(_shared_docs["casemethods"] % _doc_args["lower"])
@forbid_nonstring_types(["bytes"])
def lower(self):
result = self._array._str_lower()
result = self._data.array._str_lower()
return self._wrap_result(result)

@Appender(_shared_docs["casemethods"] % _doc_args["upper"])
@forbid_nonstring_types(["bytes"])
def upper(self):
result = self._array._str_upper()
result = self._data.array._str_upper()
return self._wrap_result(result)

@Appender(_shared_docs["casemethods"] % _doc_args["title"])
@forbid_nonstring_types(["bytes"])
def title(self):
result = self._array._str_title()
result = self._data.array._str_title()
return self._wrap_result(result)

@Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
@forbid_nonstring_types(["bytes"])
def capitalize(self):
result = self._array._str_capitalize()
result = self._data.array._str_capitalize()
return self._wrap_result(result)

@Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
@forbid_nonstring_types(["bytes"])
def swapcase(self):
result = self._array._str_swapcase()
result = self._data.array._str_swapcase()
return self._wrap_result(result)

@Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
@forbid_nonstring_types(["bytes"])
def casefold(self):
result = self._array._str_casefold()
result = self._data.array._str_casefold()
return self._wrap_result(result)

_shared_docs[
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3670,3 +3670,11 @@ def test_str_get_stringarray_multiple_nans():
result = s.str.get(2)
expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"]))
tm.assert_series_equal(result, expected)


def test_str_accessor_in_apply_func():
# https://github.com/pandas-dev/pandas/issues/38979
df = DataFrame(zip("abc", "def"))
expected = Series(["A/D", "B/E", "C/F"])
result = df.apply(lambda f: "/".join(f.str.upper()), axis=1)
tm.assert_series_equal(result, expected)