Backport PR #39188: REGR: Different results from DataFrame.apply and str accessor (#39199)

meeseeksmachine · simonjayhawkins · web-flow · commit b12df468ccdd · 2021-01-15T21:50:58.000-05:00
Co-authored-by: Simon Hawkins &lt;simonjayhawkins@gmail.com&gt;
diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
@@ -24,6 +24,7 @@ Fixed regressions
 - Fixed regression in :func:`read_excel` with non-rawbyte file handles (:issue:`38788`)
 - Fixed regression in :meth:`Rolling.skew` and :meth:`Rolling.kurt` modifying the object inplace (:issue:`38908`)
 - Fixed regression in :meth:`read_csv` and other read functions were the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`)
+- Fixed regression in :meth:`DataFrame.apply` with ``axis=1`` using str accessor in apply function (:issue:`38979`)
 - Fixed regression in :meth:`DataFrame.replace` raising ``ValueError`` when :class:`DataFrame` has dtype ``bytes`` (:issue:`38900`)
 - Fixed regression in :meth:`DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`)
 - Fixed regression that raised ``AttributeError`` with PyArrow versions [0.16.0, 1.0.0) (:issue:`38801`)
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -109,7 +109,7 @@ def wrapper(self, *args, **kwargs):
 def _map_and_wrap(name, docstring):
     @forbid_nonstring_types(["bytes"], name=name)
     def wrapper(self):
-        result = getattr(self._array, f"_str_{name}")()
+        result = getattr(self._data.array, f"_str_{name}")()
         return self._wrap_result(result)
 
     wrapper.__doc__ = docstring
@@ -154,8 +154,7 @@ def __init__(self, data):
         self._inferred_dtype = self._validate(data)
         self._is_categorical = is_categorical_dtype(data.dtype)
         self._is_string = isinstance(data.dtype, StringDtype)
-        array = data.array
-        self._array = array
+        self._data = data
 
         self._index = self._name = None
         if isinstance(data, ABCSeries):
@@ -219,7 +218,7 @@ def _validate(data):
         return inferred_dtype
 
     def __getitem__(self, key):
-        result = self._array._str_getitem(key)
+        result = self._data.array._str_getitem(key)
         return self._wrap_result(result)
 
     def __iter__(self):
@@ -744,13 +743,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
     @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
     @forbid_nonstring_types(["bytes"])
     def split(self, pat=None, n=-1, expand=False):
-        result = self._array._str_split(pat, n, expand)
+        result = self._data.array._str_split(pat, n, expand)
         return self._wrap_result(result, returns_string=expand, expand=expand)
 
     @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
     @forbid_nonstring_types(["bytes"])
     def rsplit(self, pat=None, n=-1, expand=False):
-        result = self._array._str_rsplit(pat, n=n)
+        result = self._data.array._str_rsplit(pat, n=n)
         return self._wrap_result(result, expand=expand, returns_string=expand)
 
     _shared_docs[
@@ -846,7 +845,7 @@ def rsplit(self, pat=None, n=-1, expand=False):
     )
     @forbid_nonstring_types(["bytes"])
     def partition(self, sep=" ", expand=True):
-        result = self._array._str_partition(sep, expand)
+        result = self._data.array._str_partition(sep, expand)
         return self._wrap_result(result, expand=expand, returns_string=expand)
 
     @Appender(
@@ -860,7 +859,7 @@ def partition(self, sep=" ", expand=True):
     )
     @forbid_nonstring_types(["bytes"])
     def rpartition(self, sep=" ", expand=True):
-        result = self._array._str_rpartition(sep, expand)
+        result = self._data.array._str_rpartition(sep, expand)
         return self._wrap_result(result, expand=expand, returns_string=expand)
 
     def get(self, i):
@@ -914,7 +913,7 @@ def get(self, i):
         5    None
         dtype: object
         """
-        result = self._array._str_get(i)
+        result = self._data.array._str_get(i)
         return self._wrap_result(result)
 
     @forbid_nonstring_types(["bytes"])
@@ -980,7 +979,7 @@ def join(self, sep):
         4                    NaN
         dtype: object
         """
-        result = self._array._str_join(sep)
+        result = self._data.array._str_join(sep)
         return self._wrap_result(result)
 
     @forbid_nonstring_types(["bytes"])
@@ -1108,7 +1107,7 @@ def contains(self, pat, case=True, flags=0, na=None, regex=True):
         4    False
         dtype: bool
         """
-        result = self._array._str_contains(pat, case, flags, na, regex)
+        result = self._data.array._str_contains(pat, case, flags, na, regex)
         return self._wrap_result(result, fill_value=na, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
@@ -1140,7 +1139,7 @@ def match(self, pat, case=True, flags=0, na=None):
             re.match.
         extract : Extract matched groups.
         """
-        result = self._array._str_match(pat, case=case, flags=flags, na=na)
+        result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
         return self._wrap_result(result, fill_value=na, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
@@ -1173,7 +1172,7 @@ def fullmatch(self, pat, case=True, flags=0, na=None):
             matches the regular expression.
         extract : Extract matched groups.
         """
-        result = self._array._str_fullmatch(pat, case=case, flags=flags, na=na)
+        result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
         return self._wrap_result(result, fill_value=na, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
@@ -1309,7 +1308,7 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None):
                     )
                 warnings.warn(msg, FutureWarning, stacklevel=3)
             regex = True
-        result = self._array._str_replace(
+        result = self._data.array._str_replace(
             pat, repl, n=n, case=case, flags=flags, regex=regex
         )
         return self._wrap_result(result)
@@ -1355,7 +1354,7 @@ def repeat(self, repeats):
         2    ccc
         dtype: object
         """
-        result = self._array._str_repeat(repeats)
+        result = self._data.array._str_repeat(repeats)
         return self._wrap_result(result)
 
     @forbid_nonstring_types(["bytes"])
@@ -1423,7 +1422,7 @@ def pad(self, width, side="left", fillchar=" "):
             msg = f"width must be of integer type, not {type(width).__name__}"
             raise TypeError(msg)
 
-        result = self._array._str_pad(width, side=side, fillchar=fillchar)
+        result = self._data.array._str_pad(width, side=side, fillchar=fillchar)
         return self._wrap_result(result)
 
     _shared_docs[
@@ -1597,7 +1596,7 @@ def slice(self, start=None, stop=None, step=None):
         2    cm
         dtype: object
         """
-        result = self._array._str_slice(start, stop, step)
+        result = self._data.array._str_slice(start, stop, step)
         return self._wrap_result(result)
 
     @forbid_nonstring_types(["bytes"])
@@ -1673,7 +1672,7 @@ def slice_replace(self, start=None, stop=None, repl=None):
         4    aXde
         dtype: object
         """
-        result = self._array._str_slice_replace(start, stop, repl)
+        result = self._data.array._str_slice_replace(start, stop, repl)
         return self._wrap_result(result)
 
     def decode(self, encoding, errors="strict"):
@@ -1699,7 +1698,7 @@ def decode(self, encoding, errors="strict"):
         else:
             decoder = codecs.getdecoder(encoding)
             f = lambda x: decoder(x, errors)[0]
-        arr = self._array
+        arr = self._data.array
         # assert isinstance(arr, (StringArray,))
         result = arr._str_map(f)
         return self._wrap_result(result)
@@ -1720,7 +1719,7 @@ def encode(self, encoding, errors="strict"):
         -------
         encoded : Series/Index of objects
         """
-        result = self._array._str_encode(encoding, errors)
+        result = self._data.array._str_encode(encoding, errors)
         return self._wrap_result(result, returns_string=False)
 
     _shared_docs[
@@ -1798,7 +1797,7 @@ def encode(self, encoding, errors="strict"):
     )
     @forbid_nonstring_types(["bytes"])
     def strip(self, to_strip=None):
-        result = self._array._str_strip(to_strip)
+        result = self._data.array._str_strip(to_strip)
         return self._wrap_result(result)
 
     @Appender(
@@ -1807,7 +1806,7 @@ def strip(self, to_strip=None):
     )
     @forbid_nonstring_types(["bytes"])
     def lstrip(self, to_strip=None):
-        result = self._array._str_lstrip(to_strip)
+        result = self._data.array._str_lstrip(to_strip)
         return self._wrap_result(result)
 
     @Appender(
@@ -1816,7 +1815,7 @@ def lstrip(self, to_strip=None):
     )
     @forbid_nonstring_types(["bytes"])
     def rstrip(self, to_strip=None):
-        result = self._array._str_rstrip(to_strip)
+        result = self._data.array._str_rstrip(to_strip)
         return self._wrap_result(result)
 
     @forbid_nonstring_types(["bytes"])
@@ -1875,7 +1874,7 @@ def wrap(self, width, **kwargs):
         1    another line\nto be\nwrapped
         dtype: object
         """
-        result = self._array._str_wrap(width, **kwargs)
+        result = self._data.array._str_wrap(width, **kwargs)
         return self._wrap_result(result)
 
     @forbid_nonstring_types(["bytes"])
@@ -1917,7 +1916,7 @@ def get_dummies(self, sep="|"):
         """
         # we need to cast to Series of strings as only that has all
         # methods available for making the dummies...
-        result, name = self._array._str_get_dummies(sep)
+        result, name = self._data.array._str_get_dummies(sep)
         return self._wrap_result(
             result,
             name=name,
@@ -1944,7 +1943,7 @@ def translate(self, table):
         -------
         Series or Index
         """
-        result = self._array._str_translate(table)
+        result = self._data.array._str_translate(table)
         return self._wrap_result(result)
 
     @forbid_nonstring_types(["bytes"])
@@ -2012,7 +2011,7 @@ def count(self, pat, flags=0):
         >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
         Int64Index([0, 0, 2, 1], dtype='int64')
         """
-        result = self._array._str_count(pat, flags)
+        result = self._data.array._str_count(pat, flags)
         return self._wrap_result(result, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
@@ -2069,7 +2068,7 @@ def startswith(self, pat, na=None):
         3    False
         dtype: bool
         """
-        result = self._array._str_startswith(pat, na=na)
+        result = self._data.array._str_startswith(pat, na=na)
         return self._wrap_result(result, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
@@ -2126,7 +2125,7 @@ def endswith(self, pat, na=None):
         3    False
         dtype: bool
         """
-        result = self._array._str_endswith(pat, na=na)
+        result = self._data.array._str_endswith(pat, na=na)
         return self._wrap_result(result, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
@@ -2219,7 +2218,7 @@ def findall(self, pat, flags=0):
         2    [b, b]
         dtype: object
         """
-        result = self._array._str_findall(pat, flags)
+        result = self._data.array._str_findall(pat, flags)
         return self._wrap_result(result, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
@@ -2426,7 +2425,7 @@ def find(self, sub, start=0, end=None):
             msg = f"expected a string object, not {type(sub).__name__}"
             raise TypeError(msg)
 
-        result = self._array._str_find(sub, start, end)
+        result = self._data.array._str_find(sub, start, end)
         return self._wrap_result(result, returns_string=False)
 
     @Appender(
@@ -2443,7 +2442,7 @@ def rfind(self, sub, start=0, end=None):
             msg = f"expected a string object, not {type(sub).__name__}"
             raise TypeError(msg)
 
-        result = self._array._str_rfind(sub, start=start, end=end)
+        result = self._data.array._str_rfind(sub, start=start, end=end)
         return self._wrap_result(result, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
@@ -2463,7 +2462,7 @@ def normalize(self, form):
         -------
         normalized : Series/Index of objects
         """
-        result = self._array._str_normalize(form)
+        result = self._data.array._str_normalize(form)
         return self._wrap_result(result)
 
     _shared_docs[
@@ -2510,7 +2509,7 @@ def index(self, sub, start=0, end=None):
             msg = f"expected a string object, not {type(sub).__name__}"
             raise TypeError(msg)
 
-        result = self._array._str_index(sub, start=start, end=end)
+        result = self._data.array._str_index(sub, start=start, end=end)
         return self._wrap_result(result, returns_string=False)
 
     @Appender(
@@ -2528,7 +2527,7 @@ def rindex(self, sub, start=0, end=None):
             msg = f"expected a string object, not {type(sub).__name__}"
             raise TypeError(msg)
 
-        result = self._array._str_rindex(sub, start=start, end=end)
+        result = self._data.array._str_rindex(sub, start=start, end=end)
         return self._wrap_result(result, returns_string=False)
 
     def len(self):
@@ -2577,7 +2576,7 @@ def len(self):
         5    3.0
         dtype: float64
         """
-        result = self._array._str_len()
+        result = self._data.array._str_len()
         return self._wrap_result(result, returns_string=False)
 
     _shared_docs[
@@ -2677,37 +2676,37 @@ def len(self):
     @Appender(_shared_docs["casemethods"] % _doc_args["lower"])
     @forbid_nonstring_types(["bytes"])
     def lower(self):
-        result = self._array._str_lower()
+        result = self._data.array._str_lower()
         return self._wrap_result(result)
 
     @Appender(_shared_docs["casemethods"] % _doc_args["upper"])
     @forbid_nonstring_types(["bytes"])
     def upper(self):
-        result = self._array._str_upper()
+        result = self._data.array._str_upper()
         return self._wrap_result(result)
 
     @Appender(_shared_docs["casemethods"] % _doc_args["title"])
     @forbid_nonstring_types(["bytes"])
     def title(self):
-        result = self._array._str_title()
+        result = self._data.array._str_title()
         return self._wrap_result(result)
 
     @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
     @forbid_nonstring_types(["bytes"])
     def capitalize(self):
-        result = self._array._str_capitalize()
+        result = self._data.array._str_capitalize()
         return self._wrap_result(result)
 
     @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
     @forbid_nonstring_types(["bytes"])
     def swapcase(self):
-        result = self._array._str_swapcase()
+        result = self._data.array._str_swapcase()
         return self._wrap_result(result)
 
     @Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
     @forbid_nonstring_types(["bytes"])
     def casefold(self):
-        result = self._array._str_casefold()
+        result = self._data.array._str_casefold()
         return self._wrap_result(result)
 
     _shared_docs[
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -3670,3 +3670,11 @@ def test_str_get_stringarray_multiple_nans():
     result = s.str.get(2)
     expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"]))
     tm.assert_series_equal(result, expected)
+
+
+def test_str_accessor_in_apply_func():
+    # https://github.com/pandas-dev/pandas/issues/38979
+    df = DataFrame(zip("abc", "def"))
+    expected = Series(["A/D", "B/E", "C/F"])
+    result = df.apply(lambda f: "/".join(f.str.upper()), axis=1)
+    tm.assert_series_equal(result, expected)