add Series.str.remove(pre|suf)fix (#43328)

janosh · web-flow · commit 0a9f9eed3e3e · 2021-09-06T15:45:57.000-04:00
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
@@ -427,6 +427,8 @@ strings and apply several methods to it. These can be accessed like
    Series.str.normalize
    Series.str.pad
    Series.str.partition
+   Series.str.removeprefix
+   Series.str.removesuffix
    Series.str.repeat
    Series.str.replace
    Series.str.rfind
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -335,6 +335,19 @@ regular expression object will raise a ``ValueError``.
     ---------------------------------------------------------------------------
     ValueError: case and flags cannot be set when pat is a compiled regex
 
+``removeprefix`` and ``removesuffix`` have the same effect as ``str.removeprefix`` and ``str.removesuffix`` added in `Python 3.9
+<https://docs.python.org/3/library/stdtypes.html#str.removeprefix>`__:
+
+.. versionadded:: 1.4.0
+
+.. ipython:: python
+
+   s = pd.Series(["str_foo", "str_bar", "no_prefix"])
+   s.str.removeprefix("str_")
+
+   s = pd.Series(["foo_str", "bar_str", "no_suffix"])
+   s.str.removesuffix("_str")
+
 .. _text.concatenate:
 
 Concatenation
@@ -742,6 +755,8 @@ Method summary
     :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables
     :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex
     :meth:`~Series.str.replace`;Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence
+    :meth:`~Series.str.removeprefix`;Remove prefix from string, i.e. only remove if string starts with prefix.
+    :meth:`~Series.str.removesuffix`;Remove suffix from string, i.e. only remove if string ends with suffix.
     :meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``)
     :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings"
     :meth:`~Series.str.center`;Equivalent to ``str.center``
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -104,6 +104,7 @@ Other enhancements
 - :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
 - :meth:`read_table` now supports the argument ``storage_options`` (:issue:`39167`)
 - Methods that relied on hashmap based algos such as :meth:`DataFrameGroupBy.value_counts`, :meth:`DataFrameGroupBy.count` and :func:`factorize` ignored imaginary component for complex numbers (:issue:`17927`)
+- Add :meth:`Series.str.removeprefix` and :meth:`Series.str.removesuffix` introduced in Python 3.9 to remove pre-/suffixes from string-type :class:`Series` (:issue:`36944`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -1907,6 +1907,69 @@ def rstrip(self, to_strip=None):
         result = self._data.array._str_rstrip(to_strip)
         return self._wrap_result(result)
 
+    _shared_docs[
+        "str_removefix"
+    ] = r"""
+    Remove a %(side)s from an object series. If the %(side)s is not present,
+    the original string will be returned.
+
+    Parameters
+    ----------
+    %(side)s : str
+        %(side)s to remove.
+
+    Returns
+    -------
+    Series/Index: object
+        The Series or Index with given %(side)s removed.
+
+    See Also
+    --------
+    Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.
+
+    Examples
+    --------
+    >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
+    >>> s
+    0    str_foo
+    1    str_bar
+    2    no_prefix
+    dtype: object
+    >>> s.str.removeprefix("str_")
+    0    foo
+    1    bar
+    2    no_prefix
+    dtype: object
+
+    >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
+    >>> s
+    0    foo_str
+    1    bar_str
+    2    no_suffix
+    dtype: object
+    >>> s.str.removesuffix("_str")
+    0    foo
+    1    bar
+    2    no_suffix
+    dtype: object
+    """
+
+    @Appender(
+        _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"}
+    )
+    @forbid_nonstring_types(["bytes"])
+    def removeprefix(self, prefix):
+        # the backing array does the per-element work; wrap what it returns
+        return self._wrap_result(self._data.array._str_removeprefix(prefix))
+
+    @Appender(
+        _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"}
+    )
+    @forbid_nonstring_types(["bytes"])
+    def removesuffix(self, suffix):
+        # the backing array does the per-element work; wrap what it returns
+        return self._wrap_result(self._data.array._str_removesuffix(suffix))
+
     @forbid_nonstring_types(["bytes"])
     def wrap(self, width, **kwargs):
         r"""
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
@@ -3,11 +3,15 @@
 import abc
 from collections.abc import Callable  # noqa: PDF001
 import re
+from typing import TYPE_CHECKING
 
 import numpy as np
 
 from pandas._typing import Scalar
 
+if TYPE_CHECKING:
+    from pandas import Series
+
 
 class BaseStringArrayMethods(abc.ABC):
     """
@@ -223,6 +227,14 @@ def _str_lstrip(self, to_strip=None):
     def _str_rstrip(self, to_strip=None):
         pass
 
+    @abc.abstractmethod
+    def _str_removeprefix(self, prefix: str) -> Series:
+        pass  # abstract: no behavior here, implementing classes define it
+
+    @abc.abstractmethod
+    def _str_removesuffix(self, suffix: str) -> Series:
+        pass  # abstract: no behavior here, implementing classes define it
+
     @abc.abstractmethod
     def _str_split(self, pat=None, n=-1, expand=False):
         pass
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
@@ -3,6 +3,7 @@
 from collections.abc import Callable  # noqa: PDF001
 import re
 import textwrap
+from typing import TYPE_CHECKING
 import unicodedata
 
 import numpy as np
@@ -20,6 +21,9 @@
 
 from pandas.core.strings.base import BaseStringArrayMethods
 
+if TYPE_CHECKING:
+    from pandas import Series
+
 
 class ObjectStringArrayMixin(BaseStringArrayMethods):
     """
@@ -36,7 +40,7 @@ def _str_map(
         self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
     ):
         """
-        Map a callable over valid element of the array.
+        Map a callable over valid elements of the array.
 
         Parameters
         ----------
@@ -414,6 +418,30 @@ def _str_lstrip(self, to_strip=None):
     def _str_rstrip(self, to_strip=None):
         return self._str_map(lambda x: x.rstrip(to_strip))
 
+    def _str_removeprefix(self, prefix: str) -> Series:
+        # Hand-rolled counterpart of ``str.removeprefix`` (builtin since
+        # Python 3.9). Whether to call the native method where available is
+        # still an open question, see https://git.io/JE9QK
+
+        def strip_leading(value: str) -> str:
+            # skip past the prefix only when the element starts with it
+            cut = len(prefix) if value.startswith(prefix) else 0
+            return value[cut:]
+
+        return self._str_map(strip_leading)
+
+    def _str_removesuffix(self, suffix: str) -> Series:
+        # Mirror ``str.removesuffix`` (Python 3.9+). Slice by an explicit end
+        # index: with ``suffix == ""`` the negative form ``text[:-0]`` equals
+        # ``text[:0]`` and would blank every element instead of leaving it alone.
+
+        def removesuffix(text: str) -> str:
+            if text.endswith(suffix):
+                return text[: len(text) - len(suffix)]
+            return text
+
+        return self._str_map(removesuffix)
+
     def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
         regex = re.compile(pat, flags=flags)
         na_value = self._str_na_value
diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py
@@ -46,6 +46,8 @@
     ("startswith", ("a",), {}),
     ("startswith", ("a",), {"na": True}),
     ("startswith", ("a",), {"na": False}),
+    ("removeprefix", ("a",), {}),
+    ("removesuffix", ("a",), {}),
     # translating unicode points of "a" to "d"
     ("translate", ({97: 100},), {}),
     ("wrap", (2,), {}),
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
@@ -535,6 +535,26 @@ def test_strip_lstrip_rstrip_args(any_string_dtype):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "prefix, expected", [("a", ["b", " b c", "bc"]), ("ab", ["", "a b c", "bc"])]
+)
+def test_removeprefix(any_string_dtype, prefix, expected):
+    # elements lacking the prefix (e.g. "bc") must come back unchanged
+    values = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
+    want = Series(expected, dtype=any_string_dtype)
+    tm.assert_series_equal(values.str.removeprefix(prefix), want)
+
+
+@pytest.mark.parametrize(
+    "suffix, expected", [("c", ["ab", "a b ", "b"]), ("bc", ["ab", "a b c", ""])]
+)
+def test_removesuffix(any_string_dtype, suffix, expected):
+    # elements lacking the suffix (e.g. "ab") must come back unchanged
+    values = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
+    want = Series(expected, dtype=any_string_dtype)
+    tm.assert_series_equal(values.str.removesuffix(suffix), want)
+
+
 def test_string_slice_get_syntax(any_string_dtype):
     ser = Series(
         ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"],