diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 3ff3b2bb53fda..a60dab549e66d 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -427,6 +427,8 @@ strings and apply several methods to it. These can be accessed like Series.str.normalize Series.str.pad Series.str.partition + Series.str.removeprefix + Series.str.removesuffix Series.str.repeat Series.str.replace Series.str.rfind diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index db9485f3f2348..d350351075cb6 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -335,6 +335,19 @@ regular expression object will raise a ``ValueError``. --------------------------------------------------------------------------- ValueError: case and flags cannot be set when pat is a compiled regex +``removeprefix`` and ``removesuffix`` have the same effect as ``str.removeprefix`` and ``str.removesuffix`` added in Python 3.9 +`<https://docs.python.org/3/library/stdtypes.html#str.removeprefix>`__: + +.. versionadded:: 1.4.0 + +.. ipython:: python + + s = pd.Series(["str_foo", "str_bar", "no_prefix"]) + s.str.removeprefix("str_") + + s = pd.Series(["foo_str", "bar_str", "no_suffix"]) + s.str.removesuffix("_str") + .. _text.concatenate: Concatenation @@ -742,6 +755,8 @@ Method summary :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex :meth:`~Series.str.replace`;Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence + :meth:`~Series.str.removeprefix`;Remove prefix from string, i.e. only remove if string starts with prefix. + :meth:`~Series.str.removesuffix`;Remove suffix from string, i.e. only remove if string ends with suffix. 
:meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings" :meth:`~Series.str.center`;Equivalent to ``str.center`` diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 985cd2bb553b7..8807e831913c7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -104,6 +104,7 @@ Other enhancements - :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`) - :meth:`read_table` now supports the argument ``storage_options`` (:issue:`39167`) - Methods that relied on hashmap based algos such as :meth:`DataFrameGroupBy.value_counts`, :meth:`DataFrameGroupBy.count` and :func:`factorize` ignored imaginary component for complex numbers (:issue:`17927`) +- Add :meth:`Series.str.removeprefix` and :meth:`Series.str.removesuffix` introduced in Python 3.9 to remove pre-/suffixes from string-type :class:`Series` (:issue:`36944`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 55a55d0111397..4ea29edb7d41b 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1907,6 +1907,69 @@ def rstrip(self, to_strip=None): result = self._data.array._str_rstrip(to_strip) return self._wrap_result(result) + _shared_docs[ + "str_removefix" + ] = r""" + Remove a %(side)s from an object series. If the %(side)s is not present, + the original string will be returned. + + Parameters + ---------- + %(side)s : str + %(side)s to remove. + + Returns + ------- + Series/Index: object + The Series or Index with given %(side)s removed. + + See Also + -------- + Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series. 
# NOTE: neither wrapper carries its own docstring on purpose -- the
# user-facing text is produced by ``Appender`` from the shared
# "str_removefix" template, filled in with the side being removed and the
# opposite side for the "See Also" entry.
@Appender(
    _shared_docs["str_removefix"] % dict(side="prefix", other_side="suffix")
)
@forbid_nonstring_types(["bytes"])
def removeprefix(self, prefix):
    # Delegate to the backing array's implementation, then re-wrap.
    stripped = self._data.array._str_removeprefix(prefix)
    return self._wrap_result(stripped)


@Appender(
    _shared_docs["str_removefix"] % dict(side="suffix", other_side="prefix")
)
@forbid_nonstring_types(["bytes"])
def removesuffix(self, suffix):
    # Delegate to the backing array's implementation, then re-wrap.
    stripped = self._data.array._str_removesuffix(suffix)
    return self._wrap_result(stripped)
b/pandas/core/strings/object_array.py @@ -3,6 +3,7 @@ from collections.abc import Callable # noqa: PDF001 import re import textwrap +from typing import TYPE_CHECKING import unicodedata import numpy as np @@ -20,6 +21,9 @@ from pandas.core.strings.base import BaseStringArrayMethods +if TYPE_CHECKING: + from pandas import Series + class ObjectStringArrayMixin(BaseStringArrayMethods): """ @@ -36,7 +40,7 @@ def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): """ - Map a callable over valid element of the array. + Map a callable over valid elements of the array. Parameters ---------- @@ -414,6 +418,30 @@ def _str_lstrip(self, to_strip=None): def _str_rstrip(self, to_strip=None): return self._str_map(lambda x: x.rstrip(to_strip)) + def _str_removeprefix(self, prefix: str) -> Series: + # outstanding question on whether to use native methods for users + # on Python 3.9+ https://git.io/JE9QK, in which case we could do + # return self._str_map(str.removeprefix) + + def removeprefix(text: str) -> str: + if text.startswith(prefix): + return text[len(prefix) :] + return text + + return self._str_map(removeprefix) + + def _str_removesuffix(self, suffix: str) -> Series: + # this could be used on Python 3.9+ + # f = lambda x: x.removesuffix(suffix) + # return self._str_map(str.removesuffix) + + def removesuffix(text: str) -> str: + if text.endswith(suffix): + return text[: -len(suffix)] + return text + + return self._str_map(removesuffix) + def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): regex = re.compile(pat, flags=flags) na_value = self._str_na_value diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 4fedbee91f649..15cc5af97a2d6 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -46,6 +46,8 @@ ("startswith", ("a",), {}), ("startswith", ("a",), {"na": True}), ("startswith", ("a",), {"na": False}), + ("removeprefix", ("a",), {}), + ("removesuffix", 
@pytest.mark.parametrize(
    "prefix, expected",
    [
        ("a", ["b", " b c", "bc"]),
        ("ab", ["", "a b c", "bc"]),
    ],
)
def test_removeprefix(any_string_dtype, prefix, expected):
    # Only elements that actually start with ``prefix`` are shortened;
    # the result keeps the input dtype.
    data = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
    want = Series(expected, dtype=any_string_dtype)
    got = data.str.removeprefix(prefix)
    tm.assert_series_equal(got, want)


@pytest.mark.parametrize(
    "suffix, expected",
    [
        ("c", ["ab", "a b ", "b"]),
        ("bc", ["ab", "a b c", ""]),
    ],
)
def test_removesuffix(any_string_dtype, suffix, expected):
    # Only elements that actually end with ``suffix`` are shortened;
    # the result keeps the input dtype.
    data = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
    want = Series(expected, dtype=any_string_dtype)
    got = data.str.removesuffix(suffix)
    tm.assert_series_equal(got, want)