Skip to content

add Series.str.remove(pre|suf)fix #43328

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Sep 6, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ Other enhancements
- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
- :meth:`read_table` now supports the argument ``storage_options`` (:issue:`39167`)
- Added :meth:`Series.str.removeprefix` and :meth:`Series.str.removesuffix`, mirroring the ``str`` methods introduced in Python 3.9, to remove a prefix or suffix from string-type :class:`Series` (:issue:`43328`)

.. ---------------------------------------------------------------------------

Expand Down
59 changes: 59 additions & 0 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1907,6 +1907,65 @@ def rstrip(self, to_strip=None):
result = self._data.array._str_rstrip(to_strip)
return self._wrap_result(result)

# Docstring template shared by ``removeprefix`` and ``removesuffix``; the
# ``%(side)s`` / ``%(other_side)s`` placeholders are filled in with
# "prefix"/"suffix" where the template is applied to each method.
_shared_docs[
    "str_remove"
] = r"""
    Remove a %(side)s from an object series. If the %(side)s is not present,
    the original string will be returned.

    Parameters
    ----------
    %(side)s : str
        %(side)s to remove.

    Returns
    -------
    Series/Index: object
        The Series or Index with given %(side)s removed.

    See Also
    --------
    Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.

    Examples
    --------
    >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
    >>> s
    0      str_foo
    1      str_bar
    2    no_prefix
    dtype: object
    >>> s.str.removeprefix("str_")
    0          foo
    1          bar
    2    no_prefix
    dtype: object

    >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
    >>> s
    0      foo_str
    1      bar_str
    2    no_suffix
    dtype: object
    >>> s.str.removesuffix("_str")
    0          foo
    1          bar
    2    no_suffix
    dtype: object
    """

@Appender(_shared_docs["str_remove"] % {"side": "prefix", "other_side": "suffix"})
@forbid_nonstring_types(["bytes"])
def removeprefix(self, prefix):
    # No inline docstring on purpose: the user-facing documentation is
    # supplied by the ``Appender`` decorator from the shared template.
    # The work is delegated to the backing array's ``_str_removeprefix``
    # and the raw result is then passed through ``_wrap_result``.
    return self._wrap_result(self._data.array._str_removeprefix(prefix))

@Appender(_shared_docs["str_remove"] % {"side": "suffix", "other_side": "prefix"})
@forbid_nonstring_types(["bytes"])
def removesuffix(self, suffix):
    # No inline docstring on purpose: the user-facing documentation is
    # supplied by the ``Appender`` decorator from the shared template.
    # The work is delegated to the backing array's ``_str_removesuffix``
    # and the raw result is then passed through ``_wrap_result``.
    return self._wrap_result(self._data.array._str_removesuffix(suffix))

@forbid_nonstring_types(["bytes"])
def wrap(self, width, **kwargs):
r"""
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/strings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@
import abc
from collections.abc import Callable # noqa: PDF001
import re
from typing import TYPE_CHECKING

import numpy as np

from pandas._typing import Scalar

if TYPE_CHECKING:
from pandas import Series


class BaseStringArrayMethods(abc.ABC):
"""
Expand Down Expand Up @@ -223,6 +227,14 @@ def _str_lstrip(self, to_strip=None):
def _str_rstrip(self, to_strip=None):
pass

@abc.abstractmethod
def _str_removeprefix(self, prefix: str) -> Series:
    """
    Backend hook for prefix removal; concrete string arrays must override it.

    Parameters
    ----------
    prefix : str
        The prefix to remove.
    """

@abc.abstractmethod
def _str_removesuffix(self, suffix: str) -> Series:
    """
    Backend hook for suffix removal; concrete string arrays must override it.

    Parameters
    ----------
    suffix : str
        The suffix to remove.
    """

@abc.abstractmethod
def _str_split(self, pat=None, n=-1, expand=False):
pass
Expand Down
30 changes: 29 additions & 1 deletion pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections.abc import Callable # noqa: PDF001
import re
import textwrap
from typing import TYPE_CHECKING
import unicodedata

import numpy as np
Expand All @@ -20,6 +21,9 @@

from pandas.core.strings.base import BaseStringArrayMethods

if TYPE_CHECKING:
from pandas import Series


class ObjectStringArrayMixin(BaseStringArrayMethods):
"""
Expand All @@ -36,7 +40,7 @@ def _str_map(
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
):
"""
Map a callable over valid element of the array.
Map a callable over valid elements of the array.

Parameters
----------
Expand Down Expand Up @@ -414,6 +418,30 @@ def _str_lstrip(self, to_strip=None):
def _str_rstrip(self, to_strip=None):
return self._str_map(lambda x: x.rstrip(to_strip))

def _str_removeprefix(self, prefix: str) -> Series:
    """
    Drop ``prefix`` from the front of every element that starts with it.

    Elements that do not start with ``prefix`` are passed through
    unchanged. The per-element function is applied via ``self._str_map``.

    Parameters
    ----------
    prefix : str
        The prefix to remove.
    """
    # Hand-written equivalent of ``str.removeprefix``. Whether to call the
    # native method for users on Python 3.9+ is an outstanding question,
    # see https://git.io/JE9QK.

    def strip_leading(value: str) -> str:
        # Cut off exactly ``len(prefix)`` leading characters on a match.
        return value[len(prefix) :] if value.startswith(prefix) else value

    return self._str_map(strip_leading)

def _str_removesuffix(self, suffix: str) -> Series:
    """
    Drop ``suffix`` from the end of every element that ends with it.

    Elements that do not end with ``suffix`` are passed through
    unchanged. The per-element function is applied via ``self._str_map``.

    Parameters
    ----------
    suffix : str
        The suffix to remove.
    """
    # Hand-written equivalent of ``str.removesuffix``; the native method
    # could be used instead on Python 3.9+.

    def removesuffix(text: str) -> str:
        if text.endswith(suffix):
            # Keep everything *before* the suffix. The end index is computed
            # from the front (rather than using ``-len(suffix)``) so that an
            # empty suffix yields the whole string instead of ``text[:0]``.
            return text[: len(text) - len(suffix)]
        return text

    return self._str_map(removesuffix)

def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
regex = re.compile(pat, flags=flags)
na_value = self._str_na_value
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/strings/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
("startswith", ("a",), {}),
("startswith", ("a",), {"na": True}),
("startswith", ("a",), {"na": False}),
("removeprefix", ("a",), {}),
("removesuffix", ("a",), {}),
# translating unicode points of "a" to "d"
("translate", ({97: 100},), {}),
("wrap", (2,), {}),
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/strings/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,26 @@ def test_strip_lstrip_rstrip_args(any_string_dtype):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "prefix, expected",
    [
        ("a", ["b", " b c", "bc"]),
        ("ab", ["", "a b c", "bc"]),
    ],
)
def test_removeprefix(any_string_dtype, prefix, expected):
    # Each case strips ``prefix`` from the same three inputs; elements that
    # do not start with it are expected to come back unchanged.
    values = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
    stripped = values.str.removeprefix(prefix)
    want = Series(expected, dtype=any_string_dtype)
    tm.assert_series_equal(stripped, want)


@pytest.mark.parametrize(
    "suffix, expected",
    [
        ("c", ["ab", "a b ", "b"]),
        ("bc", ["ab", "a b c", ""]),
    ],
)
def test_removesuffix(any_string_dtype, suffix, expected):
    # Each case strips ``suffix`` from the same three inputs; elements that
    # do not end with it are expected to come back unchanged.
    values = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
    stripped = values.str.removesuffix(suffix)
    want = Series(expected, dtype=any_string_dtype)
    tm.assert_series_equal(stripped, want)


def test_string_slice_get_syntax(any_string_dtype):
ser = Series(
["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"],
Expand Down