Skip to content

add Series.str.remove(pre|suf)fix #43328

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Sep 6, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ Other enhancements
- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
- :meth:`read_table` now supports the argument ``storage_options`` (:issue:`39167`)
- Added :meth:`Series.str.removeprefix` and :meth:`Series.str.removesuffix`, mirroring the ``str`` methods introduced in Python 3.9, to remove a prefix or suffix from each element of a string-type :class:`Series` (:issue:`43328`)

.. ---------------------------------------------------------------------------

Expand Down
59 changes: 59 additions & 0 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1907,6 +1907,65 @@ def rstrip(self, to_strip=None):
result = self._data.array._str_rstrip(to_strip)
return self._wrap_result(result)

# Docstring template shared by ``removeprefix`` and ``removesuffix``.
# ``%(side)s`` and ``%(other_side)s`` are substituted with "prefix"/"suffix"
# via %-formatting at the point where the template is attached to a method.
_shared_docs[
    "str_remove"
] = r"""
Remove a %(side)s from an object series. If the %(side)s is not present,
the original string will be returned.

Parameters
----------
%(side)s : str
    %(side)s to remove.

Returns
-------
Series/Index: object
    The Series or Index with given %(side)s removed.

See Also
--------
Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.

Examples
--------
>>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
>>> s
0      str_foo
1      str_bar
2    no_prefix
dtype: object
>>> s.str.removeprefix("str_")
0          foo
1          bar
2    no_prefix
dtype: object

>>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
>>> s
0      foo_str
1      bar_str
2    no_suffix
dtype: object
>>> s.str.removesuffix("_str")
0          foo
1          bar
2    no_suffix
dtype: object
"""

@Appender(_shared_docs["str_remove"] % {"side": "prefix", "other_side": "suffix"})
@forbid_nonstring_types(["bytes"])
def removeprefix(self, prefix):
    # The user-facing docstring is attached by ``Appender`` above.
    # Delegate to the backing array's ``_str_removeprefix`` and pass the raw
    # result through ``_wrap_result`` before returning it.
    return self._wrap_result(self._data.array._str_removeprefix(prefix))

@Appender(_shared_docs["str_remove"] % {"side": "suffix", "other_side": "prefix"})
@forbid_nonstring_types(["bytes"])
def removesuffix(self, suffix):
    # The user-facing docstring is attached by ``Appender`` above.
    # Delegate to the backing array's ``_str_removesuffix`` and pass the raw
    # result through ``_wrap_result`` before returning it.
    return self._wrap_result(self._data.array._str_removesuffix(suffix))

@forbid_nonstring_types(["bytes"])
def wrap(self, width, **kwargs):
r"""
Expand Down
10 changes: 10 additions & 0 deletions pandas/core/strings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

from pandas._typing import Scalar

from pandas.core.series import Series
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

import of Series is added just for the type annotations and causing the circular import?

try

from typing import TYPE_CHECKING
...
if TYPE_CHECKING:
    from pandas import Series



class BaseStringArrayMethods(abc.ABC):
"""
Expand Down Expand Up @@ -223,6 +225,14 @@ def _str_lstrip(self, to_strip=None):
def _str_rstrip(self, to_strip=None):
pass

@abc.abstractmethod
def _str_removeprefix(self, prefix: str) -> Series:
    """
    Remove ``prefix`` from the start of each string element.

    Abstract hook: it has no implementation here and must be provided by
    the concrete string-array class.

    Parameters
    ----------
    prefix : str
        Prefix to remove.
    """

@abc.abstractmethod
def _str_removesuffix(self, suffix: str) -> Series:
    """
    Remove ``suffix`` from the end of each string element.

    Abstract hook: it has no implementation here and must be provided by
    the concrete string-array class.

    Parameters
    ----------
    suffix : str
        Suffix to remove.
    """

@abc.abstractmethod
def _str_split(self, pat=None, n=-1, expand=False):
pass
Expand Down
27 changes: 26 additions & 1 deletion pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pandas.core.dtypes.common import is_scalar
from pandas.core.dtypes.missing import isna

from pandas.core.series import Series
from pandas.core.strings.base import BaseStringArrayMethods


Expand All @@ -36,7 +37,7 @@ def _str_map(
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
):
"""
Map a callable over valid element of the array.
Map a callable over valid elements of the array.

Parameters
----------
Expand Down Expand Up @@ -414,6 +415,30 @@ def _str_lstrip(self, to_strip=None):
def _str_rstrip(self, to_strip=None):
return self._str_map(lambda x: x.rstrip(to_strip))

def _str_removeprefix(self, prefix: str) -> Series:
    """
    Strip ``prefix`` from the start of every element that begins with it.

    Elements that do not start with ``prefix`` are passed through unchanged.
    The per-element function is applied through ``self._str_map``.
    """
    # Hand-rolled rather than relying on the built-in ``str.removeprefix``,
    # which only exists on Python 3.9+. Whether to switch to the native
    # method on newer interpreters is still open: https://git.io/JE9QK

    def strip_leading(value: str) -> str:
        return value[len(prefix) :] if value.startswith(prefix) else value

    return self._str_map(strip_leading)

def _str_removesuffix(self, suffix: str) -> Series:
    """
    Strip ``suffix`` from the end of every element that ends with it.

    Elements that do not end with ``suffix`` are passed through unchanged.
    The per-element function is applied through ``self._str_map``.

    Parameters
    ----------
    suffix : str
        Suffix to remove. An empty suffix leaves every element unchanged.
    """
    # Hand-rolled rather than relying on the built-in ``str.removesuffix``,
    # which only exists on Python 3.9+.

    def removesuffix(text: str) -> str:
        # The ``suffix`` truthiness guard matters: every string ends with "",
        # and ``text[:-0]`` would evaluate to "" instead of ``text``.
        if suffix and text.endswith(suffix):
            # Negative bound drops the trailing ``len(suffix)`` characters.
            return text[: -len(suffix)]
        return text

    return self._str_map(removesuffix)

def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
regex = re.compile(pat, flags=flags)
na_value = self._str_na_value
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/strings/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
("startswith", ("a",), {}),
("startswith", ("a",), {"na": True}),
("startswith", ("a",), {"na": False}),
("removeprefix", ("a",)),
("removesuffix", ("a",)),
# translating unicode points of "a" to "d"
("translate", ({97: 100},), {}),
("wrap", (2,), {}),
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/strings/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,26 @@ def test_strip_lstrip_rstrip_args(any_string_dtype):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "prefix, expected",
    [
        ("a", ["b", " b c", "bc"]),
        ("ab", ["", "a b c", "bc"]),
    ],
)
def test_removeprefix(any_string_dtype, prefix, expected):
    # Elements starting with ``prefix`` lose it; the others must be untouched.
    values = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
    stripped = values.str.removeprefix(prefix)
    tm.assert_series_equal(stripped, Series(expected, dtype=any_string_dtype))


@pytest.mark.parametrize(
    "suffix, expected",
    [
        ("c", ["ab", "a b ", "b"]),
        ("bc", ["ab", "a b c", ""]),
    ],
)
def test_removesuffix(any_string_dtype, suffix, expected):
    # Elements ending with ``suffix`` lose it; the others must be untouched.
    values = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
    stripped = values.str.removesuffix(suffix)
    tm.assert_series_equal(stripped, Series(expected, dtype=any_string_dtype))


def test_string_slice_get_syntax(any_string_dtype):
ser = Series(
["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"],
Expand Down