From 544dd79f684b6307823072aca3cf09469611981c Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 12:16:37 +0100 Subject: [PATCH 01/21] add test --- pandas/tests/test_strings.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 8e25d5a43ee3d..a58e3509a9648 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3678,3 +3678,10 @@ def test_str_accessor_in_apply_func(): expected = Series(["A/D", "B/E", "C/F"]) result = df.apply(lambda f: "/".join(f.str.upper()), axis=1) tm.assert_series_equal(result, expected) + + +def test_str_removeprefix(): + df = DataFrame({"A": ["str_string1", "str_string2", "str_string3"]}) + df["A"] = df["A"].str.removeprefix("str") + expected = DataFrame({"A": ["string1", "string2", "string3"]}) + tm.assert_frame_equal(df, expected) From e0ea92d6e7e998a9fbf2d6792baaf17bb55a1a39 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 13:51:18 +0100 Subject: [PATCH 02/21] first working version removeprefix --- pandas/core/strings/accessor.py | 5 +++++ pandas/core/strings/base.py | 4 ++++ pandas/core/strings/object_array.py | 3 +++ pandas/tests/test_strings.py | 4 +++- 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 067fff3e0a744..5b253022a502b 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2885,6 +2885,11 @@ def casefold(self): "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] ) + @forbid_nonstring_types(["bytes"]) + def removeprefix(self, prefix=None): + result = self._data.array._str_removeprefix(prefix) + return self._wrap_result(result) + def cat_safe(list_of_columns: List, sep: str): """ diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 08064244a2ff9..4842d388897db 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -223,3 +223,7 @@ def _str_split(self, pat=None, n=-1, expand=False): @abc.abstractmethod def _str_rsplit(self, pat=None, n=-1): pass + + @abc.abstractmethod + def _str_removeprefix(self, prefix=None): + pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 471f1e521b991..5f15a26090675 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -430,3 +430,6 @@ def _str_lstrip(self, to_strip=None): def _str_rstrip(self, to_strip=None): return self._str_map(lambda x: x.rstrip(to_strip)) + + def _str_removeprefix(self, prefix=None): + return self._str_map(lambda x: x.removeprefix(prefix)) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index a58e3509a9648..ea2f2ed195642 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -96,6 +96,7 @@ def assert_series_or_index_equal(left, right): "title", "upper", "casefold", + "removeprefix" ], [()] * 100, [{}] * 100, @@ -3681,7 +3682,8 @@ def test_str_accessor_in_apply_func(): def test_str_removeprefix(): + # https://github.com/pandas-dev/pandas/issues/36944 df = DataFrame({"A": ["str_string1", "str_string2", "str_string3"]}) - df["A"] = df["A"].str.removeprefix("str") + df["A"] = df["A"].str.removeprefix("str_") expected = DataFrame({"A": ["string1", "string2", "string3"]}) tm.assert_frame_equal(df, expected) From 151b0f75947c93b142857b358a6f50c957486dfe Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 13:52:16 +0100 Subject: [PATCH 03/21] fixes black --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index ea2f2ed195642..de668395ff95f 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -96,7 +96,7 @@ def assert_series_or_index_equal(left, right): "title", "upper", "casefold", - "removeprefix" + "removeprefix", ], [()] * 100, [{}] * 100, From c309c28463738abf5c5f2427d159f7af7483eec4 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 14:03:56 +0100 Subject: [PATCH 04/21] whatsnew entry --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ab00b749d5725..d05de57e207fc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -52,6 +52,7 @@ Other enhancements - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`Series.apply` can now accept list-like or dictionary-like arguments that aren't lists or dictionaries, e.g. ``ser.apply(np.array(["sum", "mean"]))``, which was already the case for :meth:`DataFrame.apply` (:issue:`39140`) +- Added :meth:`Series.str.removeprefix` which has same functionality as `str.removeprefix` from the Python stanard library. .. --------------------------------------------------------------------------- From adae1e0067d12b084a7e227c4bffcbf606d0dbb5 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 14:17:28 +0100 Subject: [PATCH 05/21] add two backticks --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d05de57e207fc..f815423d17a02 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -52,7 +52,7 @@ Other enhancements - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`Series.apply` can now accept list-like or dictionary-like arguments that aren't lists or dictionaries, e.g. ``ser.apply(np.array(["sum", "mean"]))``, which was already the case for :meth:`DataFrame.apply` (:issue:`39140`) -- Added :meth:`Series.str.removeprefix` which has same functionality as `str.removeprefix` from the Python stanard library. +- Added :meth:`Series.str.removeprefix` which has same functionality as ``str.removeprefix`` from the Python stanard library. .. --------------------------------------------------------------------------- From 1329b708cdaa945785c3873e76ce04fbe2eb55aa Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 14:19:41 +0100 Subject: [PATCH 06/21] typo whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f815423d17a02..691886b6d70d2 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -52,7 +52,7 @@ Other enhancements - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`Series.apply` can now accept list-like or dictionary-like arguments that aren't lists or dictionaries, e.g. ``ser.apply(np.array(["sum", "mean"]))``, which was already the case for :meth:`DataFrame.apply` (:issue:`39140`) -- Added :meth:`Series.str.removeprefix` which has same functionality as ``str.removeprefix`` from the Python stanard library. +- Added :meth:`Series.str.removeprefix` to remove prefixes from string type Series, which has the same functionality as ``str.removeprefix`` from the Python standard library. .. --------------------------------------------------------------------------- From 08c0b42f584b7a19bb5c8d3d6250dddd6474fc0d Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 15:40:10 +0100 Subject: [PATCH 07/21] only run test for python 3.9 or higher --- pandas/tests/test_strings.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index de668395ff95f..ab9763a5825bf 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1,5 +1,6 @@ from datetime import datetime, timedelta import re +import sys import numpy as np import pytest @@ -3686,4 +3687,7 @@ def test_str_removeprefix(): df = DataFrame({"A": ["str_string1", "str_string2", "str_string3"]}) df["A"] = df["A"].str.removeprefix("str_") expected = DataFrame({"A": ["string1", "string2", "string3"]}) - tm.assert_frame_equal(df, expected) + + # str.removeprefix only has to be tested on python >= 3.9 + if sys.version_info[0] >= 3 and sys.version_info[1] >= 9: + tm.assert_frame_equal(df, expected) From 0b3b84eac1d943153d2aa9c170904a4e985cbdbc Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 15:40:46 +0100 Subject: [PATCH 08/21] whole test in if block --- pandas/tests/test_strings.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index ab9763a5825bf..8f73be6b99ad3 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3684,10 +3684,11 @@ def test_str_accessor_in_apply_func(): def test_str_removeprefix(): # https://github.com/pandas-dev/pandas/issues/36944 - df = DataFrame({"A": ["str_string1", "str_string2", "str_string3"]}) - df["A"] = df["A"].str.removeprefix("str_") - expected = DataFrame({"A": ["string1", "string2", "string3"]}) - # str.removeprefix only has to be tested on python >= 3.9 if sys.version_info[0] >= 3 and sys.version_info[1] >= 9: + df = DataFrame({"A": ["str_string1", "str_string2", "str_string3"]}) + df["A"] = df["A"].str.removeprefix("str_") + expected = DataFrame({"A": ["string1", "string2", "string3"]}) + + # str.removeprefix only has to be tested on python >= 3.9 tm.assert_frame_equal(df, expected) From 5b8eaf3f3abaa551536b008b27a021f2f2f11b05 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 15:42:46 +0100 Subject: [PATCH 09/21] make test for version more general --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 8f73be6b99ad3..7f440607de81b 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3685,7 +3685,7 @@ def test_str_accessor_in_apply_func(): def test_str_removeprefix(): # https://github.com/pandas-dev/pandas/issues/36944 - if sys.version_info[0] >= 3 and sys.version_info[1] >= 9: + if sys.version_info[0] + sys.version_info[1] >= 12: df = DataFrame({"A": ["str_string1", "str_string2", "str_string3"]}) df["A"] = df["A"].str.removeprefix("str_") expected = DataFrame({"A": ["string1", "string2", "string3"]}) From 33bba5d24beef84d4010b0691961679785e82bdd Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 15:47:23 +0100 Subject: [PATCH 10/21] use pytest.mark.skipif --- pandas/tests/test_strings.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 7f440607de81b..266f0ad01f300 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3682,13 +3682,13 @@ def test_str_accessor_in_apply_func(): tm.assert_series_equal(result, expected) +@pytest.mark.skipif(sum(sys.version_info[:3]) < 12, reason="Requires python 3.9 or greater") def test_str_removeprefix(): # https://github.com/pandas-dev/pandas/issues/36944 - if sys.version_info[0] + sys.version_info[1] >= 12: - df = DataFrame({"A": ["str_string1", "str_string2", "str_string3"]}) - df["A"] = df["A"].str.removeprefix("str_") - expected = DataFrame({"A": ["string1", "string2", "string3"]}) + df = DataFrame({"A": ["str_string1", "str_string2", "str_string3"]}) + df["A"] = df["A"].str.removeprefix("str_") + expected = DataFrame({"A": ["string1", "string2", "string3"]}) - # str.removeprefix only has to be tested on python >= 3.9 - tm.assert_frame_equal(df, expected) + # str.removeprefix only has to be tested on python >= 3.9 + tm.assert_frame_equal(df, expected) From c5c219711d4236eddd330a4372d9731350ebefb1 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 15:51:57 +0100 Subject: [PATCH 11/21] fix test --- pandas/tests/test_strings.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 266f0ad01f300..51911ec611db7 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3682,13 +3682,14 @@ def test_str_accessor_in_apply_func(): tm.assert_series_equal(result, expected) -@pytest.mark.skipif(sum(sys.version_info[:3]) < 12, reason="Requires python 3.9 or greater") +@pytest.mark.skipif( + sum(sys.version_info[:3]) < 12, reason="Requires python 3.9 or greater" +) def test_str_removeprefix(): # https://github.com/pandas-dev/pandas/issues/36944 df = DataFrame({"A": ["str_string1", "str_string2", "str_string3"]}) df["A"] = df["A"].str.removeprefix("str_") - expected = DataFrame({"A": ["string1", "string2", "string3"]}) + result = DataFrame({"A": ["string1", "string2", "string3"]}) - # str.removeprefix only has to be tested on python >= 3.9 - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, result) From db5829fb10d73860bf24648b3a1389655efd8ff6 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 16:17:58 +0100 Subject: [PATCH 12/21] check for missing argument --- pandas/core/strings/accessor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 5b253022a502b..49552127570cc 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2887,8 +2887,11 @@ def casefold(self): @forbid_nonstring_types(["bytes"]) def removeprefix(self, prefix=None): - result = self._data.array._str_removeprefix(prefix) - return self._wrap_result(result) + if not prefix: + return self._data + else: + result = self._data.array._str_removeprefix(prefix) + return self._wrap_result(result) def cat_safe(list_of_columns: List, sep: str): From a9fe5deec1dc804a70178125d799f8b7e704e086 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 17:19:08 +0100 Subject: [PATCH 13/21] add removeprefix test any_string_method --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 51911ec611db7..2405fa7bb0f51 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -66,6 +66,7 @@ def assert_series_or_index_equal(left, right): ("translate", ({97: 100},), {}), ("wrap", (2,), {}), ("zfill", (10,), {}), + ("removeprefix", ("a ",), {}), ] + list( zip( [ @@ -97,7 +98,6 @@ def assert_series_or_index_equal(left, right): "title", "upper", "casefold", - "removeprefix", ], [()] * 100, [{}] * 100, From 219abf35bd9ca088a0fc67128ef721de5cf0171a Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 17 Jan 2021 19:15:15 +0100 Subject: [PATCH 14/21] check for version 3.9 --- pandas/tests/test_strings.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 2405fa7bb0f51..3af77b7d10da6 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -309,8 +309,18 @@ def test_api_for_categorical(self, any_string_method): method_name, args, kwargs = any_string_method - result = getattr(c.str, method_name)(*args, **kwargs) - expected = getattr(s.str, method_name)(*args, **kwargs) + # we expect an AttributeError when str.removeprefix is tested on <= 3.9 + if ( + sys.version_info[0] <= 3 + and sys.version_info[1] < 9 + and method_name == "removeprefix" + ): + with pytest.raises(AttributeError): + result = getattr(c.str, method_name)(*args, **kwargs) + expected = getattr(s.str, method_name)(*args, **kwargs) + else: + result = getattr(c.str, method_name)(*args, **kwargs) + expected = getattr(s.str, method_name)(*args, **kwargs) if isinstance(result, DataFrame): tm.assert_frame_equal(result, expected) @@ -3683,7 +3693,8 @@ def test_str_accessor_in_apply_func(): @pytest.mark.skipif( - sum(sys.version_info[:3]) < 12, reason="Requires python 3.9 or greater" + sys.version_info[0] <= 3 and sys.version_info[1] < 9, + reason="Requires python 3.9 or greater", ) def test_str_removeprefix(): # https://github.com/pandas-dev/pandas/issues/36944 From 91e958fb6db5ac15aedd54d896465675b4e59b6b Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Tue, 26 Jan 2021 12:38:31 +0100 Subject: [PATCH 15/21] implementation not python version dependant --- pandas/core/strings/object_array.py | 7 ++++++- pandas/tests/test_strings.py | 12 ++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 5f15a26090675..dae8c868d054b 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -432,4 +432,9 @@ def _str_rstrip(self, to_strip=None): return self._str_map(lambda x: x.rstrip(to_strip)) def _str_removeprefix(self, prefix=None): - return self._str_map(lambda x: x.removeprefix(prefix)) + f_startswith = lambda x: x.startswith(prefix) + f_slice = lambda x: x[len(prefix):] + + has_prefix = self._str_map(f_startswith) + + return np.where(has_prefix, self._str_map(f_slice), self) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 3af77b7d10da6..1f6c593585f36 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3692,15 +3692,15 @@ def test_str_accessor_in_apply_func(): tm.assert_series_equal(result, expected) -@pytest.mark.skipif( - sys.version_info[0] <= 3 and sys.version_info[1] < 9, - reason="Requires python 3.9 or greater", -) +# @pytest.mark.skipif( +# sys.version_info[0] <= 3 and sys.version_info[1] < 9, +# reason="Requires python 3.9 or greater", +# ) def test_str_removeprefix(): # https://github.com/pandas-dev/pandas/issues/36944 - df = DataFrame({"A": ["str_string1", "str_string2", "str_string3"]}) + df = DataFrame({"A": ["str_string1", "str_string2", "str_string3", "string_no_prefix"]}) df["A"] = df["A"].str.removeprefix("str_") - result = DataFrame({"A": ["string1", "string2", "string3"]}) + result = DataFrame({"A": ["string1", "string2", "string3", "string_no_prefix"]}) tm.assert_frame_equal(df, result) From 8d2a25a6d03cf2dfe97aa6053238295d47e689fa Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Tue, 26 Jan 2021 12:40:19 +0100 Subject: [PATCH 16/21] do not return copy --- pandas/core/strings/accessor.py | 7 ++----- pandas/tests/test_strings.py | 8 +++----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 49552127570cc..5b253022a502b 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2887,11 +2887,8 @@ def casefold(self): @forbid_nonstring_types(["bytes"]) def removeprefix(self, prefix=None): - if not prefix: - return self._data - else: - result = self._data.array._str_removeprefix(prefix) - return self._wrap_result(result) + result = self._data.array._str_removeprefix(prefix) + return self._wrap_result(result) def cat_safe(list_of_columns: List, sep: str): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 1f6c593585f36..d8dad49104a2c 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3692,14 +3692,12 @@ def test_str_accessor_in_apply_func(): tm.assert_series_equal(result, expected) -# @pytest.mark.skipif( -# sys.version_info[0] <= 3 and sys.version_info[1] < 9, -# reason="Requires python 3.9 or greater", -# ) def test_str_removeprefix(): # https://github.com/pandas-dev/pandas/issues/36944 - df = DataFrame({"A": ["str_string1", "str_string2", "str_string3", "string_no_prefix"]}) + df = DataFrame( + {"A": ["str_string1", "str_string2", "str_string3", "string_no_prefix"]} + ) df["A"] = df["A"].str.removeprefix("str_") result = DataFrame({"A": ["string1", "string2", "string3", "string_no_prefix"]}) From 7d0838abff5aa864d588ed09417a33f52e614802 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Tue, 26 Jan 2021 13:39:17 +0100 Subject: [PATCH 17/21] move test to correct place --- pandas/tests/strings/conftest.py | 1 + pandas/tests/strings/test_strings.py | 12 + pandas/tests/test_strings.py | 3704 -------------------------- 3 files changed, 13 insertions(+), 3704 deletions(-) delete mode 100644 pandas/tests/test_strings.py diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 4fedbee91f649..1d27b73ab55f1 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -50,6 +50,7 @@ ("translate", ({97: 100},), {}), ("wrap", (2,), {}), ("zfill", (10,), {}), + ("removeprefix", ("a ",), {}), ] + list( zip( [ diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 92e7bf258d2d7..6849a64b7fdcc 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -767,3 +767,15 @@ def test_str_accessor_in_apply_func(): expected = Series(["A/D", "B/E", "C/F"]) result = df.apply(lambda f: "/".join(f.str.upper()), axis=1) tm.assert_series_equal(result, expected) + + +def test_str_removeprefix(): + # https://github.com/pandas-dev/pandas/issues/36944 + + df = DataFrame( + {"A": ["str_string1", "str_string2", "str_string3", "string_no_prefix"]} + ) + df["A"] = df["A"].str.removeprefix("str_") + result = DataFrame({"A": ["string1", "string2", "string3", "string_no_prefix"]}) + + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py deleted file mode 100644 index d8dad49104a2c..0000000000000 --- a/pandas/tests/test_strings.py +++ /dev/null @@ -1,3704 +0,0 @@ -from datetime import datetime, timedelta -import re -import sys - -import numpy as np -import pytest - -from pandas._libs import lib - -import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna -import pandas._testing as tm -import pandas.core.strings as strings - - -def assert_series_or_index_equal(left, right): - if isinstance(left, Series): - tm.assert_series_equal(left, right) - else: # Index - tm.assert_index_equal(left, right) - - -_any_string_method = [ - ("cat", (), {"sep": ","}), - ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}), - ("center", (10,), {}), - ("contains", ("a",), {}), - ("count", ("a",), {}), - ("decode", ("UTF-8",), {}), - ("encode", ("UTF-8",), {}), - ("endswith", ("a",), {}), - ("endswith", ("a",), {"na": True}), - ("endswith", ("a",), {"na": False}), - ("extract", ("([a-z]*)",), {"expand": False}), - ("extract", ("([a-z]*)",), {"expand": True}), - ("extractall", ("([a-z]*)",), {}), - ("find", ("a",), {}), - ("findall", ("a",), {}), - ("get", (0,), {}), - # because "index" (and "rindex") fail intentionally - # if the string is not found, search only for empty string - ("index", ("",), {}), - ("join", (",",), {}), - ("ljust", (10,), {}), - ("match", ("a",), {}), - ("fullmatch", ("a",), {}), - ("normalize", ("NFC",), {}), - ("pad", (10,), {}), - ("partition", (" ",), {"expand": False}), - ("partition", (" ",), {"expand": True}), - ("repeat", (3,), {}), - ("replace", ("a", "z"), {}), - ("rfind", ("a",), {}), - ("rindex", ("",), {}), - ("rjust", (10,), {}), - ("rpartition", (" ",), {"expand": False}), - ("rpartition", (" ",), {"expand": True}), - ("slice", (0, 1), {}), - ("slice_replace", (0, 1, "z"), {}), - ("split", (" ",), {"expand": False}), - ("split", (" ",), {"expand": True}), - ("startswith", ("a",), {}), - ("startswith", ("a",), {"na": True}), - ("startswith", ("a",), {"na": False}), - # translating unicode points of "a" to "d" - ("translate", ({97: 100},), {}), - ("wrap", (2,), {}), - ("zfill", (10,), {}), - ("removeprefix", ("a ",), {}), -] + list( - zip( - [ - # methods without positional arguments: zip with empty tuple and empty dict - "capitalize", - "cat", - "get_dummies", - "isalnum", - "isalpha", - "isdecimal", - "isdigit", - "islower", - "isnumeric", - "isspace", - "istitle", - "isupper", - "len", - "lower", - "lstrip", - "partition", - "rpartition", - "rsplit", - "rstrip", - "slice", - "slice_replace", - "split", - "strip", - "swapcase", - "title", - "upper", - "casefold", - ], - [()] * 100, - [{}] * 100, - ) -) -ids, _, _ = zip(*_any_string_method) # use method name as fixture-id - - -# test that the above list captures all methods of StringMethods -missing_methods = { - f for f in dir(strings.StringMethods) if not f.startswith("_") -} - set(ids) -assert not missing_methods - - -@pytest.fixture(params=_any_string_method, ids=ids) -def any_string_method(request): - """ - Fixture for all public methods of `StringMethods` - - This fixture returns a tuple of the method name and sample arguments - necessary to call the method. - - Returns - ------- - method_name : str - The name of the method in `StringMethods` - args : tuple - Sample values for the positional arguments - kwargs : dict - Sample values for the keyword arguments - - Examples - -------- - >>> def test_something(any_string_method): - ... s = Series(['a', 'b', np.nan, 'd']) - ... - ... method_name, args, kwargs = any_string_method - ... method = getattr(s.str, method_name) - ... # will not raise - ... method(*args, **kwargs) - """ - return request.param - - -# subset of the full set from pandas/conftest.py -_any_allowed_skipna_inferred_dtype = [ - ("string", ["a", np.nan, "c"]), - ("bytes", [b"a", np.nan, b"c"]), - ("empty", [np.nan, np.nan, np.nan]), - ("empty", []), - ("mixed-integer", ["a", np.nan, 2]), -] -ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id - - -@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) -def any_allowed_skipna_inferred_dtype(request): - """ - Fixture for all (inferred) dtypes allowed in StringMethods.__init__ - - The covered (inferred) types are: - * 'string' - * 'empty' - * 'bytes' - * 'mixed' - * 'mixed-integer' - - Returns - ------- - inferred_dtype : str - The string for the inferred dtype from _libs.lib.infer_dtype - values : np.ndarray - An array of object dtype that will be inferred to have - `inferred_dtype` - - Examples - -------- - >>> import pandas._libs.lib as lib - >>> - >>> def test_something(any_allowed_skipna_inferred_dtype): - ... inferred_dtype, values = any_allowed_skipna_inferred_dtype - ... # will pass - ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype - ... - ... # constructor for .str-accessor will also pass - ... Series(values).str - """ - inferred_dtype, values = request.param - values = np.array(values, dtype=object) # object dtype to avoid casting - - # correctness of inference tested in tests/dtypes/test_inference.py - return inferred_dtype, values - - -class TestStringMethods: - def test_api(self): - - # GH 6106, GH 9322 - assert Series.str is strings.StringMethods - assert isinstance(Series([""]).str, strings.StringMethods) - - def test_api_mi_raises(self): - # GH 23679 - mi = MultiIndex.from_arrays([["a", "b", "c"]]) - msg = "Can only use .str accessor with Index, not MultiIndex" - with pytest.raises(AttributeError, match=msg): - mi.str - assert not hasattr(mi, "str") - - @pytest.mark.parametrize("dtype", [object, "category"]) - def test_api_per_dtype(self, index_or_series, dtype, any_skipna_inferred_dtype): - # one instance of parametrized fixture - box = index_or_series - inferred_dtype, values = any_skipna_inferred_dtype - - t = box(values, dtype=dtype) # explicit dtype to avoid casting - - types_passing_constructor = [ - "string", - "unicode", - "empty", - "bytes", - "mixed", - "mixed-integer", - ] - if inferred_dtype in types_passing_constructor: - # GH 6106 - assert isinstance(t.str, strings.StringMethods) - else: - # GH 9184, GH 23011, GH 23163 - msg = "Can only use .str accessor with string values.*" - with pytest.raises(AttributeError, match=msg): - t.str - assert not hasattr(t, "str") - - @pytest.mark.parametrize("dtype", [object, "category"]) - def test_api_per_method( - self, - index_or_series, - dtype, - any_allowed_skipna_inferred_dtype, - any_string_method, - request, - ): - # this test does not check correctness of the different methods, - # just that the methods work on the specified (inferred) dtypes, - # and raise on all others - box = index_or_series - - # one instance of each parametrized fixture - inferred_dtype, values = any_allowed_skipna_inferred_dtype - method_name, args, kwargs = any_string_method - - # TODO: get rid of these xfails - reason = None - if box is Index and values.size == 0: - if method_name in ["partition", "rpartition"] and kwargs.get( - "expand", True - ): - reason = "Method cannot deal with empty Index" - elif method_name == "split" and kwargs.get("expand", None): - reason = "Split fails on empty Series when expand=True" - elif method_name == "get_dummies": - reason = "Need to fortify get_dummies corner cases" - - elif box is Index and inferred_dtype == "empty" and dtype == object: - if method_name == "get_dummies": - reason = "Need to fortify get_dummies corner cases" - - if reason is not None: - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - - t = box(values, dtype=dtype) # explicit dtype to avoid casting - method = getattr(t.str, method_name) - - bytes_allowed = method_name in ["decode", "get", "len", "slice"] - # as of v0.23.4, all methods except 'cat' are very lenient with the - # allowed data types, just returning NaN for entries that error. - # This could be changed with an 'errors'-kwarg to the `str`-accessor, - # see discussion in GH 13877 - mixed_allowed = method_name not in ["cat"] - - allowed_types = ( - ["string", "unicode", "empty"] - + ["bytes"] * bytes_allowed - + ["mixed", "mixed-integer"] * mixed_allowed - ) - - if inferred_dtype in allowed_types: - # xref GH 23555, GH 23556 - method(*args, **kwargs) # works! - else: - # GH 23011, GH 23163 - msg = ( - f"Cannot use .str.{method_name} with values of " - f"inferred dtype {repr(inferred_dtype)}." - ) - with pytest.raises(TypeError, match=msg): - method(*args, **kwargs) - - def test_api_for_categorical(self, any_string_method): - # https://github.com/pandas-dev/pandas/issues/10661 - s = Series(list("aabb")) - s = s + " " + s - c = s.astype("category") - assert isinstance(c.str, strings.StringMethods) - - method_name, args, kwargs = any_string_method - - # we expect an AttributeError when str.removeprefix is tested on <= 3.9 - if ( - sys.version_info[0] <= 3 - and sys.version_info[1] < 9 - and method_name == "removeprefix" - ): - with pytest.raises(AttributeError): - result = getattr(c.str, method_name)(*args, **kwargs) - expected = getattr(s.str, method_name)(*args, **kwargs) - else: - result = getattr(c.str, method_name)(*args, **kwargs) - expected = getattr(s.str, method_name)(*args, **kwargs) - - if isinstance(result, DataFrame): - tm.assert_frame_equal(result, expected) - elif isinstance(result, Series): - tm.assert_series_equal(result, expected) - else: - # str.cat(others=None) returns string, for example - assert result == expected - - def test_iter(self): - # GH3638 - strs = "google", "wikimedia", "wikipedia", "wikitravel" - ds = Series(strs) - - with tm.assert_produces_warning(FutureWarning): - for s in ds.str: - # iter must yield a Series - assert isinstance(s, Series) - - # indices of each yielded Series should be equal to the index of - # the original Series - tm.assert_index_equal(s.index, ds.index) - - for el in s: - # each element of the series is either a basestring/str or nan - assert isinstance(el, str) or isna(el) - - # desired behavior is to iterate until everything would be nan on the - # next iter so make sure the last element of the iterator was 'l' in - # this case since 'wikitravel' is the longest string - assert s.dropna().values.item() == "l" - - def test_iter_empty(self): - ds = Series([], dtype=object) - - i, s = 100, 1 - - with tm.assert_produces_warning(FutureWarning): - for i, s in enumerate(ds.str): - pass - - # nothing to iterate over so nothing defined values should remain - # unchanged - assert i == 100 - assert s == 1 - - def test_iter_single_element(self): - ds = Series(["a"]) - - with tm.assert_produces_warning(FutureWarning): - for i, s in enumerate(ds.str): - pass - - assert not i - tm.assert_series_equal(ds, s) - - def test_iter_object_try_string(self): - ds = Series( - [ - slice(None, np.random.randint(10), np.random.randint(10, 20)) - for _ in range(4) - ] - ) - - i, s = 100, "h" - - with tm.assert_produces_warning(FutureWarning): - for i, s in enumerate(ds.str): - pass - - assert i == 100 - assert s == "h" - - @pytest.mark.parametrize("other", [None, Series, Index]) - def test_str_cat_name(self, index_or_series, other): - # GH 21053 - box = index_or_series - values = ["a", "b"] - if other: - other = other(values) - else: - other = values - result = box(values, name="name").str.cat(other, sep=",") - assert result.name == "name" - - def test_str_cat(self, index_or_series): - box = index_or_series - # test_cat above tests "str_cat" from ndarray; - # here testing "str.cat" from Series/Indext to ndarray/list - s = box(["a", "a", "b", "b", "c", np.nan]) - - # single array - result = s.str.cat() - expected = "aabbc" - assert result == expected - - result = s.str.cat(na_rep="-") - expected = "aabbc-" - assert result == expected - - result = s.str.cat(sep="_", na_rep="NA") - expected = "a_a_b_b_c_NA" - assert result == expected - - t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) - expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) - - # Series/Index with array - result = s.str.cat(t, na_rep="-") - assert_series_or_index_equal(result, expected) - - # Series/Index with list - result = s.str.cat(list(t), na_rep="-") - assert_series_or_index_equal(result, expected) - - # errors for incorrect lengths - rgx = r"If `others` contains arrays or lists \(or other list-likes.*" - z = Series(["1", "2", "3"]) - - with pytest.raises(ValueError, match=rgx): - s.str.cat(z.values) - - with pytest.raises(ValueError, match=rgx): - s.str.cat(list(z)) - - def test_str_cat_raises_intuitive_error(self, index_or_series): - # GH 11334 - box = index_or_series - s = box(["a", "b", "c", "d"]) - message = "Did you mean to supply a `sep` keyword?" - with pytest.raises(ValueError, match=message): - s.str.cat("|") - with pytest.raises(ValueError, match=message): - s.str.cat(" ") - - @pytest.mark.parametrize("sep", ["", None]) - @pytest.mark.parametrize("dtype_target", ["object", "category"]) - @pytest.mark.parametrize("dtype_caller", ["object", "category"]) - def test_str_cat_categorical( - self, index_or_series, dtype_caller, dtype_target, sep - ): - box = index_or_series - - s = Index(["a", "a", "b", "a"], dtype=dtype_caller) - s = s if box == Index else Series(s, index=s) - t = Index(["b", "a", "b", "c"], dtype=dtype_target) - - expected = Index(["ab", "aa", "bb", "ac"]) - expected = expected if box == Index else Series(expected, index=s) - - # Series/Index with unaligned Index -> t.values - result = s.str.cat(t.values, sep=sep) - assert_series_or_index_equal(result, expected) - - # Series/Index with Series having matching Index - t = Series(t.values, index=s) - result = s.str.cat(t, sep=sep) - assert_series_or_index_equal(result, expected) - - # Series/Index with Series.values - result = s.str.cat(t.values, sep=sep) - assert_series_or_index_equal(result, expected) - - # Series/Index with Series having different Index - t = Series(t.values, index=t.values) - expected = Index(["aa", "aa", "aa", "bb", "bb"]) - expected = ( - expected if box == Index else Series(expected, index=expected.str[:1]) - ) - - result = s.str.cat(t, sep=sep) - assert_series_or_index_equal(result, expected) - - # test integer/float dtypes (inferred by constructor) and mixed - @pytest.mark.parametrize( - "data", - [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]], - ids=["integers", "floats", "mixed"], - ) - # without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b'] - @pytest.mark.parametrize( - "box", - [Series, Index, list, lambda x: np.array(x, dtype=object)], - ids=["Series", "Index", "list", "np.array"], - ) - def test_str_cat_wrong_dtype_raises(self, box, data): - # GH 22722 - s = Series(["a", "b", "c"]) - t = box(data) - - msg = "Concatenation requires list-likes containing only strings.*" - with pytest.raises(TypeError, match=msg): - # need to use outer and na_rep, as otherwise Index would not raise - s.str.cat(t, join="outer", na_rep="-") - - def test_str_cat_mixed_inputs(self, index_or_series): - box = index_or_series - s = Index(["a", "b", "c", "d"]) - s = s if box == Index else Series(s, index=s) - - t = Series(["A", "B", "C", "D"], index=s.values) - d = concat([t, Series(s, index=s)], axis=1) - - expected = Index(["aAa", "bBb", "cCc", "dDd"]) - expected = expected if box == Index else Series(expected.values, index=s.values) - - # Series/Index with DataFrame - result = s.str.cat(d) - assert_series_or_index_equal(result, expected) - - # Series/Index with two-dimensional ndarray - result = s.str.cat(d.values) - assert_series_or_index_equal(result, expected) - - # Series/Index with list of Series - result = s.str.cat([t, s]) - assert_series_or_index_equal(result, expected) - - # Series/Index with mixed list of Series/array - result = s.str.cat([t, s.values]) - assert_series_or_index_equal(result, expected) - - # Series/Index with list of Series; different indexes - t.index = ["b", "c", "d", "a"] - expected = box(["aDa", "bAb", "cBc", "dCd"]) - expected = expected if box == Index else Series(expected.values, index=s.values) - result = s.str.cat([t, s]) - assert_series_or_index_equal(result, expected) - - # Series/Index with mixed list; different index - result = s.str.cat([t, s.values]) - assert_series_or_index_equal(result, expected) - - # Series/Index with DataFrame; different indexes - d.index = ["b", "c", "d", "a"] - expected = box(["aDd", "bAa", "cBb", "dCc"]) - expected = expected if box == Index else Series(expected.values, index=s.values) - result = s.str.cat(d) - assert_series_or_index_equal(result, expected) - - # errors for incorrect lengths - rgx = r"If `others` contains arrays or lists \(or other list-likes.*" - z = Series(["1", "2", "3"]) - e = concat([z, z], axis=1) - - # two-dimensional ndarray - with pytest.raises(ValueError, match=rgx): - s.str.cat(e.values) - - # list of list-likes - with pytest.raises(ValueError, match=rgx): - s.str.cat([z.values, s.values]) - - # mixed list of Series/list-like - with pytest.raises(ValueError, match=rgx): - s.str.cat([z.values, s]) - - # errors for incorrect arguments in list-like - rgx = "others must be Series, Index, DataFrame,.*" - # make sure None/NaN do not crash checks in _get_series_list - u = Series(["a", np.nan, "c", None]) - - # mix of string and Series - with pytest.raises(TypeError, match=rgx): - s.str.cat([u, "u"]) - - # DataFrame in list - with pytest.raises(TypeError, match=rgx): - s.str.cat([u, d]) - - # 2-dim ndarray in list - with pytest.raises(TypeError, match=rgx): - s.str.cat([u, d.values]) - - # nested lists - with pytest.raises(TypeError, match=rgx): - s.str.cat([u, [u, d]]) - - # forbidden input type: set - # GH 23009 - with pytest.raises(TypeError, match=rgx): - s.str.cat(set(u)) - - # forbidden input type: set in list - # GH 23009 - with pytest.raises(TypeError, match=rgx): - s.str.cat([u, set(u)]) - - # other forbidden input type, e.g. int - with pytest.raises(TypeError, match=rgx): - s.str.cat(1) - - # nested list-likes - with pytest.raises(TypeError, match=rgx): - s.str.cat(iter([t.values, list(s)])) - - @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) - def test_str_cat_align_indexed(self, index_or_series, join): - # https://github.com/pandas-dev/pandas/issues/18657 - box = index_or_series - - s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"]) - t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"]) - sa, ta = s.align(t, join=join) - # result after manual alignment of inputs - expected = sa.str.cat(ta, na_rep="-") - - if box == Index: - s = Index(s) - sa = Index(sa) - expected = Index(expected) - - result = s.str.cat(t, join=join, na_rep="-") - assert_series_or_index_equal(result, expected) - - @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) - def test_str_cat_align_mixed_inputs(self, join): - s = Series(["a", "b", "c", "d"]) - t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) - d = concat([t, t], axis=1) - - expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"]) - expected = expected_outer.loc[s.index.join(t.index, how=join)] - - # list of Series - result = s.str.cat([t, t], join=join, na_rep="-") - tm.assert_series_equal(result, expected) - - # DataFrame - result = s.str.cat(d, join=join, na_rep="-") - tm.assert_series_equal(result, expected) - - # mixed list of indexed/unindexed - u = np.array(["A", "B", "C", "D"]) - expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"]) - # joint index of rhs [t, u]; u will be forced have index of s - rhs_idx = ( - t.index.intersection(s.index) if join == "inner" else t.index.union(s.index) - ) - - expected = expected_outer.loc[s.index.join(rhs_idx, how=join)] - result = s.str.cat([t, u], join=join, na_rep="-") - tm.assert_series_equal(result, expected) - - with pytest.raises(TypeError, match="others must be Series,.*"): - # nested lists are forbidden - s.str.cat([t, list(u)], join=join) - - # errors for incorrect lengths - rgx = r"If `others` contains arrays or lists \(or other list-likes.*" - z = Series(["1", "2", "3"]).values - - # unindexed object of wrong length - with pytest.raises(ValueError, match=rgx): - s.str.cat(z, join=join) - - # unindexed object of wrong length in list - with pytest.raises(ValueError, match=rgx): - s.str.cat([t, z], join=join) - - def test_str_cat_all_na(self, index_or_series, index_or_series2): - # GH 24044 - box = index_or_series - other = index_or_series2 - - # check that all NaNs in caller / target work - s = Index(["a", "b", "c", "d"]) - s = s if box == Index else Series(s, index=s) - t = other([np.nan] * 4, dtype=object) - # add index of s for alignment - t = t if other == Index else Series(t, index=s) - - # all-NA target - if box == Series: - expected = Series([np.nan] * 4, index=s.index, dtype=object) - else: # box == Index - expected = Index([np.nan] * 4, dtype=object) - result = s.str.cat(t, join="left") - assert_series_or_index_equal(result, expected) - - # all-NA caller (only for Series) - if other == Series: - expected = Series([np.nan] * 4, dtype=object, index=t.index) - result = t.str.cat(s, join="left") - tm.assert_series_equal(result, expected) - - def test_str_cat_special_cases(self): - s = Series(["a", "b", "c", "d"]) - t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) - - # iterator of elements with different types - expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"]) - result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-") - tm.assert_series_equal(result, expected) - - # right-align with different indexes in others - expected = Series(["aa-", "d-d"], index=[0, 3]) - result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-") - tm.assert_series_equal(result, expected) - - def test_cat_on_filtered_index(self): - df = DataFrame( - index=MultiIndex.from_product( - [[2011, 2012], [1, 2, 3]], names=["year", "month"] - ) - ) - - df = df.reset_index() - df = df[df.month > 1] - - str_year = df.year.astype("str") - str_month = df.month.astype("str") - str_both = str_year.str.cat(str_month, sep=" ") - - assert str_both.loc[1] == "2011 2" - - str_multiple = str_year.str.cat([str_month, str_month], sep=" ") - - assert str_multiple.loc[1] == "2011 2 2" - - def test_count(self): - values = np.array( - ["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=np.object_ - ) - - result = Series(values).str.count("f[o]+") - exp = Series([1, 2, np.nan, 4]) - assert isinstance(result, Series) - tm.assert_series_equal(result, exp) - - # mixed - mixed = np.array( - ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], - dtype=object, - ) - rs = Series(mixed).str.count("a") - xp = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_series_equal(rs, xp) - - def test_contains(self): - values = np.array( - ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ - ) - values = Series(values) - pat = "mmm[_]+" - - result = values.str.contains(pat) - expected = Series( - np.array([False, np.nan, True, True, False], dtype=np.object_) - ) - tm.assert_series_equal(result, expected) - - result = values.str.contains(pat, regex=False) - expected = Series( - np.array([False, np.nan, False, False, True], dtype=np.object_) - ) - tm.assert_series_equal(result, expected) - - values = Series(np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object)) - result = values.str.contains(pat) - expected = Series(np.array([False, False, True, True])) - assert result.dtype == np.bool_ - tm.assert_series_equal(result, expected) - - # case insensitive using regex - values = Series(np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object)) - result = values.str.contains("FOO|mmm", case=False) - expected = Series(np.array([True, False, True, True])) - tm.assert_series_equal(result, expected) - - # case insensitive without regex - result = Series(values).str.contains("foo", regex=False, case=False) - expected = Series(np.array([True, False, True, False])) - tm.assert_series_equal(result, expected) - - # mixed - mixed = Series( - np.array( - ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], - dtype=object, - ) - ) - rs = mixed.str.contains("o") - xp = Series( - np.array( - [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], - dtype=np.object_, - ) - ) - tm.assert_series_equal(rs, xp) - - rs = mixed.str.contains("o") - xp = Series( - [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] - ) - assert isinstance(rs, Series) - tm.assert_series_equal(rs, xp) - - # unicode - values = Series( - np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_) - ) - pat = "mmm[_]+" - - result = values.str.contains(pat) - expected = Series(np.array([False, np.nan, True, True], dtype=np.object_)) - tm.assert_series_equal(result, expected) - - result = values.str.contains(pat, na=False) - expected = Series(np.array([False, False, True, True])) - tm.assert_series_equal(result, expected) - - values = Series( - np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_) - ) - result = values.str.contains(pat) - expected = Series(np.array([False, False, True, True])) - assert result.dtype == np.bool_ - tm.assert_series_equal(result, expected) - - def test_contains_for_object_category(self): - # gh 22158 - - # na for category - values = Series(["a", "b", "c", "a", np.nan], dtype="category") - result = values.str.contains("a", na=True) - expected = Series([True, False, False, True, True]) - tm.assert_series_equal(result, expected) - - result = values.str.contains("a", na=False) - expected = Series([True, False, False, True, False]) - tm.assert_series_equal(result, expected) - - # na for objects - values = Series(["a", "b", "c", "a", np.nan]) - result = values.str.contains("a", na=True) - expected = Series([True, False, False, True, True]) - tm.assert_series_equal(result, expected) - - result = values.str.contains("a", na=False) - expected = Series([True, False, False, True, False]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("dtype", [None, "category"]) - @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) - @pytest.mark.parametrize("na", [True, False]) - def test_startswith(self, dtype, null_value, na): - # add category dtype parametrizations for GH-36241 - values = Series( - ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], - dtype=dtype, - ) - - result = values.str.startswith("foo") - exp = Series([False, np.nan, True, False, False, np.nan, True]) - tm.assert_series_equal(result, exp) - - result = values.str.startswith("foo", na=na) - exp = Series([False, na, True, False, False, na, True]) - tm.assert_series_equal(result, exp) - - # mixed - mixed = np.array( - ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], - dtype=np.object_, - ) - rs = Series(mixed).str.startswith("f") - xp = Series( - [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] - ) - tm.assert_series_equal(rs, xp) - - @pytest.mark.parametrize("dtype", [None, "category"]) - @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) - @pytest.mark.parametrize("na", [True, False]) - def test_endswith(self, dtype, null_value, na): - # add category dtype parametrizations for GH-36241 - values = Series( - ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], - dtype=dtype, - ) - - result = values.str.endswith("foo") - exp = Series([False, np.nan, False, False, True, np.nan, True]) - tm.assert_series_equal(result, exp) - - result = values.str.endswith("foo", na=na) - exp = Series([False, na, False, False, True, na, True]) - tm.assert_series_equal(result, exp) - - # mixed - mixed = np.array( - ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], - dtype=object, - ) - rs = Series(mixed).str.endswith("f") - xp = Series( - [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan] - ) - tm.assert_series_equal(rs, xp) - - def test_title(self): - values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) - - result = values.str.title() - exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"]) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series( - ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0] - ) - mixed = mixed.str.title() - exp = Series( - ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan] - ) - tm.assert_almost_equal(mixed, exp) - - def test_lower_upper(self): - values = Series(["om", np.nan, "nom", "nom"]) - - result = values.str.upper() - exp = Series(["OM", np.nan, "NOM", "NOM"]) - tm.assert_series_equal(result, exp) - - result = result.str.lower() - tm.assert_series_equal(result, values) - - # mixed - mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) - mixed = mixed.str.upper() - rs = Series(mixed).str.lower() - xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_series_equal(rs, xp) - - def test_capitalize(self): - values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) - result = values.str.capitalize() - exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"]) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series( - ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0] - ) - mixed = mixed.str.capitalize() - exp = Series( - ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan] - ) - tm.assert_almost_equal(mixed, exp) - - def test_swapcase(self): - values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) - result = values.str.swapcase() - exp = Series(["foo", "bar", np.nan, "bLAH", "BLURG"]) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series( - ["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0] - ) - mixed = mixed.str.swapcase() - exp = Series( - ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", np.nan, np.nan, np.nan] - ) - tm.assert_almost_equal(mixed, exp) - - def test_casemethods(self): - values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"] - s = Series(values) - assert s.str.lower().tolist() == [v.lower() for v in values] - assert s.str.upper().tolist() == [v.upper() for v in values] - assert s.str.title().tolist() == [v.title() for v in values] - assert s.str.capitalize().tolist() == [v.capitalize() for v in values] - assert s.str.swapcase().tolist() == [v.swapcase() for v in values] - - def test_replace(self): - values = Series(["fooBAD__barBAD", np.nan]) - - result = values.str.replace("BAD[_]*", "", regex=True) - exp = Series(["foobar", np.nan]) - tm.assert_series_equal(result, exp) - - result = values.str.replace("BAD[_]*", "", n=1, regex=True) - exp = Series(["foobarBAD", np.nan]) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series( - ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] - ) - - rs = Series(mixed).str.replace("BAD[_]*", "", regex=True) - xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - # flags + unicode - values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) - exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) - result = values.str.replace( - r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True - ) - tm.assert_series_equal(result, exp) - - # GH 13438 - msg = "repl must be a string or callable" - for klass in (Series, Index): - for repl in (None, 3, {"a": "b"}): - for data in (["a", "b", None], ["a", "b", "c", "ad"]): - values = klass(data) - with pytest.raises(TypeError, match=msg): - values.str.replace("a", repl) - - def test_replace_callable(self): - # GH 15055 - values = Series(["fooBAD__barBAD", np.nan]) - - # test with callable - repl = lambda m: m.group(0).swapcase() - result = values.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) - exp = Series(["foObaD__baRbaD", np.nan]) - tm.assert_series_equal(result, exp) - - # test with wrong number of arguments, raising an error - p_err = ( - r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " - r"(?(3)required )positional arguments?" - ) - - repl = lambda: None - with pytest.raises(TypeError, match=p_err): - values.str.replace("a", repl) - - repl = lambda m, x: None - with pytest.raises(TypeError, match=p_err): - values.str.replace("a", repl) - - repl = lambda m, x, y=None: None - with pytest.raises(TypeError, match=p_err): - values.str.replace("a", repl) - - # test regex named groups - values = Series(["Foo Bar Baz", np.nan]) - pat = r"(?P\w+) (?P\w+) (?P\w+)" - repl = lambda m: m.group("middle").swapcase() - result = values.str.replace(pat, repl, regex=True) - exp = Series(["bAR", np.nan]) - tm.assert_series_equal(result, exp) - - def test_replace_compiled_regex(self): - # GH 15446 - values = Series(["fooBAD__barBAD", np.nan]) - - # test with compiled regex - pat = re.compile(r"BAD_*") - result = values.str.replace(pat, "", regex=True) - exp = Series(["foobar", np.nan]) - tm.assert_series_equal(result, exp) - - result = values.str.replace(pat, "", n=1, regex=True) - exp = Series(["foobarBAD", np.nan]) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series( - ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] - ) - - rs = Series(mixed).str.replace(pat, "", regex=True) - xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - # flags + unicode - values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) - exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) - pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - result = values.str.replace(pat, ", ") - tm.assert_series_equal(result, exp) - - # case and flags provided to str.replace will have no effect - # and will produce warnings - values = Series(["fooBAD__barBAD__bad", np.nan]) - pat = re.compile(r"BAD_*") - - with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, "", flags=re.IGNORECASE) - - with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, "", case=False) - - with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, "", case=True) - - # test with callable - values = Series(["fooBAD__barBAD", np.nan]) - repl = lambda m: m.group(0).swapcase() - pat = re.compile("[a-z][A-Z]{2}") - result = values.str.replace(pat, repl, n=2) - exp = Series(["foObaD__baRbaD", np.nan]) - tm.assert_series_equal(result, exp) - - def test_replace_literal(self): - # GH16808 literal replace (regex=False vs regex=True) - values = Series(["f.o", "foo", np.nan]) - exp = Series(["bao", "bao", np.nan]) - result = values.str.replace("f.", "ba", regex=True) - tm.assert_series_equal(result, exp) - - exp = Series(["bao", "foo", np.nan]) - result = values.str.replace("f.", "ba", regex=False) - tm.assert_series_equal(result, exp) - - # Cannot do a literal replace if given a callable repl or compiled - # pattern - callable_repl = lambda m: m.group(0).swapcase() - compiled_pat = re.compile("[a-z][A-Z]{2}") - - msg = "Cannot use a callable replacement when regex=False" - with pytest.raises(ValueError, match=msg): - values.str.replace("abc", callable_repl, regex=False) - - msg = "Cannot use a compiled regex as replacement pattern with regex=False" - with pytest.raises(ValueError, match=msg): - values.str.replace(compiled_pat, "", regex=False) - - def test_repeat(self): - values = Series(["a", "b", np.nan, "c", np.nan, "d"]) - - result = values.str.repeat(3) - exp = Series(["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"]) - tm.assert_series_equal(result, exp) - - result = values.str.repeat([1, 2, 3, 4, 5, 6]) - exp = Series(["a", "bb", np.nan, "cccc", np.nan, "dddddd"]) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) - - rs = Series(mixed).str.repeat(3) - xp = Series( - ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", np.nan, np.nan, np.nan] - ) - assert isinstance(rs, Series) - tm.assert_series_equal(rs, xp) - - def test_repeat_with_null(self): - # GH: 31632 - values = Series(["a", None], dtype="string") - result = values.str.repeat([3, 4]) - exp = Series(["aaa", None], dtype="string") - tm.assert_series_equal(result, exp) - - values = Series(["a", "b"], dtype="string") - result = values.str.repeat([3, None]) - exp = Series(["aaa", None], dtype="string") - tm.assert_series_equal(result, exp) - - def test_match(self): - # New match behavior introduced in 0.13 - values = Series(["fooBAD__barBAD", np.nan, "foo"]) - result = values.str.match(".*(BAD[_]+).*(BAD)") - exp = Series([True, np.nan, False]) - tm.assert_series_equal(result, exp) - - values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) - result = values.str.match(".*BAD[_]+.*BAD") - exp = Series([True, True, np.nan, False]) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series( - [ - "aBAD_BAD", - np.nan, - "BAD_b_BAD", - True, - datetime.today(), - "foo", - None, - 1, - 2.0, - ] - ) - rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") - xp = Series([True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_series_equal(rs, xp) - - # na GH #6609 - res = Series(["a", 0, np.nan]).str.match("a", na=False) - exp = Series([True, False, False]) - tm.assert_series_equal(exp, res) - res = Series(["a", 0, np.nan]).str.match("a") - exp = Series([True, np.nan, np.nan]) - tm.assert_series_equal(exp, res) - - values = Series(["ab", "AB", "abc", "ABC"]) - result = values.str.match("ab", case=False) - expected = Series([True, True, True, True]) - tm.assert_series_equal(result, expected) - - def test_fullmatch(self): - # GH 32806 - values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) - result = values.str.fullmatch(".*BAD[_]+.*BAD") - exp = Series([True, False, np.nan, False]) - tm.assert_series_equal(result, exp) - - # Make sure that the new string arrays work - string_values = Series( - ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype="string" - ) - result = string_values.str.fullmatch(".*BAD[_]+.*BAD") - # Result is nullable boolean with StringDtype - string_exp = Series([True, False, np.nan, False], dtype="boolean") - tm.assert_series_equal(result, string_exp) - - values = Series(["ab", "AB", "abc", "ABC"]) - result = values.str.fullmatch("ab", case=False) - expected = Series([True, True, False, False]) - tm.assert_series_equal(result, expected) - - def test_extract_expand_None(self): - values = Series(["fooBAD__barBAD", np.nan, "foo"]) - with pytest.raises(ValueError, match="expand must be True or False"): - values.str.extract(".*(BAD[_]+).*(BAD)", expand=None) - - def test_extract_expand_unspecified(self): - values = Series(["fooBAD__barBAD", np.nan, "foo"]) - result_unspecified = values.str.extract(".*(BAD[_]+).*") - assert isinstance(result_unspecified, DataFrame) - result_true = values.str.extract(".*(BAD[_]+).*", expand=True) - tm.assert_frame_equal(result_unspecified, result_true) - - def test_extract_expand_False(self): - # Contains tests like those in test_match and some others. - values = Series(["fooBAD__barBAD", np.nan, "foo"]) - er = [np.nan, np.nan] # empty row - - result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False) - exp = DataFrame([["BAD__", "BAD"], er, er]) - tm.assert_frame_equal(result, exp) - - # mixed - mixed = Series( - [ - "aBAD_BAD", - np.nan, - "BAD_b_BAD", - True, - datetime.today(), - "foo", - None, - 1, - 2.0, - ] - ) - - rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=False) - exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) - tm.assert_frame_equal(rs, exp) - - # unicode - values = Series(["fooBAD__barBAD", np.nan, "foo"]) - - result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False) - exp = DataFrame([["BAD__", "BAD"], er, er]) - tm.assert_frame_equal(result, exp) - - # GH9980 - # Index only works with one regex group since - # multi-group would expand to a frame - idx = Index(["A1", "A2", "A3", "A4", "B5"]) - with pytest.raises(ValueError, match="supported"): - idx.str.extract("([AB])([123])", expand=False) - - # these should work for both Series and Index - for klass in [Series, Index]: - # no groups - s_or_idx = klass(["A1", "B2", "C3"]) - msg = "pattern contains no capture groups" - with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract("[ABC][123]", expand=False) - - # only non-capturing groups - with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract("(?:[AB]).*", expand=False) - - # single group renames series/index properly - s_or_idx = klass(["A1", "A2"]) - result = s_or_idx.str.extract(r"(?PA)\d", expand=False) - assert result.name == "uno" - - exp = klass(["A", "A"], name="uno") - if klass == Series: - tm.assert_series_equal(result, exp) - else: - tm.assert_index_equal(result, exp) - - s = Series(["A1", "B2", "C3"]) - # one group, no matches - result = s.str.extract("(_)", expand=False) - exp = Series([np.nan, np.nan, np.nan], dtype=object) - tm.assert_series_equal(result, exp) - - # two groups, no matches - result = s.str.extract("(_)(_)", expand=False) - exp = DataFrame( - [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object - ) - tm.assert_frame_equal(result, exp) - - # one group, some matches - result = s.str.extract("([AB])[123]", expand=False) - exp = Series(["A", "B", np.nan]) - tm.assert_series_equal(result, exp) - - # two groups, some matches - result = s.str.extract("([AB])([123])", expand=False) - exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) - tm.assert_frame_equal(result, exp) - - # one named group - result = s.str.extract("(?P[AB])", expand=False) - exp = Series(["A", "B", np.nan], name="letter") - tm.assert_series_equal(result, exp) - - # two named groups - result = s.str.extract("(?P[AB])(?P[123])", expand=False) - exp = DataFrame( - [["A", "1"], ["B", "2"], [np.nan, np.nan]], columns=["letter", "number"] - ) - tm.assert_frame_equal(result, exp) - - # mix named and unnamed groups - result = s.str.extract("([AB])(?P[123])", expand=False) - exp = DataFrame( - [["A", "1"], ["B", "2"], [np.nan, np.nan]], columns=[0, "number"] - ) - tm.assert_frame_equal(result, exp) - - # one normal group, one non-capturing group - result = s.str.extract("([AB])(?:[123])", expand=False) - exp = Series(["A", "B", np.nan]) - tm.assert_series_equal(result, exp) - - # two normal groups, one non-capturing group - result = Series(["A11", "B22", "C33"]).str.extract( - "([AB])([123])(?:[123])", expand=False - ) - exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) - tm.assert_frame_equal(result, exp) - - # one optional group followed by one normal group - result = Series(["A1", "B2", "3"]).str.extract( - "(?P[AB])?(?P[123])", expand=False - ) - exp = DataFrame( - [["A", "1"], ["B", "2"], [np.nan, "3"]], columns=["letter", "number"] - ) - tm.assert_frame_equal(result, exp) - - # one normal group followed by one optional group - result = Series(["A1", "B2", "C"]).str.extract( - "(?P[ABC])(?P[123])?", expand=False - ) - exp = DataFrame( - [["A", "1"], ["B", "2"], ["C", np.nan]], columns=["letter", "number"] - ) - tm.assert_frame_equal(result, exp) - - # GH6348 - # not passing index to the extractor - def check_index(index): - data = ["A1", "B2", "C"] - index = index[: len(data)] - s = Series(data, index=index) - result = s.str.extract(r"(\d)", expand=False) - exp = Series(["1", "2", np.nan], index=index) - tm.assert_series_equal(result, exp) - - result = Series(data, index=index).str.extract( - r"(?P\D)(?P\d)?", expand=False - ) - e_list = [["A", "1"], ["B", "2"], ["C", np.nan]] - exp = DataFrame(e_list, columns=["letter", "number"], index=index) - tm.assert_frame_equal(result, exp) - - i_funs = [ - tm.makeStringIndex, - tm.makeUnicodeIndex, - tm.makeIntIndex, - tm.makeDateIndex, - tm.makePeriodIndex, - tm.makeRangeIndex, - ] - for index in i_funs: - check_index(index()) - - # single_series_name_is_preserved. - s = Series(["a3", "b3", "c2"], name="bob") - r = s.str.extract(r"(?P[a-z])", expand=False) - e = Series(["a", "b", "c"], name="sue") - tm.assert_series_equal(r, e) - assert r.name == e.name - - def test_extract_expand_True(self): - # Contains tests like those in test_match and some others. - values = Series(["fooBAD__barBAD", np.nan, "foo"]) - er = [np.nan, np.nan] # empty row - - result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=True) - exp = DataFrame([["BAD__", "BAD"], er, er]) - tm.assert_frame_equal(result, exp) - - # mixed - mixed = Series( - [ - "aBAD_BAD", - np.nan, - "BAD_b_BAD", - True, - datetime.today(), - "foo", - None, - 1, - 2.0, - ] - ) - - rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=True) - exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) - tm.assert_frame_equal(rs, exp) - - # these should work for both Series and Index - for klass in [Series, Index]: - # no groups - s_or_idx = klass(["A1", "B2", "C3"]) - msg = "pattern contains no capture groups" - with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract("[ABC][123]", expand=True) - - # only non-capturing groups - with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract("(?:[AB]).*", expand=True) - - # single group renames series/index properly - s_or_idx = klass(["A1", "A2"]) - result_df = s_or_idx.str.extract(r"(?PA)\d", expand=True) - assert isinstance(result_df, DataFrame) - result_series = result_df["uno"] - tm.assert_series_equal(result_series, Series(["A", "A"], name="uno")) - - def test_extract_series(self): - # extract should give the same result whether or not the - # series has a name. - for series_name in None, "series_name": - s = Series(["A1", "B2", "C3"], name=series_name) - # one group, no matches - result = s.str.extract("(_)", expand=True) - exp = DataFrame([np.nan, np.nan, np.nan], dtype=object) - tm.assert_frame_equal(result, exp) - - # two groups, no matches - result = s.str.extract("(_)(_)", expand=True) - exp = DataFrame( - [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object - ) - tm.assert_frame_equal(result, exp) - - # one group, some matches - result = s.str.extract("([AB])[123]", expand=True) - exp = DataFrame(["A", "B", np.nan]) - tm.assert_frame_equal(result, exp) - - # two groups, some matches - result = s.str.extract("([AB])([123])", expand=True) - exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) - tm.assert_frame_equal(result, exp) - - # one named group - result = s.str.extract("(?P[AB])", expand=True) - exp = DataFrame({"letter": ["A", "B", np.nan]}) - tm.assert_frame_equal(result, exp) - - # two named groups - result = s.str.extract("(?P[AB])(?P[123])", expand=True) - e_list = [["A", "1"], ["B", "2"], [np.nan, np.nan]] - exp = DataFrame(e_list, columns=["letter", "number"]) - tm.assert_frame_equal(result, exp) - - # mix named and unnamed groups - result = s.str.extract("([AB])(?P[123])", expand=True) - exp = DataFrame(e_list, columns=[0, "number"]) - tm.assert_frame_equal(result, exp) - - # one normal group, one non-capturing group - result = s.str.extract("([AB])(?:[123])", expand=True) - exp = DataFrame(["A", "B", np.nan]) - tm.assert_frame_equal(result, exp) - - def test_extract_optional_groups(self): - - # two normal groups, one non-capturing group - result = Series(["A11", "B22", "C33"]).str.extract( - "([AB])([123])(?:[123])", expand=True - ) - exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) - tm.assert_frame_equal(result, exp) - - # one optional group followed by one normal group - result = Series(["A1", "B2", "3"]).str.extract( - "(?P[AB])?(?P[123])", expand=True - ) - e_list = [["A", "1"], ["B", "2"], [np.nan, "3"]] - exp = DataFrame(e_list, columns=["letter", "number"]) - tm.assert_frame_equal(result, exp) - - # one normal group followed by one optional group - result = Series(["A1", "B2", "C"]).str.extract( - "(?P[ABC])(?P[123])?", expand=True - ) - e_list = [["A", "1"], ["B", "2"], ["C", np.nan]] - exp = DataFrame(e_list, columns=["letter", "number"]) - tm.assert_frame_equal(result, exp) - - # GH6348 - # not passing index to the extractor - def check_index(index): - data = ["A1", "B2", "C"] - index = index[: len(data)] - result = Series(data, index=index).str.extract(r"(\d)", expand=True) - exp = DataFrame(["1", "2", np.nan], index=index) - tm.assert_frame_equal(result, exp) - - result = Series(data, index=index).str.extract( - r"(?P\D)(?P\d)?", expand=True - ) - e_list = [["A", "1"], ["B", "2"], ["C", np.nan]] - exp = DataFrame(e_list, columns=["letter", "number"], index=index) - tm.assert_frame_equal(result, exp) - - i_funs = [ - tm.makeStringIndex, - tm.makeUnicodeIndex, - tm.makeIntIndex, - tm.makeDateIndex, - tm.makePeriodIndex, - tm.makeRangeIndex, - ] - for index in i_funs: - check_index(index()) - - def test_extract_single_group_returns_frame(self): - # GH11386 extract should always return DataFrame, even when - # there is only one group. Prior to v0.18.0, extract returned - # Series when there was only one group in the regex. - s = Series(["a3", "b3", "c2"], name="series_name") - r = s.str.extract(r"(?P[a-z])", expand=True) - e = DataFrame({"letter": ["a", "b", "c"]}) - tm.assert_frame_equal(r, e) - - def test_extractall(self): - subject_list = [ - "dave@google.com", - "tdhock5@gmail.com", - "maudelaperriere@gmail.com", - "rob@gmail.com some text steve@gmail.com", - "a@b.com some text c@d.com and e@f.com", - np.nan, - "", - ] - expected_tuples = [ - ("dave", "google", "com"), - ("tdhock5", "gmail", "com"), - ("maudelaperriere", "gmail", "com"), - ("rob", "gmail", "com"), - ("steve", "gmail", "com"), - ("a", "b", "com"), - ("c", "d", "com"), - ("e", "f", "com"), - ] - named_pattern = r""" - (?P[a-z0-9]+) - @ - (?P[a-z]+) - \. - (?P[a-z]{2,4}) - """ - expected_columns = ["user", "domain", "tld"] - S = Series(subject_list) - # extractall should return a DataFrame with one row for each - # match, indexed by the subject from which the match came. - expected_index = MultiIndex.from_tuples( - [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)], - names=(None, "match"), - ) - expected_df = DataFrame(expected_tuples, expected_index, expected_columns) - computed_df = S.str.extractall(named_pattern, re.VERBOSE) - tm.assert_frame_equal(computed_df, expected_df) - - # The index of the input Series should be used to construct - # the index of the output DataFrame: - series_index = MultiIndex.from_tuples( - [ - ("single", "Dave"), - ("single", "Toby"), - ("single", "Maude"), - ("multiple", "robAndSteve"), - ("multiple", "abcdef"), - ("none", "missing"), - ("none", "empty"), - ] - ) - Si = Series(subject_list, series_index) - expected_index = MultiIndex.from_tuples( - [ - ("single", "Dave", 0), - ("single", "Toby", 0), - ("single", "Maude", 0), - ("multiple", "robAndSteve", 0), - ("multiple", "robAndSteve", 1), - ("multiple", "abcdef", 0), - ("multiple", "abcdef", 1), - ("multiple", "abcdef", 2), - ], - names=(None, None, "match"), - ) - expected_df = DataFrame(expected_tuples, expected_index, expected_columns) - computed_df = Si.str.extractall(named_pattern, re.VERBOSE) - tm.assert_frame_equal(computed_df, expected_df) - - # MultiIndexed subject with names. - Sn = Series(subject_list, series_index) - Sn.index.names = ("matches", "description") - expected_index.names = ("matches", "description", "match") - expected_df = DataFrame(expected_tuples, expected_index, expected_columns) - computed_df = Sn.str.extractall(named_pattern, re.VERBOSE) - tm.assert_frame_equal(computed_df, expected_df) - - # optional groups. - subject_list = ["", "A1", "32"] - named_pattern = "(?P[AB])?(?P[123])" - computed_df = Series(subject_list).str.extractall(named_pattern) - expected_index = MultiIndex.from_tuples( - [(1, 0), (2, 0), (2, 1)], names=(None, "match") - ) - expected_df = DataFrame( - [("A", "1"), (np.nan, "3"), (np.nan, "2")], - expected_index, - columns=["letter", "number"], - ) - tm.assert_frame_equal(computed_df, expected_df) - - # only one of two groups has a name. - pattern = "([AB])?(?P[123])" - computed_df = Series(subject_list).str.extractall(pattern) - expected_df = DataFrame( - [("A", "1"), (np.nan, "3"), (np.nan, "2")], - expected_index, - columns=[0, "number"], - ) - tm.assert_frame_equal(computed_df, expected_df) - - def test_extractall_single_group(self): - # extractall(one named group) returns DataFrame with one named - # column. - s = Series(["a3", "b3", "d4c2"], name="series_name") - r = s.str.extractall(r"(?P[a-z])") - i = MultiIndex.from_tuples( - [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") - ) - e = DataFrame({"letter": ["a", "b", "d", "c"]}, i) - tm.assert_frame_equal(r, e) - - # extractall(one un-named group) returns DataFrame with one - # un-named column. - r = s.str.extractall(r"([a-z])") - e = DataFrame(["a", "b", "d", "c"], i) - tm.assert_frame_equal(r, e) - - def test_extractall_single_group_with_quantifier(self): - # extractall(one un-named group with quantifier) returns - # DataFrame with one un-named column (GH13382). - s = Series(["ab3", "abc3", "d4cd2"], name="series_name") - r = s.str.extractall(r"([a-z]+)") - i = MultiIndex.from_tuples( - [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") - ) - e = DataFrame(["ab", "abc", "d", "cd"], i) - tm.assert_frame_equal(r, e) - - @pytest.mark.parametrize( - "data, names", - [ - ([], (None,)), - ([], ("i1",)), - ([], (None, "i2")), - ([], ("i1", "i2")), - (["a3", "b3", "d4c2"], (None,)), - (["a3", "b3", "d4c2"], ("i1", "i2")), - (["a3", "b3", "d4c2"], (None, "i2")), - (["a3", "b3", "d4c2"], ("i1", "i2")), - ], - ) - def test_extractall_no_matches(self, data, names): - # GH19075 extractall with no matches should return a valid MultiIndex - n = len(data) - if len(names) == 1: - i = Index(range(n), name=names[0]) - else: - a = (tuple([i] * (n - 1)) for i in range(n)) - i = MultiIndex.from_tuples(a, names=names) - s = Series(data, name="series_name", index=i, dtype="object") - ei = MultiIndex.from_tuples([], names=(names + ("match",))) - - # one un-named group. - r = s.str.extractall("(z)") - e = DataFrame(columns=[0], index=ei) - tm.assert_frame_equal(r, e) - - # two un-named groups. - r = s.str.extractall("(z)(z)") - e = DataFrame(columns=[0, 1], index=ei) - tm.assert_frame_equal(r, e) - - # one named group. - r = s.str.extractall("(?Pz)") - e = DataFrame(columns=["first"], index=ei) - tm.assert_frame_equal(r, e) - - # two named groups. - r = s.str.extractall("(?Pz)(?Pz)") - e = DataFrame(columns=["first", "second"], index=ei) - tm.assert_frame_equal(r, e) - - # one named, one un-named. - r = s.str.extractall("(z)(?Pz)") - e = DataFrame(columns=[0, "second"], index=ei) - tm.assert_frame_equal(r, e) - - def test_extractall_stringindex(self): - s = Series(["a1a2", "b1", "c1"], name="xxx") - res = s.str.extractall(r"[ab](?P\d)") - exp_idx = MultiIndex.from_tuples( - [(0, 0), (0, 1), (1, 0)], names=[None, "match"] - ) - exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx) - tm.assert_frame_equal(res, exp) - - # index should return the same result as the default index without name - # thus index.name doesn't affect to the result - for idx in [ - Index(["a1a2", "b1", "c1"]), - Index(["a1a2", "b1", "c1"], name="xxx"), - ]: - - res = idx.str.extractall(r"[ab](?P\d)") - tm.assert_frame_equal(res, exp) - - s = Series( - ["a1a2", "b1", "c1"], - name="s_name", - index=Index(["XX", "yy", "zz"], name="idx_name"), - ) - res = s.str.extractall(r"[ab](?P\d)") - exp_idx = MultiIndex.from_tuples( - [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"] - ) - exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx) - tm.assert_frame_equal(res, exp) - - def test_extractall_errors(self): - # Does not make sense to use extractall with a regex that has - # no capture groups. (it returns DataFrame with one column for - # each capture group) - s = Series(["a3", "b3", "d4c2"], name="series_name") - with pytest.raises(ValueError, match="no capture groups"): - s.str.extractall(r"[a-z]") - - def test_extract_index_one_two_groups(self): - s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name") - r = s.index.str.extract(r"([A-Z])", expand=True) - e = DataFrame(["A", "B", "D"]) - tm.assert_frame_equal(r, e) - - # Prior to v0.18.0, index.str.extract(regex with one group) - # returned Index. With more than one group, extract raised an - # error (GH9980). Now extract always returns DataFrame. - r = s.index.str.extract(r"(?P[A-Z])(?P[0-9])", expand=True) - e_list = [("A", "3"), ("B", "3"), ("D", "4")] - e = DataFrame(e_list, columns=["letter", "digit"]) - tm.assert_frame_equal(r, e) - - def test_extractall_same_as_extract(self): - s = Series(["a3", "b3", "c2"], name="series_name") - - pattern_two_noname = r"([a-z])([0-9])" - extract_two_noname = s.str.extract(pattern_two_noname, expand=True) - has_multi_index = s.str.extractall(pattern_two_noname) - no_multi_index = has_multi_index.xs(0, level="match") - tm.assert_frame_equal(extract_two_noname, no_multi_index) - - pattern_two_named = r"(?P[a-z])(?P[0-9])" - extract_two_named = s.str.extract(pattern_two_named, expand=True) - has_multi_index = s.str.extractall(pattern_two_named) - no_multi_index = has_multi_index.xs(0, level="match") - tm.assert_frame_equal(extract_two_named, no_multi_index) - - pattern_one_named = r"(?P[a-z])" - extract_one_named = s.str.extract(pattern_one_named, expand=True) - has_multi_index = s.str.extractall(pattern_one_named) - no_multi_index = has_multi_index.xs(0, level="match") - tm.assert_frame_equal(extract_one_named, no_multi_index) - - pattern_one_noname = r"([a-z])" - extract_one_noname = s.str.extract(pattern_one_noname, expand=True) - has_multi_index = s.str.extractall(pattern_one_noname) - no_multi_index = has_multi_index.xs(0, level="match") - tm.assert_frame_equal(extract_one_noname, no_multi_index) - - def test_extractall_same_as_extract_subject_index(self): - # same as above tests, but s has an MultiIndex. - i = MultiIndex.from_tuples( - [("A", "first"), ("B", "second"), ("C", "third")], - names=("capital", "ordinal"), - ) - s = Series(["a3", "b3", "c2"], i, name="series_name") - - pattern_two_noname = r"([a-z])([0-9])" - extract_two_noname = s.str.extract(pattern_two_noname, expand=True) - has_match_index = s.str.extractall(pattern_two_noname) - no_match_index = has_match_index.xs(0, level="match") - tm.assert_frame_equal(extract_two_noname, no_match_index) - - pattern_two_named = r"(?P[a-z])(?P[0-9])" - extract_two_named = s.str.extract(pattern_two_named, expand=True) - has_match_index = s.str.extractall(pattern_two_named) - no_match_index = has_match_index.xs(0, level="match") - tm.assert_frame_equal(extract_two_named, no_match_index) - - pattern_one_named = r"(?P[a-z])" - extract_one_named = s.str.extract(pattern_one_named, expand=True) - has_match_index = s.str.extractall(pattern_one_named) - no_match_index = has_match_index.xs(0, level="match") - tm.assert_frame_equal(extract_one_named, no_match_index) - - pattern_one_noname = r"([a-z])" - extract_one_noname = s.str.extract(pattern_one_noname, expand=True) - has_match_index = s.str.extractall(pattern_one_noname) - no_match_index = has_match_index.xs(0, level="match") - tm.assert_frame_equal(extract_one_noname, no_match_index) - - def test_empty_str_methods(self): - empty_str = empty = Series(dtype=object) - empty_int = Series(dtype="int64") - empty_bool = Series(dtype=bool) - empty_bytes = Series(dtype=object) - - # GH7241 - # (extract) on empty series - - tm.assert_series_equal(empty_str, empty.str.cat(empty)) - assert "" == empty.str.cat() - tm.assert_series_equal(empty_str, empty.str.title()) - tm.assert_series_equal(empty_int, empty.str.count("a")) - tm.assert_series_equal(empty_bool, empty.str.contains("a")) - tm.assert_series_equal(empty_bool, empty.str.startswith("a")) - tm.assert_series_equal(empty_bool, empty.str.endswith("a")) - tm.assert_series_equal(empty_str, empty.str.lower()) - tm.assert_series_equal(empty_str, empty.str.upper()) - tm.assert_series_equal(empty_str, empty.str.replace("a", "b")) - tm.assert_series_equal(empty_str, empty.str.repeat(3)) - tm.assert_series_equal(empty_bool, empty.str.match("^a")) - tm.assert_frame_equal( - DataFrame(columns=[0], dtype=str), empty.str.extract("()", expand=True) - ) - tm.assert_frame_equal( - DataFrame(columns=[0, 1], dtype=str), empty.str.extract("()()", expand=True) - ) - tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False)) - tm.assert_frame_equal( - DataFrame(columns=[0, 1], dtype=str), - empty.str.extract("()()", expand=False), - ) - tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies()) - tm.assert_series_equal(empty_str, empty_str.str.join("")) - tm.assert_series_equal(empty_int, empty.str.len()) - tm.assert_series_equal(empty_str, empty_str.str.findall("a")) - tm.assert_series_equal(empty_int, empty.str.find("a")) - tm.assert_series_equal(empty_int, empty.str.rfind("a")) - tm.assert_series_equal(empty_str, empty.str.pad(42)) - tm.assert_series_equal(empty_str, empty.str.center(42)) - tm.assert_series_equal(empty_str, empty.str.split("a")) - tm.assert_series_equal(empty_str, empty.str.rsplit("a")) - tm.assert_series_equal(empty_str, empty.str.partition("a", expand=False)) - tm.assert_series_equal(empty_str, empty.str.rpartition("a", expand=False)) - tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) - tm.assert_series_equal(empty_str, empty.str.slice(step=1)) - tm.assert_series_equal(empty_str, empty.str.strip()) - tm.assert_series_equal(empty_str, empty.str.lstrip()) - tm.assert_series_equal(empty_str, empty.str.rstrip()) - tm.assert_series_equal(empty_str, empty.str.wrap(42)) - tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii")) - tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) - # ismethods should always return boolean (GH 29624) - tm.assert_series_equal(empty_bool, empty.str.isalnum()) - tm.assert_series_equal(empty_bool, empty.str.isalpha()) - tm.assert_series_equal(empty_bool, empty.str.isdigit()) - tm.assert_series_equal(empty_bool, empty.str.isspace()) - tm.assert_series_equal(empty_bool, empty.str.islower()) - tm.assert_series_equal(empty_bool, empty.str.isupper()) - tm.assert_series_equal(empty_bool, empty.str.istitle()) - tm.assert_series_equal(empty_bool, empty.str.isnumeric()) - tm.assert_series_equal(empty_bool, empty.str.isdecimal()) - tm.assert_series_equal(empty_str, empty.str.capitalize()) - tm.assert_series_equal(empty_str, empty.str.swapcase()) - tm.assert_series_equal(empty_str, empty.str.normalize("NFC")) - - table = str.maketrans("a", "b") - tm.assert_series_equal(empty_str, empty.str.translate(table)) - - def test_empty_str_methods_to_frame(self): - empty = Series(dtype=str) - empty_df = DataFrame() - tm.assert_frame_equal(empty_df, empty.str.partition("a")) - tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) - - def test_ismethods(self): - values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "] - str_s = Series(values) - alnum_e = [True, True, True, True, True, False, True, True, False, False] - alpha_e = [True, True, True, False, False, False, True, False, False, False] - digit_e = [False, False, False, True, False, False, False, True, False, False] - - # TODO: unused - num_e = [ # noqa - False, - False, - False, - True, - False, - False, - False, - True, - False, - False, - ] - - space_e = [False, False, False, False, False, False, False, False, False, True] - lower_e = [False, True, False, False, False, False, False, False, False, False] - upper_e = [True, False, False, False, True, False, True, False, False, False] - title_e = [True, False, True, False, True, False, False, False, False, False] - - tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e)) - tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e)) - tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e)) - tm.assert_series_equal(str_s.str.isspace(), Series(space_e)) - tm.assert_series_equal(str_s.str.islower(), Series(lower_e)) - tm.assert_series_equal(str_s.str.isupper(), Series(upper_e)) - tm.assert_series_equal(str_s.str.istitle(), Series(title_e)) - - assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values] - assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values] - assert str_s.str.isdigit().tolist() == [v.isdigit() for v in values] - assert str_s.str.isspace().tolist() == [v.isspace() for v in values] - assert str_s.str.islower().tolist() == [v.islower() for v in values] - assert str_s.str.isupper().tolist() == [v.isupper() for v in values] - assert str_s.str.istitle().tolist() == [v.istitle() for v in values] - - def test_isnumeric(self): - # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER - # 0x2605: ★ not number - # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY - # 0xFF13: 3 Em 3 - values = ["A", "3", "¼", "★", "፸", "3", "four"] - s = Series(values) - numeric_e = [False, True, True, False, True, True, False] - decimal_e = [False, True, False, False, False, True, False] - tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) - tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) - - unicodes = ["A", "3", "¼", "★", "፸", "3", "four"] - assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes] - assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes] - - values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] - s = Series(values) - numeric_e = [False, np.nan, True, False, np.nan, True, False] - decimal_e = [False, np.nan, False, False, np.nan, True, False] - tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) - tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) - - def test_get_dummies(self): - s = Series(["a|b", "a|c", np.nan]) - result = s.str.get_dummies("|") - expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc")) - tm.assert_frame_equal(result, expected) - - s = Series(["a;b", "a", 7]) - result = s.str.get_dummies(";") - expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab")) - tm.assert_frame_equal(result, expected) - - # GH9980, GH8028 - idx = Index(["a|b", "a|c", "b|c"]) - result = idx.str.get_dummies("|") - - expected = MultiIndex.from_tuples( - [(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c") - ) - tm.assert_index_equal(result, expected) - - def test_get_dummies_with_name_dummy(self): - # GH 12180 - # Dummies named 'name' should work as expected - s = Series(["a", "b,name", "b"]) - result = s.str.get_dummies(",") - expected = DataFrame( - [[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"] - ) - tm.assert_frame_equal(result, expected) - - idx = Index(["a|b", "name|c", "b|name"]) - result = idx.str.get_dummies("|") - - expected = MultiIndex.from_tuples( - [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") - ) - tm.assert_index_equal(result, expected) - - def test_join(self): - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - result = values.str.split("_").str.join("_") - tm.assert_series_equal(values, result) - - # mixed - mixed = Series( - [ - "a_b", - np.nan, - "asdf_cas_asdf", - True, - datetime.today(), - "foo", - None, - 1, - 2.0, - ] - ) - - rs = Series(mixed).str.split("_").str.join("_") - xp = Series( - [ - "a_b", - np.nan, - "asdf_cas_asdf", - np.nan, - np.nan, - "foo", - np.nan, - np.nan, - np.nan, - ] - ) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - def test_len(self): - values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"]) - - result = values.str.len() - exp = values.map(lambda x: len(x) if notna(x) else np.nan) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series( - [ - "a_b", - np.nan, - "asdf_cas_asdf", - True, - datetime.today(), - "foo", - None, - 1, - 2.0, - ] - ) - - rs = Series(mixed).str.len() - xp = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan]) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - def test_findall(self): - values = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"]) - - result = values.str.findall("BAD[_]*") - exp = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]]) - tm.assert_almost_equal(result, exp) - - # mixed - mixed = Series( - [ - "fooBAD__barBAD", - np.nan, - "foo", - True, - datetime.today(), - "BAD", - None, - 1, - 2.0, - ] - ) - - rs = Series(mixed).str.findall("BAD[_]*") - xp = Series( - [ - ["BAD__", "BAD"], - np.nan, - [], - np.nan, - np.nan, - ["BAD"], - np.nan, - np.nan, - np.nan, - ] - ) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - def test_find(self): - values = Series(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"]) - result = values.str.find("EF") - tm.assert_series_equal(result, Series([4, 3, 1, 0, -1])) - expected = np.array([v.find("EF") for v in values.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - result = values.str.rfind("EF") - tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) - expected = np.array([v.rfind("EF") for v in values.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - result = values.str.find("EF", 3) - tm.assert_series_equal(result, Series([4, 3, 7, 4, -1])) - expected = np.array([v.find("EF", 3) for v in values.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - result = values.str.rfind("EF", 3) - tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) - expected = np.array([v.rfind("EF", 3) for v in values.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - result = values.str.find("EF", 3, 6) - tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) - expected = np.array([v.find("EF", 3, 6) for v in values.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - result = values.str.rfind("EF", 3, 6) - tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) - expected = np.array( - [v.rfind("EF", 3, 6) for v in values.values], dtype=np.int64 - ) - tm.assert_numpy_array_equal(result.values, expected) - - with pytest.raises(TypeError, match="expected a string object, not int"): - result = values.str.find(0) - - with pytest.raises(TypeError, match="expected a string object, not int"): - result = values.str.rfind(0) - - def test_find_nan(self): - values = Series(["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"]) - result = values.str.find("EF") - tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1])) - - result = values.str.rfind("EF") - tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) - - result = values.str.find("EF", 3) - tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) - - result = values.str.rfind("EF", 3) - tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) - - result = values.str.find("EF", 3, 6) - tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) - - result = values.str.rfind("EF", 3, 6) - tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) - - def test_index(self): - def _check(result, expected): - if isinstance(result, Series): - tm.assert_series_equal(result, expected) - else: - tm.assert_index_equal(result, expected) - - for klass in [Series, Index]: - s = klass(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]) - - result = s.str.index("EF") - _check(result, klass([4, 3, 1, 0])) - expected = np.array([v.index("EF") for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - result = s.str.rindex("EF") - _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.rindex("EF") for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - result = s.str.index("EF", 3) - _check(result, klass([4, 3, 7, 4])) - expected = np.array([v.index("EF", 3) for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - result = s.str.rindex("EF", 3) - _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.rindex("EF", 3) for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - result = s.str.index("E", 4, 8) - _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.index("E", 4, 8) for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - result = s.str.rindex("E", 0, 5) - _check(result, klass([4, 3, 1, 4])) - expected = np.array([v.rindex("E", 0, 5) for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - - with pytest.raises(ValueError, match="substring not found"): - result = s.str.index("DE") - - msg = "expected a string object, not int" - with pytest.raises(TypeError, match=msg): - result = s.str.index(0) - - with pytest.raises(TypeError, match=msg): - result = s.str.rindex(0) - - # test with nan - s = Series(["abcb", "ab", "bcbe", np.nan]) - result = s.str.index("b") - tm.assert_series_equal(result, Series([1, 1, 0, np.nan])) - result = s.str.rindex("b") - tm.assert_series_equal(result, Series([3, 1, 2, np.nan])) - - def test_pad(self): - values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) - - result = values.str.pad(5, side="left") - exp = Series([" a", " b", np.nan, " c", np.nan, "eeeeee"]) - tm.assert_almost_equal(result, exp) - - result = values.str.pad(5, side="right") - exp = Series(["a ", "b ", np.nan, "c ", np.nan, "eeeeee"]) - tm.assert_almost_equal(result, exp) - - result = values.str.pad(5, side="both") - exp = Series([" a ", " b ", np.nan, " c ", np.nan, "eeeeee"]) - tm.assert_almost_equal(result, exp) - - # mixed - mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) - - rs = Series(mixed).str.pad(5, side="left") - xp = Series( - [" a", np.nan, " b", np.nan, np.nan, " ee", np.nan, np.nan, np.nan] - ) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) - - rs = Series(mixed).str.pad(5, side="right") - xp = Series( - ["a ", np.nan, "b ", np.nan, np.nan, "ee ", np.nan, np.nan, np.nan] - ) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) - - rs = Series(mixed).str.pad(5, side="both") - xp = Series( - [" a ", np.nan, " b ", np.nan, np.nan, " ee ", np.nan, np.nan, np.nan] - ) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - def test_pad_fillchar(self): - - values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) - - result = values.str.pad(5, side="left", fillchar="X") - exp = Series(["XXXXa", "XXXXb", np.nan, "XXXXc", np.nan, "eeeeee"]) - tm.assert_almost_equal(result, exp) - - result = values.str.pad(5, side="right", fillchar="X") - exp = Series(["aXXXX", "bXXXX", np.nan, "cXXXX", np.nan, "eeeeee"]) - tm.assert_almost_equal(result, exp) - - result = values.str.pad(5, side="both", fillchar="X") - exp = Series(["XXaXX", "XXbXX", np.nan, "XXcXX", np.nan, "eeeeee"]) - tm.assert_almost_equal(result, exp) - - msg = "fillchar must be a character, not str" - with pytest.raises(TypeError, match=msg): - result = values.str.pad(5, fillchar="XY") - - msg = "fillchar must be a character, not int" - with pytest.raises(TypeError, match=msg): - result = values.str.pad(5, fillchar=5) - - @pytest.mark.parametrize("f", ["center", "ljust", "rjust", "zfill", "pad"]) - def test_pad_width(self, f): - # see gh-13598 - s = Series(["1", "22", "a", "bb"]) - msg = "width must be of integer type, not*" - - with pytest.raises(TypeError, match=msg): - getattr(s.str, f)("f") - - def test_translate(self): - def _check(result, expected): - if isinstance(result, Series): - tm.assert_series_equal(result, expected) - else: - tm.assert_index_equal(result, expected) - - for klass in [Series, Index]: - s = klass(["abcdefg", "abcc", "cdddfg", "cdefggg"]) - table = str.maketrans("abc", "cde") - result = s.str.translate(table) - expected = klass(["cdedefg", "cdee", "edddfg", "edefggg"]) - _check(result, expected) - - # Series with non-string values - s = Series(["a", "b", "c", 1.2]) - expected = Series(["c", "d", "e", np.nan]) - result = s.str.translate(table) - tm.assert_series_equal(result, expected) - - def test_center_ljust_rjust(self): - values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) - - result = values.str.center(5) - exp = Series([" a ", " b ", np.nan, " c ", np.nan, "eeeeee"]) - tm.assert_almost_equal(result, exp) - - result = values.str.ljust(5) - exp = Series(["a ", "b ", np.nan, "c ", np.nan, "eeeeee"]) - tm.assert_almost_equal(result, exp) - - result = values.str.rjust(5) - exp = Series([" a", " b", np.nan, " c", np.nan, "eeeeee"]) - tm.assert_almost_equal(result, exp) - - # mixed - mixed = Series( - ["a", np.nan, "b", True, datetime.today(), "c", "eee", None, 1, 2.0] - ) - - rs = Series(mixed).str.center(5) - xp = Series( - [ - " a ", - np.nan, - " b ", - np.nan, - np.nan, - " c ", - " eee ", - np.nan, - np.nan, - np.nan, - ] - ) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - rs = Series(mixed).str.ljust(5) - xp = Series( - [ - "a ", - np.nan, - "b ", - np.nan, - np.nan, - "c ", - "eee ", - np.nan, - np.nan, - np.nan, - ] - ) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - rs = Series(mixed).str.rjust(5) - xp = Series( - [ - " a", - np.nan, - " b", - np.nan, - np.nan, - " c", - " eee", - np.nan, - np.nan, - np.nan, - ] - ) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - def test_center_ljust_rjust_fillchar(self): - values = Series(["a", "bb", "cccc", "ddddd", "eeeeee"]) - - result = values.str.center(5, fillchar="X") - expected = Series(["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"]) - tm.assert_series_equal(result, expected) - expected = np.array([v.center(5, "X") for v in values.values], dtype=np.object_) - tm.assert_numpy_array_equal(result.values, expected) - - result = values.str.ljust(5, fillchar="X") - expected = Series(["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"]) - tm.assert_series_equal(result, expected) - expected = np.array([v.ljust(5, "X") for v in values.values], dtype=np.object_) - tm.assert_numpy_array_equal(result.values, expected) - - result = values.str.rjust(5, fillchar="X") - expected = Series(["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"]) - tm.assert_series_equal(result, expected) - expected = np.array([v.rjust(5, "X") for v in values.values], dtype=np.object_) - tm.assert_numpy_array_equal(result.values, expected) - - # If fillchar is not a charatter, normal str raises TypeError - # 'aaa'.ljust(5, 'XY') - # TypeError: must be char, not str - template = "fillchar must be a character, not {dtype}" - - with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.center(5, fillchar="XY") - - with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.ljust(5, fillchar="XY") - - with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.rjust(5, fillchar="XY") - - with pytest.raises(TypeError, match=template.format(dtype="int")): - values.str.center(5, fillchar=1) - - with pytest.raises(TypeError, match=template.format(dtype="int")): - values.str.ljust(5, fillchar=1) - - with pytest.raises(TypeError, match=template.format(dtype="int")): - values.str.rjust(5, fillchar=1) - - def test_zfill(self): - values = Series(["1", "22", "aaa", "333", "45678"]) - - result = values.str.zfill(5) - expected = Series(["00001", "00022", "00aaa", "00333", "45678"]) - tm.assert_series_equal(result, expected) - expected = np.array([v.zfill(5) for v in values.values], dtype=np.object_) - tm.assert_numpy_array_equal(result.values, expected) - - result = values.str.zfill(3) - expected = Series(["001", "022", "aaa", "333", "45678"]) - tm.assert_series_equal(result, expected) - expected = np.array([v.zfill(3) for v in values.values], dtype=np.object_) - tm.assert_numpy_array_equal(result.values, expected) - - values = Series(["1", np.nan, "aaa", np.nan, "45678"]) - result = values.str.zfill(5) - expected = Series(["00001", np.nan, "00aaa", np.nan, "45678"]) - tm.assert_series_equal(result, expected) - - def test_split(self): - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - - result = values.str.split("_") - exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - tm.assert_series_equal(result, exp) - - # more than one char - values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) - result = values.str.split("__") - tm.assert_series_equal(result, exp) - - result = values.str.split("__", expand=False) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) - result = mixed.str.split("_") - exp = Series( - [ - ["a", "b", "c"], - np.nan, - ["d", "e", "f"], - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ] - ) - assert isinstance(result, Series) - tm.assert_almost_equal(result, exp) - - result = mixed.str.split("_", expand=False) - assert isinstance(result, Series) - tm.assert_almost_equal(result, exp) - - # regex split - values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) - result = values.str.split("[,_]") - exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - tm.assert_series_equal(result, exp) - - @pytest.mark.parametrize("dtype", [object, "string"]) - @pytest.mark.parametrize("method", ["split", "rsplit"]) - def test_split_n(self, dtype, method): - s = Series(["a b", pd.NA, "b c"], dtype=dtype) - expected = Series([["a", "b"], pd.NA, ["b", "c"]]) - - result = getattr(s.str, method)(" ", n=None) - tm.assert_series_equal(result, expected) - - result = getattr(s.str, method)(" ", n=0) - tm.assert_series_equal(result, expected) - - def test_rsplit(self): - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - result = values.str.rsplit("_") - exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - tm.assert_series_equal(result, exp) - - # more than one char - values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) - result = values.str.rsplit("__") - tm.assert_series_equal(result, exp) - - result = values.str.rsplit("__", expand=False) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) - result = mixed.str.rsplit("_") - exp = Series( - [ - ["a", "b", "c"], - np.nan, - ["d", "e", "f"], - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ] - ) - assert isinstance(result, Series) - tm.assert_almost_equal(result, exp) - - result = mixed.str.rsplit("_", expand=False) - assert isinstance(result, Series) - tm.assert_almost_equal(result, exp) - - # regex split is not supported by rsplit - values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) - result = values.str.rsplit("[,_]") - exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) - tm.assert_series_equal(result, exp) - - # setting max number of splits, make sure it's from reverse - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - result = values.str.rsplit("_", n=1) - exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) - tm.assert_series_equal(result, exp) - - def test_split_blank_string(self): - # expand blank split GH 20067 - values = Series([""], name="test") - result = values.str.split(expand=True) - exp = DataFrame([[]]) # NOTE: this is NOT an empty DataFrame - tm.assert_frame_equal(result, exp) - - values = Series(["a b c", "a b", "", " "], name="test") - result = values.str.split(expand=True) - exp = DataFrame( - [ - ["a", "b", "c"], - ["a", "b", np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - ] - ) - tm.assert_frame_equal(result, exp) - - def test_split_noargs(self): - # #1859 - s = Series(["Wes McKinney", "Travis Oliphant"]) - result = s.str.split() - expected = ["Travis", "Oliphant"] - assert result[1] == expected - result = s.str.rsplit() - assert result[1] == expected - - def test_split_maxsplit(self): - # re.split 0, str.split -1 - s = Series(["bd asdf jfg", "kjasdflqw asdfnfk"]) - - result = s.str.split(n=-1) - xp = s.str.split() - tm.assert_series_equal(result, xp) - - result = s.str.split(n=0) - tm.assert_series_equal(result, xp) - - xp = s.str.split("asdf") - result = s.str.split("asdf", n=0) - tm.assert_series_equal(result, xp) - - result = s.str.split("asdf", n=-1) - tm.assert_series_equal(result, xp) - - def test_split_no_pat_with_nonzero_n(self): - s = Series(["split once", "split once too!"]) - result = s.str.split(n=1) - expected = Series({0: ["split", "once"], 1: ["split", "once too!"]}) - tm.assert_series_equal(expected, result, check_index_type=False) - - def test_split_to_dataframe(self): - s = Series(["nosplit", "alsonosplit"]) - result = s.str.split("_", expand=True) - exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) - tm.assert_frame_equal(result, exp) - - s = Series(["some_equal_splits", "with_no_nans"]) - result = s.str.split("_", expand=True) - exp = DataFrame( - {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} - ) - tm.assert_frame_equal(result, exp) - - s = Series(["some_unequal_splits", "one_of_these_things_is_not"]) - result = s.str.split("_", expand=True) - exp = DataFrame( - { - 0: ["some", "one"], - 1: ["unequal", "of"], - 2: ["splits", "these"], - 3: [np.nan, "things"], - 4: [np.nan, "is"], - 5: [np.nan, "not"], - } - ) - tm.assert_frame_equal(result, exp) - - s = Series(["some_splits", "with_index"], index=["preserve", "me"]) - result = s.str.split("_", expand=True) - exp = DataFrame( - {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] - ) - tm.assert_frame_equal(result, exp) - - with pytest.raises(ValueError, match="expand must be"): - s.str.split("_", expand="not_a_boolean") - - def test_split_to_multiindex_expand(self): - # https://github.com/pandas-dev/pandas/issues/23677 - - idx = Index(["nosplit", "alsonosplit", np.nan]) - result = idx.str.split("_", expand=True) - exp = idx - tm.assert_index_equal(result, exp) - assert result.nlevels == 1 - - idx = Index(["some_equal_splits", "with_no_nans", np.nan, None]) - result = idx.str.split("_", expand=True) - exp = MultiIndex.from_tuples( - [ - ("some", "equal", "splits"), - ("with", "no", "nans"), - [np.nan, np.nan, np.nan], - [None, None, None], - ] - ) - tm.assert_index_equal(result, exp) - assert result.nlevels == 3 - - idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None]) - result = idx.str.split("_", expand=True) - exp = MultiIndex.from_tuples( - [ - ("some", "unequal", "splits", np.nan, np.nan, np.nan), - ("one", "of", "these", "things", "is", "not"), - (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan), - (None, None, None, None, None, None), - ] - ) - tm.assert_index_equal(result, exp) - assert result.nlevels == 6 - - with pytest.raises(ValueError, match="expand must be"): - idx.str.split("_", expand="not_a_boolean") - - def test_rsplit_to_dataframe_expand(self): - s = Series(["nosplit", "alsonosplit"]) - result = s.str.rsplit("_", expand=True) - exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) - tm.assert_frame_equal(result, exp) - - s = Series(["some_equal_splits", "with_no_nans"]) - result = s.str.rsplit("_", expand=True) - exp = DataFrame( - {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} - ) - tm.assert_frame_equal(result, exp) - - result = s.str.rsplit("_", expand=True, n=2) - exp = DataFrame( - {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} - ) - tm.assert_frame_equal(result, exp) - - result = s.str.rsplit("_", expand=True, n=1) - exp = DataFrame({0: ["some_equal", "with_no"], 1: ["splits", "nans"]}) - tm.assert_frame_equal(result, exp) - - s = Series(["some_splits", "with_index"], index=["preserve", "me"]) - result = s.str.rsplit("_", expand=True) - exp = DataFrame( - {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] - ) - tm.assert_frame_equal(result, exp) - - def test_rsplit_to_multiindex_expand(self): - idx = Index(["nosplit", "alsonosplit"]) - result = idx.str.rsplit("_", expand=True) - exp = idx - tm.assert_index_equal(result, exp) - assert result.nlevels == 1 - - idx = Index(["some_equal_splits", "with_no_nans"]) - result = idx.str.rsplit("_", expand=True) - exp = MultiIndex.from_tuples( - [("some", "equal", "splits"), ("with", "no", "nans")] - ) - tm.assert_index_equal(result, exp) - assert result.nlevels == 3 - - idx = Index(["some_equal_splits", "with_no_nans"]) - result = idx.str.rsplit("_", expand=True, n=1) - exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")]) - tm.assert_index_equal(result, exp) - assert result.nlevels == 2 - - def test_split_nan_expand(self): - # gh-18450 - s = Series(["foo,bar,baz", np.nan]) - result = s.str.split(",", expand=True) - exp = DataFrame([["foo", "bar", "baz"], [np.nan, np.nan, np.nan]]) - tm.assert_frame_equal(result, exp) - - # check that these are actually np.nan and not None - # TODO see GH 18463 - # tm.assert_frame_equal does not differentiate - assert all(np.isnan(x) for x in result.iloc[1]) - - def test_split_with_name(self): - # GH 12617 - - # should preserve name - s = Series(["a,b", "c,d"], name="xxx") - res = s.str.split(",") - exp = Series([["a", "b"], ["c", "d"]], name="xxx") - tm.assert_series_equal(res, exp) - - res = s.str.split(",", expand=True) - exp = DataFrame([["a", "b"], ["c", "d"]]) - tm.assert_frame_equal(res, exp) - - idx = Index(["a,b", "c,d"], name="xxx") - res = idx.str.split(",") - exp = Index([["a", "b"], ["c", "d"]], name="xxx") - assert res.nlevels == 1 - tm.assert_index_equal(res, exp) - - res = idx.str.split(",", expand=True) - exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")]) - assert res.nlevels == 2 - tm.assert_index_equal(res, exp) - - def test_partition_series(self): - # https://github.com/pandas-dev/pandas/issues/23558 - - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) - - result = values.str.partition("_", expand=False) - exp = Series( - [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h"), None] - ) - tm.assert_series_equal(result, exp) - - result = values.str.rpartition("_", expand=False) - exp = Series( - [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h"), None] - ) - tm.assert_series_equal(result, exp) - - # more than one char - values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None]) - result = values.str.partition("__", expand=False) - exp = Series( - [ - ("a", "__", "b__c"), - ("c", "__", "d__e"), - np.nan, - ("f", "__", "g__h"), - None, - ] - ) - tm.assert_series_equal(result, exp) - - result = values.str.rpartition("__", expand=False) - exp = Series( - [ - ("a__b", "__", "c"), - ("c__d", "__", "e"), - np.nan, - ("f__g", "__", "h"), - None, - ] - ) - tm.assert_series_equal(result, exp) - - # None - values = Series(["a b c", "c d e", np.nan, "f g h", None]) - result = values.str.partition(expand=False) - exp = Series( - [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None] - ) - tm.assert_series_equal(result, exp) - - result = values.str.rpartition(expand=False) - exp = Series( - [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None] - ) - tm.assert_series_equal(result, exp) - - # Not split - values = Series(["abc", "cde", np.nan, "fgh", None]) - result = values.str.partition("_", expand=False) - exp = Series([("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None]) - tm.assert_series_equal(result, exp) - - result = values.str.rpartition("_", expand=False) - exp = Series([("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None]) - tm.assert_series_equal(result, exp) - - # unicode - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - - result = values.str.partition("_", expand=False) - exp = Series([("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")]) - tm.assert_series_equal(result, exp) - - result = values.str.rpartition("_", expand=False) - exp = Series([("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")]) - tm.assert_series_equal(result, exp) - - # compare to standard lib - values = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"]) - result = values.str.partition("_", expand=False).tolist() - assert result == [v.partition("_") for v in values] - result = values.str.rpartition("_", expand=False).tolist() - assert result == [v.rpartition("_") for v in values] - - def test_partition_index(self): - # https://github.com/pandas-dev/pandas/issues/23558 - - values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None]) - - result = values.str.partition("_", expand=False) - exp = Index( - np.array( - [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None], - dtype=object, - ) - ) - tm.assert_index_equal(result, exp) - assert result.nlevels == 1 - - result = values.str.rpartition("_", expand=False) - exp = Index( - np.array( - [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None], - dtype=object, - ) - ) - tm.assert_index_equal(result, exp) - assert result.nlevels == 1 - - result = values.str.partition("_") - exp = Index( - [ - ("a", "_", "b_c"), - ("c", "_", "d_e"), - ("f", "_", "g_h"), - (np.nan, np.nan, np.nan), - (None, None, None), - ] - ) - tm.assert_index_equal(result, exp) - assert isinstance(result, MultiIndex) - assert result.nlevels == 3 - - result = values.str.rpartition("_") - exp = Index( - [ - ("a_b", "_", "c"), - ("c_d", "_", "e"), - ("f_g", "_", "h"), - (np.nan, np.nan, np.nan), - (None, None, None), - ] - ) - tm.assert_index_equal(result, exp) - assert isinstance(result, MultiIndex) - assert result.nlevels == 3 - - def test_partition_to_dataframe(self): - # https://github.com/pandas-dev/pandas/issues/23558 - - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) - result = values.str.partition("_") - exp = DataFrame( - { - 0: ["a", "c", np.nan, "f", None], - 1: ["_", "_", np.nan, "_", None], - 2: ["b_c", "d_e", np.nan, "g_h", None], - } - ) - tm.assert_frame_equal(result, exp) - - result = values.str.rpartition("_") - exp = DataFrame( - { - 0: ["a_b", "c_d", np.nan, "f_g", None], - 1: ["_", "_", np.nan, "_", None], - 2: ["c", "e", np.nan, "h", None], - } - ) - tm.assert_frame_equal(result, exp) - - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) - result = values.str.partition("_", expand=True) - exp = DataFrame( - { - 0: ["a", "c", np.nan, "f", None], - 1: ["_", "_", np.nan, "_", None], - 2: ["b_c", "d_e", np.nan, "g_h", None], - } - ) - tm.assert_frame_equal(result, exp) - - result = values.str.rpartition("_", expand=True) - exp = DataFrame( - { - 0: ["a_b", "c_d", np.nan, "f_g", None], - 1: ["_", "_", np.nan, "_", None], - 2: ["c", "e", np.nan, "h", None], - } - ) - tm.assert_frame_equal(result, exp) - - def test_partition_with_name(self): - # GH 12617 - - s = Series(["a,b", "c,d"], name="xxx") - res = s.str.partition(",") - exp = DataFrame({0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}) - tm.assert_frame_equal(res, exp) - - # should preserve name - res = s.str.partition(",", expand=False) - exp = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx") - tm.assert_series_equal(res, exp) - - idx = Index(["a,b", "c,d"], name="xxx") - res = idx.str.partition(",") - exp = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")]) - assert res.nlevels == 3 - tm.assert_index_equal(res, exp) - - # should preserve name - res = idx.str.partition(",", expand=False) - exp = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx") - assert res.nlevels == 1 - tm.assert_index_equal(res, exp) - - def test_partition_sep_kwarg(self): - # GH 22676; depr kwarg "pat" in favor of "sep" - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - - expected = values.str.partition(sep="_") - result = values.str.partition("_") - tm.assert_frame_equal(result, expected) - - expected = values.str.rpartition(sep="_") - result = values.str.rpartition("_") - tm.assert_frame_equal(result, expected) - - def test_pipe_failures(self): - # #2119 - s = Series(["A|B|C"]) - - result = s.str.split("|") - exp = Series([["A", "B", "C"]]) - - tm.assert_series_equal(result, exp) - - result = s.str.replace("|", " ", regex=False) - exp = Series(["A B C"]) - - tm.assert_series_equal(result, exp) - - @pytest.mark.parametrize( - "start, stop, step, expected", - [ - (2, 5, None, Series(["foo", "bar", np.nan, "baz"])), - (0, 3, -1, Series(["", "", np.nan, ""])), - (None, None, -1, Series(["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"])), - (3, 10, 2, Series(["oto", "ato", np.nan, "aqx"])), - (3, 0, -1, Series(["ofa", "aba", np.nan, "aba"])), - ], - ) - def test_slice(self, start, stop, step, expected): - values = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"]) - result = values.str.slice(start, stop, step) - tm.assert_series_equal(result, expected) - - # mixed - mixed = Series( - ["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0] - ) - - rs = Series(mixed).str.slice(2, 5) - xp = Series(["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan]) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - rs = Series(mixed).str.slice(2, 5, -1) - xp = Series(["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan]) - - def test_slice_replace(self): - values = Series(["short", "a bit longer", "evenlongerthanthat", "", np.nan]) - - exp = Series(["shrt", "a it longer", "evnlongerthanthat", "", np.nan]) - result = values.str.slice_replace(2, 3) - tm.assert_series_equal(result, exp) - - exp = Series(["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]) - result = values.str.slice_replace(2, 3, "z") - tm.assert_series_equal(result, exp) - - exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) - result = values.str.slice_replace(2, 2, "z") - tm.assert_series_equal(result, exp) - - exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) - result = values.str.slice_replace(2, 1, "z") - tm.assert_series_equal(result, exp) - - exp = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]) - result = values.str.slice_replace(-1, None, "z") - tm.assert_series_equal(result, exp) - - exp = Series(["zrt", "zer", "zat", "z", np.nan]) - result = values.str.slice_replace(None, -2, "z") - tm.assert_series_equal(result, exp) - - exp = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]) - result = values.str.slice_replace(6, 8, "z") - tm.assert_series_equal(result, exp) - - exp = Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]) - result = values.str.slice_replace(-10, 3, "z") - tm.assert_series_equal(result, exp) - - def test_strip_lstrip_rstrip(self): - values = Series([" aa ", " bb \n", np.nan, "cc "]) - - result = values.str.strip() - exp = Series(["aa", "bb", np.nan, "cc"]) - tm.assert_series_equal(result, exp) - - result = values.str.lstrip() - exp = Series(["aa ", "bb \n", np.nan, "cc "]) - tm.assert_series_equal(result, exp) - - result = values.str.rstrip() - exp = Series([" aa", " bb", np.nan, "cc"]) - tm.assert_series_equal(result, exp) - - def test_strip_lstrip_rstrip_mixed(self): - # mixed - mixed = Series( - [" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0] - ) - - rs = Series(mixed).str.strip() - xp = Series(["aa", np.nan, "bb", np.nan, np.nan, np.nan, np.nan, np.nan]) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - rs = Series(mixed).str.lstrip() - xp = Series(["aa ", np.nan, "bb \t\n", np.nan, np.nan, np.nan, np.nan, np.nan]) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - rs = Series(mixed).str.rstrip() - xp = Series([" aa", np.nan, " bb", np.nan, np.nan, np.nan, np.nan, np.nan]) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - def test_strip_lstrip_rstrip_args(self): - values = Series(["xxABCxx", "xx BNSD", "LDFJH xx"]) - - rs = values.str.strip("x") - xp = Series(["ABC", " BNSD", "LDFJH "]) - tm.assert_series_equal(rs, xp) - - rs = values.str.lstrip("x") - xp = Series(["ABCxx", " BNSD", "LDFJH xx"]) - tm.assert_series_equal(rs, xp) - - rs = values.str.rstrip("x") - xp = Series(["xxABC", "xx BNSD", "LDFJH "]) - tm.assert_series_equal(rs, xp) - - def test_wrap(self): - # test values are: two words less than width, two words equal to width, - # two words greater than width, one word less than width, one word - # equal to width, one word greater than width, multiple tokens with - # trailing whitespace equal to width - values = Series( - [ - "hello world", - "hello world!", - "hello world!!", - "abcdefabcde", - "abcdefabcdef", - "abcdefabcdefa", - "ab ab ab ab ", - "ab ab ab ab a", - "\t", - ] - ) - - # expected values - xp = Series( - [ - "hello world", - "hello world!", - "hello\nworld!!", - "abcdefabcde", - "abcdefabcdef", - "abcdefabcdef\na", - "ab ab ab ab", - "ab ab ab ab\na", - "", - ] - ) - - rs = values.str.wrap(12, break_long_words=True) - tm.assert_series_equal(rs, xp) - - # test with pre and post whitespace (non-unicode), NaN, and non-ascii - # Unicode - values = Series([" pre ", np.nan, "\xac\u20ac\U00008000 abadcafe"]) - xp = Series([" pre", np.nan, "\xac\u20ac\U00008000 ab\nadcafe"]) - rs = values.str.wrap(6) - tm.assert_series_equal(rs, xp) - - def test_get(self): - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - - result = values.str.split("_").str.get(1) - expected = Series(["b", "d", np.nan, "g"]) - tm.assert_series_equal(result, expected) - - # mixed - mixed = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]) - - rs = Series(mixed).str.split("_").str.get(1) - xp = Series(["b", np.nan, "d", np.nan, np.nan, np.nan, np.nan, np.nan]) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - - # bounds testing - values = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"]) - - # positive index - result = values.str.split("_").str.get(2) - expected = Series(["3", "8", np.nan]) - tm.assert_series_equal(result, expected) - - # negative index - result = values.str.split("_").str.get(-3) - expected = Series(["3", "8", np.nan]) - tm.assert_series_equal(result, expected) - - def test_get_complex(self): - # GH 20671, getting value not in dict raising `KeyError` - values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}]) - - result = values.str.get(1) - expected = Series([2, 2, np.nan, "a"]) - tm.assert_series_equal(result, expected) - - result = values.str.get(-1) - expected = Series([3, 3, np.nan, np.nan]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("to_type", [tuple, list, np.array]) - def test_get_complex_nested(self, to_type): - values = Series([to_type([to_type([1, 2])])]) - - result = values.str.get(0) - expected = Series([to_type([1, 2])]) - tm.assert_series_equal(result, expected) - - result = values.str.get(1) - expected = Series([np.nan]) - tm.assert_series_equal(result, expected) - - def test_contains_moar(self): - # PR #1179 - s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) - - result = s.str.contains("a") - expected = Series( - [False, False, False, True, True, False, np.nan, False, False, True] - ) - tm.assert_series_equal(result, expected) - - result = s.str.contains("a", case=False) - expected = Series( - [True, False, False, True, True, False, np.nan, True, False, True] - ) - tm.assert_series_equal(result, expected) - - result = s.str.contains("Aa") - expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False] - ) - tm.assert_series_equal(result, expected) - - result = s.str.contains("ba") - expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False] - ) - tm.assert_series_equal(result, expected) - - result = s.str.contains("ba", case=False) - expected = Series( - [False, False, False, True, True, False, np.nan, True, False, False] - ) - tm.assert_series_equal(result, expected) - - def test_contains_nan(self): - # PR #14171 - s = Series([np.nan, np.nan, np.nan], dtype=np.object_) - - result = s.str.contains("foo", na=False) - expected = Series([False, False, False], dtype=np.bool_) - tm.assert_series_equal(result, expected) - - result = s.str.contains("foo", na=True) - expected = Series([True, True, True], dtype=np.bool_) - tm.assert_series_equal(result, expected) - - result = s.str.contains("foo", na="foo") - expected = Series(["foo", "foo", "foo"], dtype=np.object_) - tm.assert_series_equal(result, expected) - - result = s.str.contains("foo") - expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) - tm.assert_series_equal(result, expected) - - def test_replace_moar(self): - # PR #1179 - s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) - - result = s.str.replace("A", "YYY") - expected = Series( - ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"] - ) - tm.assert_series_equal(result, expected) - - result = s.str.replace("A", "YYY", case=False) - expected = Series( - [ - "YYY", - "B", - "C", - "YYYYYYbYYY", - "BYYYcYYY", - "", - np.nan, - "CYYYBYYY", - "dog", - "cYYYt", - ] - ) - tm.assert_series_equal(result, expected) - - result = s.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) - expected = Series( - [ - "A", - "B", - "C", - "XX-XX ba", - "XX-XX ca", - "", - np.nan, - "XX-XX BA", - "XX-XX ", - "XX-XX t", - ] - ) - tm.assert_series_equal(result, expected) - - def test_string_slice_get_syntax(self): - s = Series( - [ - "YYY", - "B", - "C", - "YYYYYYbYYY", - "BYYYcYYY", - np.nan, - "CYYYBYYY", - "dog", - "cYYYt", - ] - ) - - result = s.str[0] - expected = s.str.get(0) - tm.assert_series_equal(result, expected) - - result = s.str[:3] - expected = s.str.slice(stop=3) - tm.assert_series_equal(result, expected) - - result = s.str[2::-1] - expected = s.str.slice(start=2, step=-1) - tm.assert_series_equal(result, expected) - - def test_string_slice_out_of_bounds(self): - s = Series([(1, 2), (1,), (3, 4, 5)]) - - result = s.str[1] - expected = Series([2, np.nan, 4]) - - tm.assert_series_equal(result, expected) - - s = Series(["foo", "b", "ba"]) - result = s.str[1] - expected = Series(["o", np.nan, "a"]) - tm.assert_series_equal(result, expected) - - def test_match_findall_flags(self): - data = { - "Dave": "dave@google.com", - "Steve": "steve@gmail.com", - "Rob": "rob@gmail.com", - "Wes": np.nan, - } - data = Series(data) - - pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - - result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) - assert result.iloc[0].tolist() == ["dave", "google", "com"] - - result = data.str.match(pat, flags=re.IGNORECASE) - assert result[0] - - result = data.str.fullmatch(pat, flags=re.IGNORECASE) - assert result[0] - - result = data.str.findall(pat, flags=re.IGNORECASE) - assert result[0][0] == ("dave", "google", "com") - - result = data.str.count(pat, flags=re.IGNORECASE) - assert result[0] == 1 - - with tm.assert_produces_warning(UserWarning): - result = data.str.contains(pat, flags=re.IGNORECASE) - assert result[0] - - def test_encode_decode(self): - base = Series(["a", "b", "a\xe4"]) - series = base.str.encode("utf-8") - - f = lambda x: x.decode("utf-8") - result = series.str.decode("utf-8") - exp = series.map(f) - - tm.assert_series_equal(result, exp) - - def test_encode_decode_errors(self): - encodeBase = Series(["a", "b", "a\x9d"]) - - msg = ( - r"'charmap' codec can't encode character '\\x9d' in position 1: " - "character maps to " - ) - with pytest.raises(UnicodeEncodeError, match=msg): - encodeBase.str.encode("cp1252") - - f = lambda x: x.encode("cp1252", "ignore") - result = encodeBase.str.encode("cp1252", "ignore") - exp = encodeBase.map(f) - tm.assert_series_equal(result, exp) - - decodeBase = Series([b"a", b"b", b"a\x9d"]) - - msg = ( - "'charmap' codec can't decode byte 0x9d in position 1: " - "character maps to " - ) - with pytest.raises(UnicodeDecodeError, match=msg): - decodeBase.str.decode("cp1252") - - f = lambda x: x.decode("cp1252", "ignore") - result = decodeBase.str.decode("cp1252", "ignore") - exp = decodeBase.map(f) - - tm.assert_series_equal(result, exp) - - def test_normalize(self): - values = ["ABC", "ABC", "123", np.nan, "アイエ"] - s = Series(values, index=["a", "b", "c", "d", "e"]) - - normed = ["ABC", "ABC", "123", np.nan, "アイエ"] - expected = Series(normed, index=["a", "b", "c", "d", "e"]) - - result = s.str.normalize("NFKC") - tm.assert_series_equal(result, expected) - - expected = Series( - ["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"] - ) - - result = s.str.normalize("NFC") - tm.assert_series_equal(result, expected) - - with pytest.raises(ValueError, match="invalid normalization form"): - s.str.normalize("xxx") - - s = Index(["ABC", "123", "アイエ"]) - expected = Index(["ABC", "123", "アイエ"]) - result = s.str.normalize("NFKC") - tm.assert_index_equal(result, expected) - - def test_index_str_accessor_visibility(self): - from pandas.core.strings import StringMethods - - cases = [ - (["a", "b"], "string"), - (["a", "b", 1], "mixed-integer"), - (["a", "b", 1.3], "mixed"), - (["a", "b", 1.3, 1], "mixed-integer"), - (["aa", datetime(2011, 1, 1)], "mixed"), - ] - for values, tp in cases: - idx = Index(values) - assert isinstance(Series(values).str, StringMethods) - assert isinstance(idx.str, StringMethods) - assert idx.inferred_type == tp - - for values, tp in cases: - idx = Index(values) - assert isinstance(Series(values).str, StringMethods) - assert isinstance(idx.str, StringMethods) - assert idx.inferred_type == tp - - cases = [ - ([1, np.nan], "floating"), - ([datetime(2011, 1, 1)], "datetime64"), - ([timedelta(1)], "timedelta64"), - ] - for values, tp in cases: - idx = Index(values) - message = "Can only use .str accessor with string values" - with pytest.raises(AttributeError, match=message): - Series(values).str - with pytest.raises(AttributeError, match=message): - idx.str - assert idx.inferred_type == tp - - # MultiIndex has mixed dtype, but not allow to use accessor - idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")]) - assert idx.inferred_type == "mixed" - message = "Can only use .str accessor with Index, not MultiIndex" - with pytest.raises(AttributeError, match=message): - idx.str - - def test_str_accessor_no_new_attributes(self): - # https://github.com/pandas-dev/pandas/issues/10673 - s = Series(list("aabbcde")) - with pytest.raises(AttributeError, match="You cannot add any new attribute"): - s.str.xlabel = "a" - - def test_method_on_bytes(self): - lhs = Series(np.array(list("abc"), "S1").astype(object)) - rhs = Series(np.array(list("def"), "S1").astype(object)) - with pytest.raises(TypeError, match="Cannot use .str.cat with values of.*"): - lhs.str.cat(rhs) - - def test_casefold(self): - # GH25405 - expected = Series(["ss", np.nan, "case", "ssd"]) - s = Series(["ß", np.nan, "case", "ßd"]) - result = s.str.casefold() - - tm.assert_series_equal(result, expected) - - -def test_string_array(any_string_method): - method_name, args, kwargs = any_string_method - if method_name == "decode": - pytest.skip("decode requires bytes.") - - data = ["a", "bb", np.nan, "ccc"] - a = Series(data, dtype=object) - b = Series(data, dtype="string") - - expected = getattr(a.str, method_name)(*args, **kwargs) - result = getattr(b.str, method_name)(*args, **kwargs) - - if isinstance(expected, Series): - if expected.dtype == "object" and lib.is_string_array( - expected.dropna().values, - ): - assert result.dtype == "string" - result = result.astype(object) - - elif expected.dtype == "object" and lib.is_bool_array( - expected.values, skipna=True - ): - assert result.dtype == "boolean" - result = result.astype(object) - - elif expected.dtype == "bool": - assert result.dtype == "boolean" - result = result.astype("bool") - - elif expected.dtype == "float" and expected.isna().any(): - assert result.dtype == "Int64" - result = result.astype("float") - - elif isinstance(expected, DataFrame): - columns = expected.select_dtypes(include="object").columns - assert all(result[columns].dtypes == "string") - result[columns] = result[columns].astype(object) - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize( - "method,expected", - [ - ("count", [2, None]), - ("find", [0, None]), - ("index", [0, None]), - ("rindex", [2, None]), - ], -) -def test_string_array_numeric_integer_array(method, expected): - s = Series(["aba", None], dtype="string") - result = getattr(s.str, method)("a") - expected = Series(expected, dtype="Int64") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "method,expected", - [ - ("isdigit", [False, None, True]), - ("isalpha", [True, None, False]), - ("isalnum", [True, None, True]), - ("isdigit", [False, None, True]), - ], -) -def test_string_array_boolean_array(method, expected): - s = Series(["a", None, "1"], dtype="string") - result = getattr(s.str, method)() - expected = Series(expected, dtype="boolean") - tm.assert_series_equal(result, expected) - - -def test_string_array_extract(): - # https://github.com/pandas-dev/pandas/issues/30969 - # Only expand=False & multiple groups was failing - a = Series(["a1", "b2", "cc"], dtype="string") - b = Series(["a1", "b2", "cc"], dtype="object") - pat = r"(\w)(\d)" - - result = a.str.extract(pat, expand=False) - expected = b.str.extract(pat, expand=False) - assert all(result.dtypes == "string") - - result = result.astype(object) - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize("klass", [tuple, list, np.array, pd.Series, pd.Index]) -def test_cat_different_classes(klass): - # https://github.com/pandas-dev/pandas/issues/33425 - s = Series(["a", "b", "c"]) - result = s.str.cat(klass(["x", "y", "z"])) - expected = Series(["ax", "by", "cz"]) - tm.assert_series_equal(result, expected) - - -def test_str_get_stringarray_multiple_nans(): - s = Series(pd.array(["a", "ab", pd.NA, "abc"])) - result = s.str.get(2) - expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"])) - tm.assert_series_equal(result, expected) - - -def test_str_accessor_in_apply_func(): - # https://github.com/pandas-dev/pandas/issues/38979 - df = DataFrame(zip("abc", "def")) - expected = Series(["A/D", "B/E", "C/F"]) - result = df.apply(lambda f: "/".join(f.str.upper()), axis=1) - tm.assert_series_equal(result, expected) - - -def test_str_removeprefix(): - # https://github.com/pandas-dev/pandas/issues/36944 - - df = DataFrame( - {"A": ["str_string1", "str_string2", "str_string3", "string_no_prefix"]} - ) - df["A"] = df["A"].str.removeprefix("str_") - result = DataFrame({"A": ["string1", "string2", "string3", "string_no_prefix"]}) - - tm.assert_frame_equal(df, result) From 198bc37d8705b2282b2eb1efbf2f87feec457070 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Tue, 26 Jan 2021 13:55:26 +0100 Subject: [PATCH 18/21] fix pre-commit --- pandas/core/strings/accessor.py | 35 ++++++++++++++++++++++++++++- pandas/core/strings/object_array.py | 2 +- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 5b253022a502b..d5ef5938a6d71 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2886,7 +2886,40 @@ def casefold(self): ) @forbid_nonstring_types(["bytes"]) - def removeprefix(self, prefix=None): + def removeprefix(self, prefix: str = None): + """ + Remove a defined prefix from an object series. If the prefix is not present, + the original string will be returned. + + Parameters + ---------- + prefix: str, default None + Prefix to remove. + + Returns + ------- + Series/Index: object + The Series or Index with given prefix removed. + + See also + -------- + Series.str.strip : Remove leading and trailing characters. + + Examples + -------- + >>> s = pd.Series(["str_string1", "str_string2", "no_prefix"]) + >>> s + 0 str_string1 + 1 str_string2 + 2 no_prefix + dtype: object + + >>> s.str.removeprefix("str_") + 0 string1 + 1 string2 + 2 no_prefix + dtype: object + """ result = self._data.array._str_removeprefix(prefix) return self._wrap_result(result) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index dae8c868d054b..f4c02fd2cea7e 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -433,7 +433,7 @@ def _str_rstrip(self, to_strip=None): def _str_removeprefix(self, prefix=None): f_startswith = lambda x: x.startswith(prefix) - f_slice = lambda x: x[len(prefix):] + f_slice = lambda x: x[len(prefix) :] has_prefix = self._str_map(f_startswith) From a253720a5840fd989d58c20eb7cdcb2f1fffe070 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Fri, 12 Feb 2021 23:39:01 +0100 Subject: [PATCH 19/21] fix and working version --- pandas/core/strings/object_array.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index f4c02fd2cea7e..886588c702a77 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -434,7 +434,8 @@ def _str_rstrip(self, to_strip=None): def _str_removeprefix(self, prefix=None): f_startswith = lambda x: x.startswith(prefix) f_slice = lambda x: x[len(prefix) :] + has_prefix = self._str_map(f_startswith, dtype="object") + sliced = self._str_map(f_slice, dtype="object") - has_prefix = self._str_map(f_startswith) + return np.where(has_prefix, sliced, self) - return np.where(has_prefix, self._str_map(f_slice), self) From 46a212250ec70241e3f74837a7a4b81188f1a9db Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Fri, 12 Feb 2021 23:40:26 +0100 Subject: [PATCH 20/21] remove duplicated line --- doc/source/whatsnew/v1.3.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9c283072d14da..3349fb6864de4 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -52,7 +52,6 @@ Other enhancements - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`Series.apply` can now accept list-like or dictionary-like arguments that aren't lists or dictionaries, e.g. ``ser.apply(np.array(["sum", "mean"]))``, which was already the case for :meth:`DataFrame.apply` (:issue:`39140`) -- :meth:`.Styler.set_tooltips` allows on hover tooltips to be added to styled HTML dataframes. - :meth:`DataFrame.plot.scatter` can now accept a categorical column as the argument to ``c`` (:issue:`12380`, :issue:`31357`) - :meth:`.Styler.set_tooltips` allows on hover tooltips to be added to styled HTML dataframes (:issue:`35643`, :issue:`21266`, :issue:`39317`) - :meth:`.Styler.set_tooltips_class` and :meth:`.Styler.set_table_styles` amended to optionally allow certain css-string input arguments (:issue:`39564`) From ab5ec1cf78e8ebb5b2c99d0b9e613bc1c6d32565 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Fri, 12 Feb 2021 23:46:10 +0100 Subject: [PATCH 21/21] fix black --- pandas/core/strings/object_array.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 886588c702a77..4e7d474693dce 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -438,4 +438,3 @@ def _str_removeprefix(self, prefix=None): sliced = self._str_map(f_slice, dtype="object") return np.where(has_prefix, sliced, self) -