From d31faac5a87315e31e0c1b543572fb5cbe6532ad Mon Sep 17 00:00:00 2001 From: ujex256 <105550500+ujex256@users.noreply.github.com> Date: Tue, 25 Jun 2024 18:42:24 +0900 Subject: [PATCH 1/4] fix: add str.isascii method --- pandas/core/strings/base.py | 4 ++++ pandas/core/strings/object_array.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 1281a03e297f9..1920ad49e0aaa 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -174,6 +174,10 @@ def _str_isalnum(self): def _str_isalpha(self): pass + @abc.abstractmethod + def _str_isascii(self): + pass + @abc.abstractmethod def _str_isdecimal(self): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 290a28ab60ae1..6ba53fb5a7c8e 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -409,6 +409,9 @@ def _str_isalnum(self): def _str_isalpha(self): return self._str_map(str.isalpha, dtype="bool") + def _str_isascii(self): + return self._str_map(str.isascii, dtype="bool") + def _str_isdecimal(self): return self._str_map(str.isdecimal, dtype="bool") From d48988561514e54298d13da72b6a0b715758d40f Mon Sep 17 00:00:00 2001 From: ujex256 <105550500+ujex256@users.noreply.github.com> Date: Tue, 25 Jun 2024 20:48:59 +0900 Subject: [PATCH 2/4] docs: add description of isascii() --- pandas/core/strings/accessor.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index dd9276179cf4d..86d38a87f0cc4 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3320,7 +3320,8 @@ def casefold(self): This is equivalent to running the Python string method :meth:`str.%(method)s` for each element of the Series/Index. If a string - has zero characters, ``False`` is returned for that check. 
+ has zero characters, ``False`` is returned for that check + except for ``isascii``, which returns ``True`` for an empty string. Returns ------- @@ -3333,6 +3334,7 @@ def casefold(self): Series.str.isalpha : Check whether all characters are alphabetic. Series.str.isnumeric : Check whether all characters are numeric. Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isascii : Check whether all characters are ASCII characters. Series.str.isdigit : Check whether all characters are digits. Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. @@ -3367,6 +3369,13 @@ def casefold(self): 3 False dtype: bool + >>> s1.str.isascii() + 0 True + 1 True + 2 True + 3 True + dtype: bool + Note that checks against characters mixed with any additional punctuation or whitespace will evaluate to false for an alphanumeric check. @@ -3457,6 +3466,7 @@ def casefold(self): """ _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"} _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"} + _doc_args["isascii"] = {"type": "ASCII characters", "method": "isascii"} _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"} _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"} _doc_args["islower"] = {"type": "lowercase", "method": "islower"} @@ -3472,6 +3482,9 @@ def casefold(self): isalpha = _map_and_wrap( "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] ) + isascii = _map_and_wrap( + "isascii", docstring=_shared_docs["ismethods"] % _doc_args["isascii"] + ) isdigit = _map_and_wrap( "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] ) From f4f75ab65a069fa48e9e2ad7f6b4bf01e9a30a8e Mon Sep 17 00:00:00 2001 From: ujex256 <105550500+ujex256@users.noreply.github.com> Date: Tue, 25 Jun 2024 22:24:32 +0900 Subject: [PATCH 3/4] test: add str.isascii testcase --- pandas/tests/strings/conftest.py | 2 +- pandas/tests/strings/test_strings.py | 13 
+++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 92b7b16da3c1f..ba60e48f7c23f 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -68,6 +68,7 @@ "get_dummies", "isalnum", "isalpha", + "isascii", "isdecimal", "isdigit", "islower", @@ -97,7 +98,6 @@ ) ids, _, _ = zip(*_any_string_method) # use method name as fixture-id missing_methods = {f for f in dir(StringMethods) if not f.startswith("_")} - set(ids) - # test that the above list captures all methods of StringMethods assert not missing_methods diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 25e4e1f9ec50c..8d604afd2fc1a 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -217,6 +217,19 @@ def test_ismethods(method, expected, any_string_dtype): assert list(result) == expected +def test_isascii(any_string_dtype): + ser = Series( + ["a", "bb", "123", "あ", "\n", "", " ", "¼"], + dtype=any_string_dtype, + ) + expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + result = ser.str.isascii() + expected = Series( + [True, True, True, False, True, True, True, False], dtype=expected_dtype + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "method, expected", [ From b9056cbee3e7b91f2076401212b065330477fa4a Mon Sep 17 00:00:00 2001 From: ujex256 <105550500+ujex256@users.noreply.github.com> Date: Tue, 25 Jun 2024 22:45:47 +0900 Subject: [PATCH 4/4] wip: _str_isascii --- pandas/core/arrays/string_arrow.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 97c06149d0b7e..7ffad20aebfd8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -459,6 +459,9 @@ def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) return 
self._result_converter(result) + def _str_isascii(self): + return super()._str_isascii() + def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) return self._result_converter(result)