From 0349837e3122650ad2c5a9f978a89fdbb0782c3d Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Thu, 25 Apr 2024 19:24:44 +0800 Subject: [PATCH 1/9] Add test --- pandas/tests/extension/test_arrow.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 79440b55dd5dd..c9cf264b1e4f8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2296,6 +2296,16 @@ def test_str_split_pat_none(method): tm.assert_series_equal(result, expected) +def test_str_split_regex_none(): + # GH 58321 + ser = pd.Series(["230/270/270", "240-290-290"], dtype=ArrowDtype(pa.string())) + result = ser.str.split(r"/|-", regex=None) + expected = pd.Series( + ArrowExtensionArray(pa.array([["230", "270", "270"], ["240", "290", "290"]])) + ) + tm.assert_series_equal(result, expected) + + def test_str_split(): # GH 52401 ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string())) From 4948784959a2f4840e291aeb34874ffaa94a6295 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Thu, 25 Apr 2024 19:34:04 +0800 Subject: [PATCH 2/9] Fix --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1154130b9bed3..47bcd73e270aa 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2579,7 +2579,7 @@ def _str_split( n = None if pat is None: split_func = pc.utf8_split_whitespace - elif regex: + elif regex or (regex is None and len(pat) != 1): split_func = functools.partial(pc.split_pattern_regex, pattern=pat) else: split_func = functools.partial(pc.split_pattern, pattern=pat) From 14c059ce428f728b85c93d3b2d8cdbc963662cb6 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Thu, 25 Apr 2024 19:34:56 +0800 Subject: [PATCH 3/9] Add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c77348b365370..e4bb4b0c5604a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -391,6 +391,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.str.split` would not treat ``pat`` as regex when ``regex=None`` for series having ``pd.ArrowDtype(pa.string())`` dtype (:issue:`58321`) - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`) - From f0c2097aba28cbe5df1a0c9dc10a86d5fa582826 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Tue, 30 Apr 2024 16:33:55 +0800 Subject: [PATCH 4/9] Move logic outside --- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/strings/accessor.py | 3 +++ pandas/core/strings/object_array.py | 8 +------- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b0be934576b61..0240433cdb683 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2581,7 +2581,7 @@ def _str_split( n = None if pat is None: split_func = pc.utf8_split_whitespace - elif regex or (regex is None and len(pat) != 1): + elif regex: split_func = functools.partial(pc.split_pattern_regex, pattern=pat) else: split_func = functools.partial(pc.split_pattern, pattern=pat) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index d274c1d7a5aff..cb1fae32d4c7f 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -910,6 +910,9 @@ def split( ) if is_re(pat): regex = True + elif pat is not None and regex is None: + # regex is None so link to old behavior #43563 + regex = len(pat) != 1 result = self._data.array._str_split(pat, n, expand, regex) if self._data.dtype == "category": dtype = self._data.dtype.categories.dtype diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index bdcf55e61d2d1..91b9e544f72cf 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -339,14 +339,8 @@ def _str_split( new_pat: str | re.Pattern if regex is True or isinstance(pat, re.Pattern): new_pat = re.compile(pat) - elif regex is False: - new_pat = pat - # regex is None so link to old behavior #43563 else: - if len(pat) == 1: - new_pat = pat - else: - new_pat = re.compile(pat) + new_pat = pat if isinstance(new_pat, re.Pattern): if n is None or n == -1: From 6f93a8d9a8d50acbbf055b328cc51ccce2018a11 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Tue, 30 Apr 2024 16:40:59 +0800 Subject: [PATCH 5/9] Move test --- pandas/conftest.py | 20 +++++++++++- pandas/tests/extension/test_arrow.py | 10 ------ pandas/tests/strings/test_split_partition.py | 32 +++++++++++++++----- 3 files changed, 43 insertions(+), 19 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 21100178262c8..f1cd91b220bd2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -92,7 +92,6 @@ except ImportError: has_pyarrow = False else: - del pa has_pyarrow = True import zoneinfo @@ -1367,6 +1366,25 @@ def any_string_dtype(request): return request.param +@pytest.fixture( + params=[ + "object", + "string[python]", + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + pytest.param(pd.ArrowDtype(pa.string()), marks=td.skip_if_no("pyarrow")), + ] +) +def any_string_dtype_2(request): + """ + Parametrized fixture for string dtypes. + * 'object' + * 'string[python]' + * 'string[pyarrow]' + """ + return request.param + + @pytest.fixture(params=tm.DATETIME64_DTYPES) def datetime64_dtype(request): """ diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5b66b40357fe8..7d31fe6085c3a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2296,16 +2296,6 @@ def test_str_split_pat_none(method): tm.assert_series_equal(result, expected) -def test_str_split_regex_none(): - # GH 58321 - ser = pd.Series(["230/270/270", "240-290-290"], dtype=ArrowDtype(pa.string())) - result = ser.str.split(r"/|-", regex=None) - expected = pd.Series( - ArrowExtensionArray(pa.array([["230", "270", "270"], ["240", "290", "290"]])) - ) - tm.assert_series_equal(result, expected) - - def test_str_split(): # GH 52401 ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string())) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 452e5ec5cf939..53c50304cc8a0 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -17,6 +17,10 @@ object_pyarrow_numpy, ) +pa = pytest.importorskip("pyarrow") + +from pandas.core.arrays.arrow.array import ArrowExtensionArray + @pytest.mark.parametrize("method", ["split", "rsplit"]) def test_split(any_string_dtype, method): @@ -59,27 +63,39 @@ def test_split_regex(any_string_dtype): tm.assert_series_equal(result, exp) -def test_split_regex_explicit(any_string_dtype): +def test_split_regex_explicit(any_string_dtype_2): # explicit regex = True split with compiled regex regex_pat = re.compile(r".jpg") - values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype) - result = values.str.split(regex_pat) - exp = Series([["xx", "zzz", ""]]) - tm.assert_series_equal(result, exp) + values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype_2) + + if not isinstance(any_string_dtype_2, pd.ArrowDtype): + # ArrowDtype does not support compiled regex + result = values.str.split(regex_pat) + exp = Series([["xx", "zzz", ""]]) + tm.assert_series_equal(result, exp) # explicit regex = False split result = values.str.split(r"\.jpg", regex=False) - exp = Series([["xxxjpgzzz.jpg"]]) + if not isinstance(any_string_dtype_2, pd.ArrowDtype): + exp = Series([["xxxjpgzzz.jpg"]]) + else: + exp = Series(ArrowExtensionArray(pa.array([["xxxjpgzzz.jpg"]]))) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length == 1 result = values.str.split(r".") - exp = Series([["xxxjpgzzz", "jpg"]]) + if not isinstance(any_string_dtype_2, pd.ArrowDtype): + exp = Series([["xxxjpgzzz", "jpg"]]) + else: + exp = Series(ArrowExtensionArray(pa.array([["xxxjpgzzz", "jpg"]]))) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length != 1 result = values.str.split(r".jpg") - exp = Series([["xx", "zzz", ""]]) + if not isinstance(any_string_dtype_2, pd.ArrowDtype): + exp = Series([["xx", "zzz", ""]]) + else: + exp = Series(ArrowExtensionArray(pa.array([["xx", "zzz", ""]]))) tm.assert_series_equal(result, exp) # regex=False with pattern compiled regex raises error From b9b3197e22f638ed0ce5cbe89ed9aa4b8aeafefa Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Sun, 12 May 2024 23:39:05 +0800 Subject: [PATCH 6/9] Revert "Move test" This reverts commit 6f93a8d9a8d50acbbf055b328cc51ccce2018a11. --- pandas/conftest.py | 20 +----------- pandas/tests/extension/test_arrow.py | 10 ++++++ pandas/tests/strings/test_split_partition.py | 32 +++++--------------- 3 files changed, 19 insertions(+), 43 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index f1cd91b220bd2..21100178262c8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -92,6 +92,7 @@ except ImportError: has_pyarrow = False else: + del pa has_pyarrow = True import zoneinfo @@ -1366,25 +1367,6 @@ def any_string_dtype(request): return request.param -@pytest.fixture( - params=[ - "object", - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param(pd.ArrowDtype(pa.string()), marks=td.skip_if_no("pyarrow")), - ] -) -def any_string_dtype_2(request): - """ - Parametrized fixture for string dtypes. - * 'object' - * 'string[python]' - * 'string[pyarrow]' - """ - return request.param - - @pytest.fixture(params=tm.DATETIME64_DTYPES) def datetime64_dtype(request): """ diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7d31fe6085c3a..5b66b40357fe8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2296,6 +2296,16 @@ def test_str_split_pat_none(method): tm.assert_series_equal(result, expected) +def test_str_split_regex_none(): + # GH 58321 + ser = pd.Series(["230/270/270", "240-290-290"], dtype=ArrowDtype(pa.string())) + result = ser.str.split(r"/|-", regex=None) + expected = pd.Series( + ArrowExtensionArray(pa.array([["230", "270", "270"], ["240", "290", "290"]])) + ) + tm.assert_series_equal(result, expected) + + def test_str_split(): # GH 52401 ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string())) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 53c50304cc8a0..452e5ec5cf939 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -17,10 +17,6 @@ object_pyarrow_numpy, ) -pa = pytest.importorskip("pyarrow") - -from pandas.core.arrays.arrow.array import ArrowExtensionArray - @pytest.mark.parametrize("method", ["split", "rsplit"]) def test_split(any_string_dtype, method): @@ -63,39 +59,27 @@ def test_split_regex(any_string_dtype): tm.assert_series_equal(result, exp) -def test_split_regex_explicit(any_string_dtype_2): +def test_split_regex_explicit(any_string_dtype): # explicit regex = True split with compiled regex regex_pat = re.compile(r".jpg") - values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype_2) - - if not isinstance(any_string_dtype_2, pd.ArrowDtype): - # ArrowDtype does not support compiled regex - result = values.str.split(regex_pat) - exp = Series([["xx", "zzz", ""]]) - tm.assert_series_equal(result, exp) + values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype) + result = values.str.split(regex_pat) + exp = Series([["xx", "zzz", ""]]) + tm.assert_series_equal(result, exp) # explicit regex = False split result = values.str.split(r"\.jpg", regex=False) - if not isinstance(any_string_dtype_2, pd.ArrowDtype): - exp = Series([["xxxjpgzzz.jpg"]]) - else: - exp = Series(ArrowExtensionArray(pa.array([["xxxjpgzzz.jpg"]]))) + exp = Series([["xxxjpgzzz.jpg"]]) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length == 1 result = values.str.split(r".") - if not isinstance(any_string_dtype_2, pd.ArrowDtype): - exp = Series([["xxxjpgzzz", "jpg"]]) - else: - exp = Series(ArrowExtensionArray(pa.array([["xxxjpgzzz", "jpg"]]))) + exp = Series([["xxxjpgzzz", "jpg"]]) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length != 1 result = values.str.split(r".jpg") - if not isinstance(any_string_dtype_2, pd.ArrowDtype): - exp = Series([["xx", "zzz", ""]]) - else: - exp = Series(ArrowExtensionArray(pa.array([["xx", "zzz", ""]]))) + exp = Series([["xx", "zzz", ""]]) tm.assert_series_equal(result, exp) # regex=False with pattern compiled regex raises error From d9199577110e9cbad308640d3bd2a8b3c4e0d049 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Sun, 12 May 2024 23:49:14 +0800 Subject: [PATCH 7/9] Add test --- pandas/tests/extension/test_arrow.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5b66b40357fe8..3a5493529808d 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2296,14 +2296,25 @@ def test_str_split_pat_none(method): tm.assert_series_equal(result, expected) -def test_str_split_regex_none(): +def test_str_split_regex_explicit(): # GH 58321 - ser = pd.Series(["230/270/270", "240-290-290"], dtype=ArrowDtype(pa.string())) - result = ser.str.split(r"/|-", regex=None) - expected = pd.Series( - ArrowExtensionArray(pa.array([["230", "270", "270"], ["240", "290", "290"]])) - ) - tm.assert_series_equal(result, expected) + # adapted from tests/strings/test_split_partition.py + values = pd.Series("xxxjpgzzz.jpg", dtype=ArrowDtype(pa.string())) + + # explicit regex = False split + result = values.str.split(r"\.jpg", regex=False) + exp = pd.Series(ArrowExtensionArray(pa.array([["xxxjpgzzz.jpg"]]))) + tm.assert_series_equal(result, exp) + + # non explicit regex split, pattern length == 1 + result = values.str.split(r".") + exp = pd.Series(ArrowExtensionArray(pa.array([["xxxjpgzzz", "jpg"]]))) + tm.assert_series_equal(result, exp) + + # non explicit regex split, pattern length != 1 + result = values.str.split(r".jpg") + exp = pd.Series(ArrowExtensionArray(pa.array([["xx", "zzz", ""]]))) + tm.assert_series_equal(result, exp) def test_str_split(): From 20e5ebe063708d7e501ca5edf67e32d813942049 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Mon, 3 Jun 2024 13:24:10 +0800 Subject: [PATCH 8/9] pass mypy type check --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index d08e396441907..81a703c889370 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -920,7 +920,7 @@ def split( ) if is_re(pat): regex = True - elif pat is not None and regex is None: + elif isinstance(pat, str) and regex is None: # regex is None so link to old behavior #43563 regex = len(pat) != 1 result = self._data.array._str_split(pat, n, expand, regex) From 14fd864d3b20528479c9093d712621e4ccfe63cd Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Thu, 27 Jun 2024 20:11:41 +0800 Subject: [PATCH 9/9] keep _str_split --- pandas/core/strings/object_array.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 91b9e544f72cf..bdcf55e61d2d1 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -339,8 +339,14 @@ def _str_split( new_pat: str | re.Pattern if regex is True or isinstance(pat, re.Pattern): new_pat = re.compile(pat) - else: + elif regex is False: new_pat = pat + # regex is None so link to old behavior #43563 + else: + if len(pat) == 1: + new_pat = pat + else: + new_pat = re.compile(pat) if isinstance(new_pat, re.Pattern): if n is None or n == -1: