Skip to content

Commit 6d16567

Browse files
authored
WARN: PerformanceWarning for non-pyarrow fallback (#46732)
1 parent 5fcbef5 commit 6d16567

File tree

7 files changed

+229
-69
lines changed

7 files changed

+229
-69
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ Other enhancements
9595
- :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
9696
- :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
9797
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
98+
- A :class:`errors.PerformanceWarning` is now emitted when using the ``string[pyarrow]`` dtype with methods that fall back to a non-pyarrow code path instead of dispatching to ``pyarrow.compute`` (:issue:`42613`)
9899

99100
.. ---------------------------------------------------------------------------
100101
.. _whatsnew_150.notable_bug_fixes:

pandas/core/arrays/arrow/_arrow_utils.py

+15
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,28 @@
11
from __future__ import annotations
22

33
import json
4+
import warnings
45

56
import numpy as np
67
import pyarrow
78

9+
from pandas.errors import PerformanceWarning
10+
from pandas.util._exceptions import find_stack_level
11+
812
from pandas.core.arrays.interval import VALID_CLOSED
913

1014

15+
def fallback_performancewarning(version: str | None = None) -> None:
    """
    Emit a PerformanceWarning for falling back to ExtensionArray's
    non-pyarrow method.

    Parameters
    ----------
    version : str, optional
        pyarrow version named in the upgrade hint that is appended to the
        warning message. When None, no upgrade hint is appended.
    """
    msg = "Falling back on a non-pyarrow code path which may decrease performance."
    if version is not None:
        # Only suggest an upgrade when the caller names a version.
        msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
    # The stacklevel is computed by find_stack_level() rather than hard-coded.
    warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
24+
25+
1126
def pyarrow_array_to_numpy_and_mask(arr, dtype: np.dtype):
1227
"""
1328
Convert a primitive pyarrow.Array to a numpy array and boolean mask based

pandas/core/arrays/string_arrow.py

+15
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@
6060
import pyarrow as pa
6161
import pyarrow.compute as pc
6262

63+
from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
64+
6365
ARROW_CMP_FUNCS = {
6466
"eq": pc.equal,
6567
"ne": pc.not_equal,
@@ -331,6 +333,7 @@ def _maybe_convert_setitem_value(self, value):
331333

332334
def isin(self, values):
333335
if pa_version_under2p0:
336+
fallback_performancewarning(version="2")
334337
return super().isin(values)
335338

336339
value_set = [
@@ -437,10 +440,12 @@ def _str_map(
437440

438441
def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
439442
if flags:
443+
fallback_performancewarning()
440444
return super()._str_contains(pat, case, flags, na, regex)
441445

442446
if regex:
443447
if pa_version_under4p0 or case is False:
448+
fallback_performancewarning(version="4")
444449
return super()._str_contains(pat, case, flags, na, regex)
445450
else:
446451
result = pc.match_substring_regex(self._data, pat)
@@ -456,13 +461,15 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
456461

457462
def _str_startswith(self, pat: str, na=None):
458463
if pa_version_under4p0:
464+
fallback_performancewarning(version="4")
459465
return super()._str_startswith(pat, na)
460466

461467
pat = "^" + re.escape(pat)
462468
return self._str_contains(pat, na=na, regex=True)
463469

464470
def _str_endswith(self, pat: str, na=None):
465471
if pa_version_under4p0:
472+
fallback_performancewarning(version="4")
466473
return super()._str_endswith(pat, na)
467474

468475
pat = re.escape(pat) + "$"
@@ -484,6 +491,7 @@ def _str_replace(
484491
or not case
485492
or flags
486493
):
494+
fallback_performancewarning(version="4")
487495
return super()._str_replace(pat, repl, n, case, flags, regex)
488496

489497
func = pc.replace_substring_regex if regex else pc.replace_substring
@@ -494,6 +502,7 @@ def _str_match(
494502
self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
495503
):
496504
if pa_version_under4p0:
505+
fallback_performancewarning(version="4")
497506
return super()._str_match(pat, case, flags, na)
498507

499508
if not pat.startswith("^"):
@@ -504,6 +513,7 @@ def _str_fullmatch(
504513
self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
505514
):
506515
if pa_version_under4p0:
516+
fallback_performancewarning(version="4")
507517
return super()._str_fullmatch(pat, case, flags, na)
508518

509519
if not pat.endswith("$") or pat.endswith("//$"):
@@ -536,6 +546,7 @@ def _str_isnumeric(self):
536546

537547
def _str_isspace(self):
538548
if pa_version_under2p0:
549+
fallback_performancewarning(version="2")
539550
return super()._str_isspace()
540551

541552
result = pc.utf8_is_space(self._data)
@@ -551,6 +562,7 @@ def _str_isupper(self):
551562

552563
def _str_len(self):
553564
if pa_version_under4p0:
565+
fallback_performancewarning(version="4")
554566
return super()._str_len()
555567

556568
result = pc.utf8_length(self._data)
@@ -564,6 +576,7 @@ def _str_upper(self):
564576

565577
def _str_strip(self, to_strip=None):
566578
if pa_version_under4p0:
579+
fallback_performancewarning(version="4")
567580
return super()._str_strip(to_strip)
568581

569582
if to_strip is None:
@@ -574,6 +587,7 @@ def _str_strip(self, to_strip=None):
574587

575588
def _str_lstrip(self, to_strip=None):
576589
if pa_version_under4p0:
590+
fallback_performancewarning(version="4")
577591
return super()._str_lstrip(to_strip)
578592

579593
if to_strip is None:
@@ -584,6 +598,7 @@ def _str_lstrip(self, to_strip=None):
584598

585599
def _str_rstrip(self, to_strip=None):
586600
if pa_version_under4p0:
601+
fallback_performancewarning(version="4")
587602
return super()._str_rstrip(to_strip)
588603

589604
if to_strip is None:

pandas/tests/arrays/string_/test_string.py

+19-4
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,13 @@
22
This module tests the functionality of StringArray and ArrowStringArray.
33
Tests for the str accessors are in pandas/tests/strings/test_string_array.py
44
"""
5+
from contextlib import nullcontext
6+
57
import numpy as np
68
import pytest
79

10+
from pandas.compat import pa_version_under2p0
11+
from pandas.errors import PerformanceWarning
812
import pandas.util._test_decorators as td
913

1014
from pandas.core.dtypes.common import is_dtype_equal
@@ -14,6 +18,13 @@
1418
from pandas.core.arrays.string_arrow import ArrowStringArray
1519

1620

21+
def maybe_perf_warn(using_pyarrow):
    """
    Return the context manager to wrap a call that may hit the fallback path.

    Parameters
    ----------
    using_pyarrow : bool
        Truthy when a PerformanceWarning is expected from the wrapped code.

    Returns
    -------
    context manager
        ``tm.assert_produces_warning`` for a PerformanceWarning whose message
        matches "Falling back" when ``using_pyarrow`` is truthy, otherwise a
        ``nullcontext`` that performs no warning check.
    """
    if using_pyarrow:
        return tm.assert_produces_warning(PerformanceWarning, match="Falling back")
    return nullcontext()
26+
27+
1728
@pytest.fixture
1829
def dtype(string_storage):
1930
return pd.StringDtype(storage=string_storage)
@@ -557,18 +568,22 @@ def test_to_numpy_na_value(dtype, nulls_fixture):
557568
def test_isin(dtype, fixed_now_ts):
    """
    Check ``Series.isin`` on a string Series for several kinds of ``values``:
    plain strings, a missing value, an empty list and a non-string scalar.
    """
    s = pd.Series(["a", "b", None], dtype=dtype)

    # The ``dtype`` fixture is a StringDtype built with an explicit storage,
    # so inspect ``.storage``; comparing the dtype object itself to the bare
    # string "pyarrow" is not a storage check. The fallback warning is only
    # expected for pyarrow storage on pyarrow < 2.
    expect_fallback = dtype.storage == "pyarrow" and pa_version_under2p0

    with maybe_perf_warn(expect_fallback):
        result = s.isin(["a", "c"])
    expected = pd.Series([True, False, False])
    tm.assert_series_equal(result, expected)

    with maybe_perf_warn(expect_fallback):
        result = s.isin(["a", pd.NA])
    expected = pd.Series([True, False, True])
    tm.assert_series_equal(result, expected)

    with maybe_perf_warn(expect_fallback):
        result = s.isin([])
    expected = pd.Series([False, False, False])
    tm.assert_series_equal(result, expected)

    with maybe_perf_warn(expect_fallback):
        result = s.isin(["a", fixed_now_ts])
    expected = pd.Series([True, False, False])
    tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)