From ac63e765bf5ed1efb4a315ae4ec8bf7f89c9600a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 10 Dec 2018 08:46:21 -0600 Subject: [PATCH 1/3] BUG/PERF: Use EA in Index.get_value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid converting the ExtensionArray to an ndarray. Before ``` pandas/tests/arrays/categorical/test_indexing.py F [100%] ========================================================================= FAILURES ========================================================================= ______________________________________________________________________ test_series_at ______________________________________________________________________ def test_series_at(): arr = NonCoercaibleCategorical(['a', 'b', 'c']) ser = Series(arr) > result = ser.at[0] pandas/tests/arrays/categorical/test_indexing.py:158: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ pandas/core/indexing.py:2266: in __getitem__ return self.obj._get_value(*key, takeable=self._takeable) pandas/core/series.py:1078: in _get_value return self.index.get_value(self._values, label) pandas/core/indexes/base.py:4303: in get_value s = com.values_from_object(series) pandas/_libs/lib.pyx:82: in pandas._libs.lib.values_from_object obj = func() pandas/core/arrays/categorical.py:1509: in get_values return np.array(self) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = <[ValueError("I cannot be converted.") raised in repr()] NonCoercaibleCategorical object at 0x113d8b6a0>, dtype = None def __array__(self, dtype=None): > raise ValueError("I cannot be converted.") E ValueError: I cannot be converted. 
``` Perf (before this change, then after): ``` In [3]: a = pd.Series(pd.Categorical(np.random.choice(list(string.ascii_letters[:10]), 10_000))) In [4]: %timeit a.at[0] 143 µs ± 4.86 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) In [3]: a = pd.Series(pd.Categorical(np.random.choice(list(string.ascii_letters[:10]), 10_000))) In [4]: %timeit a.at[0] 11.1 µs ± 95.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each) ``` --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/indexes/base.py | 3 ++- pandas/tests/arrays/categorical/test_indexing.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 6b8d548251061..313b89f0bab36 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1209,6 +1209,7 @@ Performance Improvements - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) +- Improved performance of :meth:`Series.at` and :meth:`Index.get_value` for Extension Arrays values (e.g. :class:`Categorical`) (:issue:``) - Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` is likewise much faster (:issue:`21369`, :issue:`21508`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fc5f6758f9e06..811d66c74ed15 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4281,7 +4281,8 @@ def get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. 
DatetimeIndex - s = getattr(series, '_values', None) + # Things like `Series._get_value` (via .at) pass the EA directly here. + s = getattr(series, '_values', series) if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): # GH 20882, 21257 # Unify Index and ExtensionArray treatment diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 8df5728f7d895..4c3f3d68b01c0 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -145,3 +145,15 @@ def test_mask_with_boolean_raises(index): with pytest.raises(ValueError, match='NA / NaN'): s[idx] + + +class NonCoercaibleCategorical(Categorical): + def __array__(self, dtype=None): + raise ValueError("I cannot be converted.") + + +def test_series_at(): + arr = NonCoercaibleCategorical(['a', 'b', 'c']) + ser = Series(arr) + result = ser.at[0] + assert result == 'a' From 7a6a16d78cfdac8434d0d1bcda4660d423e0eff5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 10 Dec 2018 08:51:32 -0600 Subject: [PATCH 2/3] issue number --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 313b89f0bab36..11673f332876f 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1209,7 +1209,7 @@ Performance Improvements - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) -- Improved performance of :meth:`Series.at` and :meth:`Index.get_value` for Extension Arrays values (e.g. 
:class:`Categorical`) (:issue:``) +- Improved performance of :meth:`Series.at` and :meth:`Index.get_value` for Extension Arrays values (e.g. :class:`Categorical`) (:issue:`24204`) - Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` is likewise much faster (:issue:`21369`, :issue:`21508`) From 49c596757e7f2caf563705e7296a14c751de9e5e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 10 Dec 2018 10:11:25 -0600 Subject: [PATCH 3/3] Use monkeypatch --- .../tests/arrays/categorical/test_indexing.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 4c3f3d68b01c0..8b31bb0534368 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -147,13 +147,28 @@ def test_mask_with_boolean_raises(index): s[idx] -class NonCoercaibleCategorical(Categorical): - def __array__(self, dtype=None): +@pytest.fixture +def non_coercible_categorical(monkeypatch): + """ + Monkeypatch Categorical.__array__ to ensure no implicit conversion. + + Raises + ------ + ValueError + When Categorical.__array__ is called. + """ + # TODO(Categorical): identify other places where this may be + # useful and move to a conftest.py + def array(self, dtype=None): raise ValueError("I cannot be converted.") + with monkeypatch.context() as m: + m.setattr(Categorical, "__array__", array) + yield -def test_series_at(): - arr = NonCoercaibleCategorical(['a', 'b', 'c']) + +def test_series_at(non_coercible_categorical): + arr = Categorical(['a', 'b', 'c']) ser = Series(arr) result = ser.at[0] assert result == 'a'