From 55b940b869c453f9a9291a5808d2d1613e6bedc3 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 3 Jun 2023 16:18:40 -0400 Subject: [PATCH 1/2] BUG/PERF: DataFrame.isin lossy data conversion --- doc/source/whatsnew/v2.1.0.rst | 2 ++ pandas/core/frame.py | 22 ++++++++++++++-------- pandas/tests/frame/methods/test_isin.py | 8 ++++++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 92124a536fe26..809692901a931 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -310,6 +310,8 @@ Performance improvements - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) - Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`) +- Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`#####`) +- .. --------------------------------------------------------------------------- .. _whatsnew_210.bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d4c2124182ea5..1524197938a81 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11731,15 +11731,21 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: "to be passed to DataFrame.isin(), " f"you passed a '{type(values).__name__}'" ) - # error: Argument 2 to "isin" has incompatible type "Union[Sequence[Any], - # Mapping[Any, Any]]"; expected "Union[Union[ExtensionArray, - # ndarray[Any, Any]], Index, Series]" - res_values = algorithms.isin( - self.values.ravel(), - values, # type: ignore[arg-type] - ) + + def isin_(x): + # error: Argument 2 to "isin" has incompatible type "Union[Series, + # DataFrame, Sequence[Any], Mapping[Any, Any]]"; expected + # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, + # Series], List[Any], range]" + result = algorithms.isin( + x.ravel(), + values, # type: ignore[arg-type] + ) + return result.reshape(x.shape) + + res_values = self._mgr.apply(isin_) result = self._constructor( - res_values.reshape(self.shape), + res_values, self.index, self.columns, copy=False, diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index e924963f588f3..1e527d598ad57 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -217,3 +217,11 @@ def test_isin_read_only(self): result = df.isin(arr) expected = DataFrame([True, True, True]) tm.assert_frame_equal(result, expected) + + def test_isin_not_lossy(self): + # GH ##### + val = 1666880195890293744 + df = DataFrame({"a": [val], "b": [1.0]}) + result = df.isin([val]) + expected = DataFrame({"a": [True], "b": [False]}) + tm.assert_frame_equal(result, expected) From de9c05fc0a117f6bee32a76b51b4d118e41cb912 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 3 Jun 2023 16:23:50 -0400 Subject: [PATCH 2/2] gh refs --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/tests/frame/methods/test_isin.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 809692901a931..db46b82ab90ac 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -301,6 +301,7 @@ Performance improvements - Performance improvement in :class:`Series` reductions (:issue:`52341`) - Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`) - Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`) +- Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`) - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) @@ -310,7 +311,6 @@ Performance improvements - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) - Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`) -- Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`#####`) - .. --------------------------------------------------------------------------- diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index 1e527d598ad57..b4511aad27a93 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -219,7 +219,7 @@ def test_isin_read_only(self): tm.assert_frame_equal(result, expected) def test_isin_not_lossy(self): - # GH ##### + # GH 53514 val = 1666880195890293744 df = DataFrame({"a": [val], "b": [1.0]}) result = df.isin([val])