From 43eb356224b2ef9112964365bc1a7792278aad69 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 5 Jan 2023 22:58:17 +0100 Subject: [PATCH 1/5] ENH: Add fast array equal function for indexers --- pandas/_libs/lib.pyi | 8 ++++++++ pandas/_libs/lib.pyx | 34 ++++++++++++++++++++++++++++++++++ pandas/tests/libs/test_lib.py | 21 +++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 9bc02e90ebb9e..c1e0ef2a22faa 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -240,3 +240,11 @@ def get_reverse_indexer( ) -> npt.NDArray[np.intp]: ... def is_bool_list(obj: list) -> bool: ... def dtypes_all_equal(types: list[DtypeObj]) -> bool: ... +@overload +def array_equal_fast( + left: np.ndarray[np.int64], right: np.ndarray[np.int64] +) -> bool: ... +@overload +def array_equal_fast( + left: np.ndarray[np.int32], right: np.ndarray[np.int32] +) -> bool: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b56cf2a23a45f..0202e9c1a011b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -50,6 +50,7 @@ from numpy cimport ( complex128_t, flatiter, float64_t, + int32_t, int64_t, intp_t, ndarray, @@ -642,6 +643,39 @@ def array_equivalent_object(ndarray left, ndarray right) -> bool: return True +ctypedef fused int6432_t: + int64_t + int32_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def array_equal_fast( + ndarray[int6432_t, ndim=1] left, ndarray[int6432_t, ndim=1] right, +) -> bool: + """ + Perform an element by element comparison on 1-d integer arrays, meant for indexer + comparisons + """ + cdef: + Py_ssize_t i, n = left.size + + if left.size != right.size: + return False + + with nogil: + for i in range(n): + + if left[i] != right[i]: + break + else: + i = i + 1 + + if i != n: + return False + return True + + ctypedef fused ndarr_object: ndarray[object, ndim=1] ndarray[object, ndim=2] diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index fd7c47d47112f..f3d34c64fa0d6 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -243,6 +243,27 @@ def test_get_reverse_indexer(self): expected = np.array([4, 2, 3, 6, 7], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("dtype", ["int64", "int32"]) + def test_array_equal_fast(self, dtype): + # GH# + left = np.arange(1, 100, dtype=dtype) + right = np.arange(1, 100, dtype=dtype) + assert lib.array_equal_fast(left, right) + + @pytest.mark.parametrize("dtype", ["int64", "int32"]) + def test_array_equal_fast_not_equal(self, dtype): + # GH# + left = np.array([1, 2], dtype=dtype) + right = np.array([2, 2], dtype=dtype) + assert not lib.array_equal_fast(left, right) + + @pytest.mark.parametrize("dtype", ["int64", "int32"]) + def test_array_equal_fast_not_equal_shape(self, dtype): + # GH# + left = np.array([1, 2, 3], dtype=dtype) + right = np.array([2, 2], dtype=dtype) + assert not lib.array_equal_fast(left, right) + def test_cache_readonly_preserve_docstrings(): # GH18197 From 3ec714e0e4893c4897b535b8728fdd040154f8a9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 5 Jan 2023 22:59:04 +0100 Subject: [PATCH 2/5] Add gh ref --- pandas/tests/libs/test_lib.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index f3d34c64fa0d6..e352250dc748d 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -245,21 +245,21 @@ def test_get_reverse_indexer(self): @pytest.mark.parametrize("dtype", ["int64", "int32"]) def test_array_equal_fast(self, dtype): - # GH# + # GH#50592 left = np.arange(1, 100, dtype=dtype) right = np.arange(1, 100, dtype=dtype) assert lib.array_equal_fast(left, right) @pytest.mark.parametrize("dtype", ["int64", "int32"]) def test_array_equal_fast_not_equal(self, dtype): - # GH# + # GH#50592 left = np.array([1, 2], dtype=dtype) right = np.array([2, 2], dtype=dtype) assert not lib.array_equal_fast(left, right) @pytest.mark.parametrize("dtype", ["int64", "int32"]) def test_array_equal_fast_not_equal_shape(self, dtype): - # GH# + # GH#50592 left = np.array([1, 2, 3], dtype=dtype) right = np.array([2, 2], dtype=dtype) assert not lib.array_equal_fast(left, right) From c9bf2eaab31e2945836f06bde77c975dff6cecac Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 5 Jan 2023 23:47:10 +0100 Subject: [PATCH 3/5] Fix cython code --- pandas/_libs/lib.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0202e9c1a011b..208fead7014d6 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -659,6 +659,7 @@ def array_equal_fast( """ cdef: Py_ssize_t i, n = left.size + bint ret = True if left.size != right.size: return False @@ -667,13 +668,10 @@ def array_equal_fast( for i in range(n): if left[i] != right[i]: + ret = False break - else: - i = i + 1 - if i != n: - return False - return True + return ret ctypedef fused ndarr_object: From b359e49a6b8591c74fc0cb485cd707b3f2791c1d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 5 Jan 2023 23:57:51 +0100 Subject: [PATCH 4/5] Remove nogil --- pandas/_libs/lib.pyx | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 208fead7014d6..89e02ac0fa86d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -659,19 +659,16 @@ def array_equal_fast( """ cdef: Py_ssize_t i, n = left.size - bint ret = True if left.size != right.size: return False - with nogil: - for i in range(n): + for i in range(n): - if left[i] != right[i]: - ret = False - break + if left[i] != right[i]: + return False - return ret + return True ctypedef fused ndarr_object: From 0145e9ad0c2bd63039bee8136264a400af18d527 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 6 Jan 2023 16:09:46 +0100 Subject: [PATCH 5/5] Fix types --- pandas/_libs/lib.pyi | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index c1e0ef2a22faa..2439082bf7413 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -240,11 +240,6 @@ def get_reverse_indexer( ) -> npt.NDArray[np.intp]: ... def is_bool_list(obj: list) -> bool: ... def dtypes_all_equal(types: list[DtypeObj]) -> bool: ... -@overload -def array_equal_fast( - left: np.ndarray[np.int64], right: np.ndarray[np.int64] -) -> bool: ... -@overload def array_equal_fast( - left: np.ndarray[np.int32], right: np.ndarray[np.int32] + left: np.ndarray, right: np.ndarray # np.ndarray[np.int64, ndim=1] ) -> bool: ...