From 160ae908a4ef790902878697285950d5db6a4ba5 Mon Sep 17 00:00:00 2001 From: benjamindonnachie <83379521+benjamindonnachie@users.noreply.github.com> Date: Sat, 17 Aug 2024 14:03:26 +0100 Subject: [PATCH 1/6] Update lib.pyi maybe_indices_to_slice to use uint64 Updates maybe_indices_to_slice to use uint64 allowing massive dataframes to be manipulated (see https://github.com/pandas-dev/pandas/issues/59531) --- pandas/_libs/lib.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index daaaacee3487d..de4feaabe3ad5 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -161,7 +161,7 @@ def maybe_booleans_to_slice( ) -> slice | npt.NDArray[np.uint8]: ... def maybe_indices_to_slice( indices: npt.NDArray[np.intp], - max_len: int, + max_len: np.unit64, ) -> slice | npt.NDArray[np.intp]: ... def is_all_arraylike(obj: list) -> bool: ... From 7e53cdfac17f24286cd8c1eaeecf5c7490046ee6 Mon Sep 17 00:00:00 2001 From: benjamindonnachie <83379521+benjamindonnachie@users.noreply.github.com> Date: Sat, 17 Aug 2024 14:12:19 +0100 Subject: [PATCH 2/6] Update lib.pyx maybe_indices_to_slice to use uint64_t Update maybe_indices_to_slice to use uint64_t allowing manipulation of massive data frames (See https://github.com/pandas-dev/pandas/issues/59531) --- pandas/_libs/lib.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 489d4fa111d40..7fd05bcbfbc91 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -494,10 +494,10 @@ def has_only_ints_or_nan(const floating[:] arr) -> bool: return True -def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len): +def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, uint64_t max_len): cdef: Py_ssize_t i, n = len(indices) - intp_t k, vstart, vlast, v + uint64_t k, vstart, vlast, v if n == 0: return slice(0, 0) From 142263cdcb6051ce992ce634247403c9e032e8d9 Mon Sep 17 00:00:00 2001 
From: benjamindonnachie <83379521+benjamindonnachie@users.noreply.github.com> Date: Sat, 17 Aug 2024 16:30:56 +0100 Subject: [PATCH 3/6] Update lib.pyx Revised to use intp_t for max_len in maybe_indices_to_slice. As per conversation with WillAyd revised patch as intp_t is the proper size for an indexer. --- pandas/_libs/lib.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7fd05bcbfbc91..75ab3259ccef6 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -494,10 +494,10 @@ def has_only_ints_or_nan(const floating[:] arr) -> bool: return True -def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, uint64_t max_len): +def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, intp_t max_len): cdef: Py_ssize_t i, n = len(indices) - uint64_t k, vstart, vlast, v + intp_t k, vstart, vlast, v if n == 0: return slice(0, 0) From 7e52467a448fcb06e9a598cd6847ca519f375c6f Mon Sep 17 00:00:00 2001 From: benjamindonnachie <83379521+benjamindonnachie@users.noreply.github.com> Date: Sat, 17 Aug 2024 16:31:55 +0100 Subject: [PATCH 4/6] Update lib.pyi Updated maybe_indices_to_slice to use np.intp for max_len. As per conversation with WillAyd revised patch as intp_t is the proper size for an indexer. --- pandas/_libs/lib.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index de4feaabe3ad5..ebdf78d04379b 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -161,7 +161,7 @@ def maybe_booleans_to_slice( ) -> slice | npt.NDArray[np.uint8]: ... def maybe_indices_to_slice( indices: npt.NDArray[np.intp], - max_len: np.unit64, + max_len: np.intp, ) -> slice | npt.NDArray[np.intp]: ... def is_all_arraylike(obj: list) -> bool: ... 
From 71df7ab791ce10531f4e80fa8442cefe0b5f1ad1 Mon Sep 17 00:00:00 2001 From: benjamindonnachie <83379521+benjamindonnachie@users.noreply.github.com> Date: Sat, 17 Aug 2024 17:08:11 +0100 Subject: [PATCH 5/6] Update v3.0.0.rst Updated whatsnew to reflect fix to maybe_indices_to_slice to address OverflowError. --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f25edd39cf7da..30f132959f20d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -591,6 +591,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) +- Bug in :meth:`maybe_indices_to_slice` max_len was set to an int causing ``OverflowError``: value too large to convert to int when manipulating very large dataframes (:issue:`59531`) - Missing From 3bde5ffad423cba3e3a714c8761e8f13c0e9bd4b Mon Sep 17 00:00:00 2001 From: benjamindonnachie <83379521+benjamindonnachie@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:38:34 +0100 Subject: [PATCH 6/6] Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 30f132959f20d..4970b56a3acaf 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -591,7 +591,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) -- Bug in :meth:`maybe_indices_to_slice` max_len was 
set to an int causing ``OverflowError``: value too large to convert to int when manipulating very large dataframes (:issue:`59531`) +- Bug in :meth:`DataFrame.__getitem__` when slicing a :class:`DataFrame` with many rows raised an ``OverflowError`` (:issue:`59531`) - Missing