pandas-dev · lithomas1 · Aug 11, 2023 · Aug 11, 2023 · Aug 11, 2023
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
@@ -41,7 +41,7 @@
     // pip (with all the conda available packages installed first,
     // followed by the pip installed packages).
     "matrix": {
-        "Cython": ["3.0.0"],
+        "Cython": ["0.29.33"],
         "matplotlib": [],
         "sqlalchemy": [],
         "scipy": [],

@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 

@@ -7,7 +7,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 

@@ -7,7 +7,7 @@ dependencies:
   # build dependencies
   - versioneer[toml]
   - meson[ninja]=1.0.1
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson-python=0.13.1
 
   # test dependencies

@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 

@@ -8,7 +8,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 

@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 

@@ -9,7 +9,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 

@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -873,7 +873,6 @@ Other
 - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`)
 - Bug in :meth:`period_range` the default behavior when freq was not passed as an argument was incorrect(:issue:`53687`)
 - Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`)
-- The minimum version of Cython needed to compile pandas is now ``3.0.0`` (:issue:`54335`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.contributors:

diff --git a/environment.yml b/environment.yml
@@ -8,7 +8,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython=3.0.0
+  - cython=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -998,7 +998,8 @@ def rank_1d(
 
     N = len(values)
     if labels is not None:
-        assert len(labels) == N
+        # TODO(cython3): cast won't be necessary (#2992)
+        assert <Py_ssize_t>len(labels) == N
     out = np.empty(N)
     grp_sizes = np.ones(N, dtype=np.int64)
 
@@ -1087,7 +1088,8 @@ cdef void rank_sorted_1d(
     float64_t[::1] out,
     int64_t[::1] grp_sizes,
     const intp_t[:] sort_indexer,
-    const numeric_object_t[:] masked_vals,
+    # TODO(cython3): make const (https://github.com/cython/cython/issues/3222)
+    numeric_object_t[:] masked_vals,
     const uint8_t[:] mask,
     bint check_mask,
     Py_ssize_t N,
@@ -1142,7 +1144,108 @@ cdef void rank_sorted_1d(
     # array that we sorted previously, which gives us the location of
     # that sorted value for retrieval back from the original
     # values / masked_vals arrays
-    with gil(numeric_object_t is object):
+    # TODO(cython3): de-duplicate once cython supports conditional nogil
+    if numeric_object_t is object:
+        with gil:
+            for i in range(N):
+                at_end = i == N - 1
+
+                # dups and sum_ranks will be incremented each loop where
+                # the value / group remains the same, and should be reset
+                # when either of those change. Used to calculate tiebreakers
+                dups += 1
+                sum_ranks += i - grp_start + 1
+
+                next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
+                                                   masked_vals[sort_indexer[i+1]])
+
+                # We'll need this check later anyway to determine group size, so just
+                # compute it here since shortcircuiting won't help
+                group_changed = at_end or (check_labels and
+                                           (labels[sort_indexer[i]]
+                                            != labels[sort_indexer[i+1]]))
+
+                # Update out only when there is a transition of values or labels.
+                # When a new value or group is encountered, go back #dups steps
+                # (the number of occurrences of the current value) and assign the
+                # ranks based on the starting index of the current group (grp_start)
+                # and the current index
+                if (next_val_diff or group_changed or (check_mask and
+                                                       (mask[sort_indexer[i]]
+                                                        ^ mask[sort_indexer[i+1]]))):
+
+                    # If keep_na, check for missing values and assign back
+                    # to the result where appropriate
+                    if keep_na and check_mask and mask[sort_indexer[i]]:
+                        grp_na_count = dups
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = NaN
+                    elif tiebreak == TIEBREAK_AVERAGE:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = sum_ranks / <float64_t>dups
+                    elif tiebreak == TIEBREAK_MIN:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = i - grp_start - dups + 2
+                    elif tiebreak == TIEBREAK_MAX:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = i - grp_start + 1
+
+                    # With n as the previous rank in the group and m as the number
+                    # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
+                    # then rankings should be n + 1, n + 2 ... n + m
+                    elif tiebreak == TIEBREAK_FIRST:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = j + 1 - grp_start
+
+                    # If TIEBREAK_FIRST and descending, the ranking should be
+                    # n + m, n + (m - 1) ... n + 1. This is equivalent to
+                    # (i - dups + 1) + (i - j + 1) - grp_start
+                    elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
+                    elif tiebreak == TIEBREAK_DENSE:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = grp_vals_seen
+
+                    # Look forward to the next value (using the sorting in
+                    # sort_indexer). If the value does not equal the current
+                    # value then we need to reset the dups and sum_ranks, knowing
+                    # that a new value is coming up. The conditional also needs
+                    # to handle nan equality and the end of iteration. If group
+                    # changes we do not record seeing a new value in the group
+                    if not group_changed and (next_val_diff or (check_mask and
+                                              (mask[sort_indexer[i]]
+                                               ^ mask[sort_indexer[i+1]]))):
+                        dups = sum_ranks = 0
+                        grp_vals_seen += 1
+
+                    # Similar to the previous conditional, check now if we are
+                    # moving to a new group. If so, keep track of the index where
+                    # the new group occurs, so the tiebreaker calculations can
+                    # decrement that from their position. Fill in the size of each
+                    # group encountered (used by pct calculations later). Also be
+                    # sure to reset any of the items helping to calculate dups
+                    if group_changed:
+
+                        # If not dense tiebreak, group size used to compute
+                        # percentile will be # of non-null elements in group
+                        if tiebreak != TIEBREAK_DENSE:
+                            grp_size = i - grp_start + 1 - grp_na_count
+
+                        # Otherwise, it will be the number of distinct values
+                        # in the group, subtracting 1 if NaNs are present
+                        # since that is a distinct value we shouldn't count
+                        else:
+                            grp_size = grp_vals_seen - (grp_na_count > 0)
+
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[sort_indexer[j]] = grp_size
+
+                        dups = sum_ranks = 0
+                        grp_na_count = 0
+                        grp_start = i + 1
+                        grp_vals_seen = 1
+    else:
         for i in range(N):
             at_end = i == N - 1
 
@@ -1371,18 +1474,16 @@ ctypedef fused out_t:
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def diff_2d(
-    # TODO: cython bug (post Cython 3) prevents update to "const diff_t[:, :] arr"
-    ndarray[diff_t, ndim=2] arr,
-    out_t[:, :] out,
+    ndarray[diff_t, ndim=2] arr,  # TODO(cython3) update to "const diff_t[:, :] arr"
+    ndarray[out_t, ndim=2] out,
     Py_ssize_t periods,
     int axis,
     bint datetimelike=False,
 ):
     cdef:
         Py_ssize_t i, j, sx, sy, start, stop
         bint f_contig = arr.flags.f_contiguous
-        # TODO: change to this when arr becomes a memoryview
-        # bint f_contig = arr.is_f_contig()
+        # bint f_contig = arr.is_f_contig()  # TODO(cython3)
         diff_t left, right
 
     # Disable for unsupported dtype combinations,

diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi
@@ -26,7 +26,7 @@ class NDArrayBacked:
     def size(self) -> int: ...
     @property
     def nbytes(self) -> int: ...
-    def copy(self, order=...): ...
+    def copy(self): ...
     def delete(self, loc, axis=...): ...
     def swapaxes(self, axis1, axis2): ...
     def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ...

diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx
@@ -126,7 +126,8 @@ cdef class NDArrayBacked:
 
     @property
     def size(self) -> int:
-        return self._ndarray.size
+        # TODO(cython3): use self._ndarray.size
+        return cnp.PyArray_SIZE(self._ndarray)
 
     @property
     def nbytes(self) -> int:

diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
@@ -44,6 +44,7 @@ def group_fillna_indexer(
     labels: np.ndarray,  # ndarray[int64_t]
     sorted_labels: npt.NDArray[np.intp],
     mask: npt.NDArray[np.uint8],
+    direction: Literal["ffill", "bfill"],
     limit: int,  # int64_t
     dropna: bool,
 ) -> None: ...
@@ -54,7 +55,7 @@ def group_any_all(
     mask: np.ndarray,  # const uint8_t[::1]
     val_test: Literal["any", "all"],
     skipna: bool,
-    result_mask: np.ndarray | None,
+    nullable: bool,
 ) -> None: ...
 def group_sum(
     out: np.ndarray,  # complexfloatingintuint_t[:, ::1]