REF: cython cleanups and optimizations #23382

Closed
wants to merge 13 commits into from
64 changes: 42 additions & 22 deletions pandas/_libs/algos.pyx
@@ -77,7 +77,9 @@ class NegInfinity(object):
__ge__ = lambda self, other: isinstance(other, NegInfinity)


cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[int64_t, ndim=1] unique_deltas(int64_t[:] arr):
"""
Efficiently find the unique first-differences of the given array.

@@ -98,6 +100,8 @@ cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
int ret = 0
list uniques = []

util.require_not_none(arr)

table = kh_init_int64()
kh_resize_int64(table, 10)
for i in range(n - 1):
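The hunk above shows the pattern this PR applies throughout: the `ndarray[int64_t]` argument becomes an `int64_t[:]` typed memoryview, `boundscheck`/`wraparound` are disabled, and an explicit `util.require_not_none` guard is added (presumably because a memoryview argument can still be passed `None` and, with the checks disabled, would fail far less gracefully on first access). For orientation, here is a pure-NumPy sketch of what `unique_deltas` computes; the name and the `np.unique` shortcut are illustrative only, while the Cython kernel collects the deltas in a khash `int64` set (the `kh_init_int64`/`kh_resize_int64` calls) instead of sorting every difference:

```python
import numpy as np

def unique_deltas_ref(arr):
    """Illustrative reference: sorted unique first differences of `arr`."""
    if arr is None:  # mirrors util.require_not_none(arr)
        raise TypeError("arr must not be None")
    arr = np.asarray(arr, dtype=np.int64)
    return np.unique(np.diff(arr))

# unique_deltas_ref([0, 5, 10, 11, 16]) -> array([1, 5])
```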
@@ -151,7 +155,7 @@ def is_lexsorted(list_of_arrays: list) -> bint:

@cython.boundscheck(False)
@cython.wraparound(False)
def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
def groupsort_indexer(int64_t[:] index, Py_ssize_t ngroups):
"""
compute a 1-d indexer that is an ordering of the passed index,
ordered by the groups. This is a reverse of the label
@@ -171,6 +175,8 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
Py_ssize_t i, loc, label, n
ndarray[int64_t] counts, where, result

util.require_not_none(index)

counts = np.zeros(ngroups + 1, dtype=np.int64)
n = len(index)
result = np.zeros(n, dtype=np.int64)
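The truncated docstring above describes a counting sort keyed by group label. A slow pure-Python/NumPy sketch of that contract (illustrative only, not the pandas implementation; label `-1` is treated as the missing-value group and sorted to the front, and `counts[k + 1]` holds the size of group `k`):

```python
import numpy as np

def groupsort_indexer_ref(labels, ngroups):
    """Return (indexer, counts): positions reordered so equal labels are
    contiguous, with NA labels (-1) first."""
    counts = np.zeros(ngroups + 1, dtype=np.int64)
    for lab in labels:                     # histogram of group sizes
        counts[lab + 1] += 1
    where = np.zeros(ngroups + 1, dtype=np.int64)
    where[1:] = np.cumsum(counts)[:-1]     # start offset of each group
    result = np.zeros(len(labels), dtype=np.int64)
    for i, lab in enumerate(labels):       # stable placement pass
        result[where[lab + 1]] = i
        where[lab + 1] += 1
    return result, counts

# groupsort_indexer_ref([1, 0, 1, -1, 0], 2)
#   -> indexer [3, 1, 4, 0, 2], counts [1, 2, 2]
```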
@@ -236,7 +242,7 @@ def nancorr(ndarray[float64_t, ndim=2] mat, bint cov=0, minp=None):
Py_ssize_t i, j, xi, yi, N, K
bint minpv
ndarray[float64_t, ndim=2] result
ndarray[uint8_t, ndim=2] mask
uint8_t[:, :] mask
int64_t nobs = 0
float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor

@@ -301,7 +307,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
ndarray[float64_t, ndim=2] result
ndarray[float64_t, ndim=1] maskedx
ndarray[float64_t, ndim=1] maskedy
ndarray[uint8_t, ndim=2] mask
uint8_t[:, :] mask
int64_t nobs = 0
float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor

@@ -373,7 +379,7 @@ ctypedef fused algos_t:
# TODO: unused; needed?
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef map_indices(ndarray[algos_t] index):
cpdef map_indices(algos_t[:] index):
"""
Produce a dict mapping the values of the input array to their respective
locations.
@@ -387,6 +393,8 @@ cpdef map_indices(ndarray[algos_t] index):
Py_ssize_t i, length
dict result = {}

util.require_not_none(index)

length = len(index)

for i in range(length):
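As the docstring says, this helper (flagged above as possibly unused) just maps each value to its position; in plain Python it amounts to a dict comprehension. A minimal sketch, with the caveat that a later duplicate overwrites an earlier one:

```python
def map_indices_ref(index):
    # {value: position}; a duplicate value keeps the last position seen
    return {val: i for i, val in enumerate(index)}

# map_indices_ref([10, 20, 10]) -> {10: 2, 20: 1}
```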

@cython.boundscheck(False)
@cython.wraparound(False)
def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
def pad(algos_t[:] old, algos_t[:] new, limit=None):
cdef:
Py_ssize_t i, j, nleft, nright
ndarray[int64_t, ndim=1] indexer
algos_t cur, next
int lim, fill_count = 0

util.require_not_none(old)
util.require_not_none(new)

nleft = len(old)
nright = len(new)
indexer = np.empty(nright, dtype=np.int64)
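`pad` builds the forward-fill indexer used for `method='pad'`-style reindexing: assuming both `old` and `new` are sorted, each element of `new` is mapped to the position of the last element of `old` that is less than or equal to it, or `-1` if none exists. A hedged NumPy sketch of that mapping (the `limit` bookkeeping of the real kernel is deliberately left out):

```python
import numpy as np

def pad_indexer_ref(old, new):
    """Forward-fill indexer sketch: last position i in `old` with old[i] <= x."""
    old = np.asarray(old)
    new = np.asarray(new)
    return (np.searchsorted(old, new, side="right") - 1).astype(np.int64)

# pad_indexer_ref([0, 5, 10], [-1, 0, 3, 5, 12]) -> [-1, 0, 0, 1, 2]
```

`backfill` further down is the mirror image: each element of `new` maps to the first element of `old` that is greater than or equal to it.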
@@ -475,14 +486,15 @@ pad_bool = pad["uint8_t"]

@cython.boundscheck(False)
@cython.wraparound(False)
def pad_inplace(ndarray[algos_t] values,
ndarray[uint8_t, cast=True] mask,
limit=None):
def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None):
cdef:
Py_ssize_t i, N
algos_t val
int lim, fill_count = 0

util.require_not_none(values)
util.require_not_none(mask)

N = len(values)

# GH#2778
@@ -521,14 +533,15 @@ pad_inplace_bool = pad_inplace["uint8_t"]

@cython.boundscheck(False)
@cython.wraparound(False)
def pad_2d_inplace(ndarray[algos_t, ndim=2] values,
ndarray[uint8_t, ndim=2] mask,
limit=None):
def pad_2d_inplace(algos_t[:, :] values, uint8_t[:, :] mask, limit=None):
cdef:
Py_ssize_t i, j, N, K
algos_t val
int lim, fill_count = 0

util.require_not_none(values)
util.require_not_none(mask)

K, N = (<object> values).shape

# GH#2778
@@ -595,13 +608,16 @@ D

@cython.boundscheck(False)
@cython.wraparound(False)
def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
def backfill(algos_t[:] old, algos_t[:] new, limit=None):
cdef:
Py_ssize_t i, j, nleft, nright
ndarray[int64_t, ndim=1] indexer
algos_t cur, prev
int lim, fill_count = 0

util.require_not_none(old)
util.require_not_none(new)

nleft = len(old)
nright = len(new)
indexer = np.empty(nright, dtype=np.int64)
@@ -674,14 +690,15 @@ backfill_bool = backfill["uint8_t"]

@cython.boundscheck(False)
@cython.wraparound(False)
def backfill_inplace(ndarray[algos_t] values,
ndarray[uint8_t, cast=True] mask,
limit=None):
def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None):
cdef:
Py_ssize_t i, N
algos_t val
int lim, fill_count = 0

util.require_not_none(values)
util.require_not_none(mask)

N = len(values)

# GH#2778
@@ -720,14 +737,15 @@ backfill_inplace_bool = backfill_inplace["uint8_t"]

@cython.boundscheck(False)
@cython.wraparound(False)
def backfill_2d_inplace(ndarray[algos_t, ndim=2] values,
ndarray[uint8_t, ndim=2] mask,
limit=None):
def backfill_2d_inplace(algos_t[:, :] values, uint8_t[:, :] mask, limit=None):
cdef:
Py_ssize_t i, j, N, K
algos_t val
int lim, fill_count = 0

util.require_not_none(values)
util.require_not_none(mask)

K, N = (<object> values).shape

# GH#2778
@@ -768,14 +786,16 @@ backfill_2d_inplace_bool = backfill_2d_inplace["uint8_t"]

@cython.wraparound(False)
@cython.boundscheck(False)
def arrmap(ndarray[algos_t] index, object func):
def arrmap(algos_t[:] index, object func):
cdef:
Py_ssize_t length = index.shape[0]
Py_ssize_t i = 0
ndarray[object] result = np.empty(length, dtype=np.object_)
object[:] result = np.empty(length, dtype=np.object_)

from pandas._libs.lib import maybe_convert_objects

util.require_not_none(index)

for i in range(length):
result[i] = func(index[i])


@cython.boundscheck(False)
@cython.wraparound(False)
def is_monotonic(ndarray[algos_t] arr, bint timelike):
def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
"""
Returns
-------
Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/algos_common_helper.pxi.in
@@ -29,7 +29,7 @@ def get_dispatch(dtypes):
@cython.boundscheck(False)
@cython.wraparound(False)
def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr,
ndarray[{{dest_type}}, ndim=2] out,
{{dest_type}}[:, :] out,
Py_ssize_t periods, int axis):
cdef:
Py_ssize_t i, j, sx, sy
@@ -72,8 +72,8 @@ def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr,


def put2d_{{name}}_{{dest_name}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
ndarray[int64_t] indexer, Py_ssize_t loc,
ndarray[{{dest_type}}] out):
int64_t[:] indexer, Py_ssize_t loc,
{{dest_type}}[:, :] out):
cdef:
Py_ssize_t i, j, k

41 changes: 25 additions & 16 deletions pandas/_libs/groupby.pyx
@@ -14,7 +14,7 @@ from numpy cimport (ndarray,
cnp.import_array()


from util cimport numeric, get_nat
from util cimport numeric, get_nat, require_not_none

from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN,
TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE)
@@ -98,21 +98,24 @@ cdef inline float64_t kth_smallest_c(float64_t* a,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_median_float64(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
def group_median_float64(float64_t[:, :] out,
int64_t[:] counts,
ndarray[float64_t, ndim=2] values,
ndarray[int64_t] labels,
int64_t[:] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, ngroups, size
ndarray[int64_t] _counts
ndarray data
int64_t[:] _counts
ndarray[float64_t, ndim=2] data
float64_t* ptr

assert min_count == -1, "'min_count' only used in add and prod"
require_not_none(counts)
require_not_none(out)
require_not_none(labels)

ngroups = len(counts)
N, K = (<object> values).shape
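Per its docstring, `group_median_float64` aggregates along axis 0 only: rows of `values` are bucketed by `labels` (with `-1` meaning "drop") and one median per group and column is written into `out`, skipping missing values within a group as groupby.median does. A rough NumPy sketch of that contract only; the Cython kernel works group-by-group on pointers into a regrouped copy of the data, which this does not attempt to reproduce:

```python
import numpy as np

def group_median_ref(values, labels, ngroups):
    """values: (N, K) float64, labels: (N,) int64 -> (ngroups, K) medians."""
    values = np.asarray(values, dtype=np.float64)
    labels = np.asarray(labels, dtype=np.int64)
    out = np.full((ngroups, values.shape[1]), np.nan)
    for g in range(ngroups):
        grp = values[labels == g]
        if grp.size:
            out[g] = np.nanmedian(grp, axis=0)  # NaNs in a group are skipped
    return out
```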
@@ -217,7 +220,7 @@ def group_cumsum(numeric[:, :] out,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
int ngroups, int periods):
cdef:
Py_ssize_t N, i, j, ii
@@ -269,8 +272,8 @@ def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,

@cython.wraparound(False)
@cython.boundscheck(False)
def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
ndarray[uint8_t] mask, object direction,
def group_fillna_indexer(int64_t[:] out, ndarray[int64_t] labels,
uint8_t[:] mask, object direction,
int64_t limit):
"""Indexes how to fill values forwards or backwards within a group

@@ -291,7 +294,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
"""
cdef:
Py_ssize_t i, N
ndarray[int64_t] sorted_labels
int64_t[:] sorted_labels
int64_t idx, curr_fill_idx=-1, filled_vals=0

N = len(out)

sorted_labels = np.argsort(labels, kind='mergesort').astype(
np.int64, copy=False)

if direction == 'bfill':
sorted_labels = sorted_labels[::-1]
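The visible part of the body shows the trick: positions are ordered by a stable `argsort` of the labels so that each group is contiguous, and `'bfill'` is handled by simply walking that order in reverse. The docstring's contract, restated as a slow pure-Python sketch of the `'ffill'` direction (hypothetical helper, not the pandas code; `limit` caps the number of consecutive positions filled from the same value, and label `-1` is treated as unfillable in this sketch):

```python
import numpy as np

def group_ffill_indexer_ref(labels, mask, limit):
    """out[i] = index to copy from (itself if present), or -1 if unfillable."""
    out = np.full(len(labels), -1, dtype=np.int64)
    last = {}     # group label -> index of the most recent non-missing value
    streak = {}   # group label -> consecutive fills since that value
    for i, (lab, missing) in enumerate(zip(labels, mask)):
        if lab == -1:                 # no group: left as -1 in this sketch
            continue
        if not missing:               # value present: point at itself, reset
            out[i] = i
            last[lab] = i
            streak[lab] = 0
        elif lab in last and streak[lab] < limit:
            out[i] = last[lab]
            streak[lab] += 1
    return out
```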


@cython.boundscheck(False)
@cython.wraparound(False)
def group_any_all(ndarray[uint8_t] out,
ndarray[int64_t] labels,
ndarray[uint8_t] values,
ndarray[uint8_t] mask,
def group_any_all(uint8_t[:] out,
int64_t[:] labels,
uint8_t[:] values,
uint8_t[:] mask,
object val_test,
bint skipna):
"""Aggregated boolean values to show truthfulness of group elements
@@ -353,10 +357,15 @@ def group_any_all(ndarray[uint8_t] out,
The returned values will either be 0 or 1 (False or True, respectively).
"""
cdef:
Py_ssize_t i, N=len(labels)
Py_ssize_t i, N = len(labels)
int64_t lab
uint8_t flag_val

require_not_none(out)
require_not_none(labels)
require_not_none(values)
require_not_none(mask)

if val_test == 'all':
# Because the 'all' value of an empty iterable in Python is True we can
# start with an array full of ones and set to zero when a False value
else:
raise ValueError("'bool_func' must be either 'any' or 'all'!")

out.fill(1 - flag_val)
out[:] = 1 - flag_val

with nogil:
for i in range(N):
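Two things stand out in this final hunk: the initialization switches from `out.fill(1 - flag_val)` to the slice assignment `out[:] = 1 - flag_val`, since `fill` is an ndarray method that a `uint8_t[:]` memoryview does not provide, and the new `require_not_none` guards run before the `nogil` loop touches the buffers. The kernel's contract, restated as a NumPy sketch (illustrative only, with an explicit `ngroups` argument standing in for the preallocated `out`):

```python
import numpy as np

def group_any_all_ref(values, labels, mask, val_test, skipna, ngroups):
    """One 0/1 flag per group: 'any'/'all' over that group's truth values."""
    flag_val = 0 if val_test == "all" else 1
    out = np.full(ngroups, 1 - flag_val, dtype=np.uint8)  # identity element
    for lab, val, missing in zip(labels, values, mask):
        if lab < 0 or (skipna and missing):
            continue
        if val == flag_val:           # a deciding value flips the group flag
            out[lab] = flag_val
    return out

# group_any_all_ref([1, 0, 1], [0, 0, 1], [0, 0, 0], "all", True, 2) -> [0, 1]
```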