Skip to content

Commit 6a5c34c

Browse files
jbrockmendel authored and jreback committed
PERF: cython optimizations (#23477)
1 parent ce8e05d commit 6a5c34c

13 files changed

+55
-47
lines changed

pandas/_libs/algos.pyx

+2-2
Original file line number | Diff line number | Diff line change
@@ -409,7 +409,7 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
409409
nleft = len(old)
410410
nright = len(new)
411411
indexer = np.empty(nright, dtype=np.int64)
412-
indexer.fill(-1)
412+
indexer[:] = -1
413413

414414
if limit is None:
415415
lim = nright
@@ -607,7 +607,7 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
607607
nleft = len(old)
608608
nright = len(new)
609609
indexer = np.empty(nright, dtype=np.int64)
610-
indexer.fill(-1)
610+
indexer[:] = -1
611611

612612
if limit is None:
613613
lim = nright

pandas/_libs/algos_rank_helper.pxi.in

+2-2
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,9 @@ Template for each `dtype` helper function for rank
44
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
55
"""
66

7-
#----------------------------------------------------------------------
7+
# ----------------------------------------------------------------------
88
# rank_1d, rank_2d
9-
#----------------------------------------------------------------------
9+
# ----------------------------------------------------------------------
1010

1111
{{py:
1212

pandas/_libs/groupby.pyx

+1-1
Original file line number | Diff line number | Diff line change
@@ -370,7 +370,7 @@ def group_any_all(ndarray[uint8_t] out,
370370
else:
371371
raise ValueError("'bool_func' must be either 'any' or 'all'!")
372372

373-
out.fill(1 - flag_val)
373+
out[:] = 1 - flag_val
374374

375375
with nogil:
376376
for i in range(N):

pandas/_libs/groupby_helper.pxi.in

+11-11
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,9 @@ cdef extern from "numpy/npy_math.h":
88
double NAN "NPY_NAN"
99
_int64_max = np.iinfo(np.int64).max
1010

11-
#----------------------------------------------------------------------
11+
# ----------------------------------------------------------------------
1212
# group_add, group_prod, group_var, group_mean, group_ohlc
13-
#----------------------------------------------------------------------
13+
# ----------------------------------------------------------------------
1414

1515
{{py:
1616

@@ -246,7 +246,7 @@ def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out,
246246
if K > 1:
247247
raise NotImplementedError("Argument 'values' must have only "
248248
"one dimension")
249-
out.fill(np.nan)
249+
out[:] = np.nan
250250

251251
with nogil:
252252
for i in range(N):
@@ -629,10 +629,10 @@ def group_max(ndarray[groupby_t, ndim=2] out,
629629
maxx = np.empty_like(out)
630630
if groupby_t is int64_t:
631631
# Note: evaluated at compile-time
632-
maxx.fill(-_int64_max)
632+
maxx[:] = -_int64_max
633633
nan_val = iNaT
634634
else:
635-
maxx.fill(-np.inf)
635+
maxx[:] = -np.inf
636636
nan_val = NAN
637637

638638
N, K = (<object>values).shape
@@ -691,10 +691,10 @@ def group_min(ndarray[groupby_t, ndim=2] out,
691691

692692
minx = np.empty_like(out)
693693
if groupby_t is int64_t:
694-
minx.fill(_int64_max)
694+
minx[:] = _int64_max
695695
nan_val = iNaT
696696
else:
697-
minx.fill(np.inf)
697+
minx[:] = np.inf
698698
nan_val = NAN
699699

700700
N, K = (<object>values).shape
@@ -747,9 +747,9 @@ def group_cummin(ndarray[groupby_t, ndim=2] out,
747747
N, K = (<object>values).shape
748748
accum = np.empty_like(values)
749749
if groupby_t is int64_t:
750-
accum.fill(_int64_max)
750+
accum[:] = _int64_max
751751
else:
752-
accum.fill(np.inf)
752+
accum[:] = np.inf
753753

754754
with nogil:
755755
for i in range(N):
@@ -795,9 +795,9 @@ def group_cummax(ndarray[groupby_t, ndim=2] out,
795795
N, K = (<object>values).shape
796796
accum = np.empty_like(values)
797797
if groupby_t is int64_t:
798-
accum.fill(-_int64_max)
798+
accum[:] = -_int64_max
799799
else:
800-
accum.fill(-np.inf)
800+
accum[:] = -np.inf
801801

802802
with nogil:
803803
for i in range(N):

pandas/_libs/join.pyx

+1-1
Original file line number | Diff line number | Diff line change
@@ -212,7 +212,7 @@ def _get_result_indexer(sorter, indexer):
212212
else:
213213
# length-0 case
214214
res = np.empty(len(indexer), dtype=np.int64)
215-
res.fill(-1)
215+
res[:] = -1
216216

217217
return res
218218

pandas/_libs/lib.pyx

+3-3
Original file line number | Diff line number | Diff line change
@@ -347,7 +347,7 @@ def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length):
347347
int64_t idx
348348

349349
rev_indexer = np.empty(length, dtype=np.int64)
350-
rev_indexer.fill(-1)
350+
rev_indexer[:] = -1
351351
for i in range(n):
352352
idx = indexer[i]
353353
if idx != -1:
@@ -1670,7 +1670,7 @@ cdef class TimedeltaValidator(TemporalValidator):
16701670

16711671

16721672
# TODO: Not used outside of tests; remove?
1673-
def is_timedelta_array(values: ndarray) -> bint:
1673+
def is_timedelta_array(values: ndarray) -> bool:
16741674
cdef:
16751675
TimedeltaValidator validator = TimedeltaValidator(len(values),
16761676
skipna=True)
@@ -1683,7 +1683,7 @@ cdef class Timedelta64Validator(TimedeltaValidator):
16831683

16841684

16851685
# TODO: Not used outside of tests; remove?
1686-
def is_timedelta64_array(values: ndarray) -> bint:
1686+
def is_timedelta64_array(values: ndarray) -> bool:
16871687
cdef:
16881688
Timedelta64Validator validator = Timedelta64Validator(len(values),
16891689
skipna=True)

pandas/_libs/missing.pyx

+2-2
Original file line number | Diff line number | Diff line change
@@ -278,14 +278,14 @@ def isnaobj2d_old(ndarray arr):
278278
return result.view(np.bool_)
279279

280280

281-
cpdef bint isposinf_scalar(object val):
281+
def isposinf_scalar(val: object) -> bool:
282282
if util.is_float_object(val) and val == INF:
283283
return True
284284
else:
285285
return False
286286

287287

288-
cpdef bint isneginf_scalar(object val):
288+
def isneginf_scalar(val: object) -> bool:
289289
if util.is_float_object(val) and val == NEGINF:
290290
return True
291291
else:

pandas/_libs/sparse.pyx

+4-4
Original file line number | Diff line number | Diff line change
@@ -221,7 +221,7 @@ cdef class IntIndex(SparseIndex):
221221

222222
n = len(indexer)
223223
results = np.empty(n, dtype=np.int32)
224-
results.fill(-1)
224+
results[:] = -1
225225

226226
if self.npoints == 0:
227227
return results
@@ -250,9 +250,9 @@ cdef class IntIndex(SparseIndex):
250250
sinds = self.indices
251251

252252
result = np.empty(other.npoints, dtype=np.float64)
253-
result.fill(fill_value)
253+
result[:] = fill_value
254254

255-
for 0 <= i < other.npoints:
255+
for i in range(other.npoints):
256256
while oinds[i] > sinds[j] and j < self.npoints:
257257
j += 1
258258

@@ -582,7 +582,7 @@ cdef class BlockIndex(SparseIndex):
582582

583583
n = len(indexer)
584584
results = np.empty(n, dtype=np.int32)
585-
results.fill(-1)
585+
results[:] = -1
586586

587587
if self.npoints == 0:
588588
return results

pandas/_libs/tslibs/conversion.pyx

+9-7
Original file line number | Diff line number | Diff line change
@@ -869,10 +869,10 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
869869
"""
870870
cdef:
871871
ndarray[int64_t] trans
872-
int64_t[:] deltas, idx_shifted
872+
int64_t[:] deltas, idx_shifted, idx_shifted_left, idx_shifted_right
873873
ndarray ambiguous_array
874874
Py_ssize_t i, idx, pos, ntrans, n = len(vals)
875-
Py_ssize_t delta_idx_offset, delta_idx
875+
Py_ssize_t delta_idx_offset, delta_idx, pos_left, pos_right
876876
int64_t *tdata
877877
int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins
878878
ndarray[int64_t] result, result_a, result_b, dst_hours
@@ -927,8 +927,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
927927
# result_a) or right of the DST transition (store in result_b)
928928
result_a = np.empty(n, dtype=np.int64)
929929
result_b = np.empty(n, dtype=np.int64)
930-
result_a.fill(NPY_NAT)
931-
result_b.fill(NPY_NAT)
930+
result_a[:] = NPY_NAT
931+
result_b[:] = NPY_NAT
932932

933933
idx_shifted_left = (np.maximum(0, trans.searchsorted(
934934
vals - DAY_NS, side='right') - 1)).astype(np.int64)
@@ -952,7 +952,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
952952

953953
if infer_dst:
954954
dst_hours = np.empty(n, dtype=np.int64)
955-
dst_hours.fill(NPY_NAT)
955+
dst_hours[:] = NPY_NAT
956956

957957
# Get the ambiguous hours (given the above, these are the hours
958958
# where result_a != result_b and neither of them are NAT)
@@ -1045,8 +1045,10 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
10451045
return result
10461046

10471047

1048-
cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n):
1049-
cdef Py_ssize_t pivot, left = 0, right = n
1048+
cdef inline Py_ssize_t bisect_right_i8(int64_t *data,
1049+
int64_t val, Py_ssize_t n):
1050+
cdef:
1051+
Py_ssize_t pivot, left = 0, right = n
10501052

10511053
assert n >= 1
10521054

pandas/_libs/tslibs/frequencies.pyx

+8-4
Original file line number | Diff line number | Diff line change
@@ -154,8 +154,7 @@ cpdef get_freq_code(freqstr):
154154
freqstr = (freqstr.rule_code, freqstr.n)
155155

156156
if isinstance(freqstr, tuple):
157-
if (is_integer_object(freqstr[0]) and
158-
is_integer_object(freqstr[1])):
157+
if is_integer_object(freqstr[0]) and is_integer_object(freqstr[1]):
159158
# e.g., freqstr = (2000, 1)
160159
return freqstr
161160
else:
@@ -171,7 +170,7 @@ cpdef get_freq_code(freqstr):
171170
return code, stride
172171

173172
if is_integer_object(freqstr):
174-
return (freqstr, 1)
173+
return freqstr, 1
175174

176175
base, stride = _base_and_stride(freqstr)
177176
code = _period_str_to_code(base)
@@ -183,6 +182,11 @@ cpdef _base_and_stride(freqstr):
183182
"""
184183
Return base freq and stride info from string representation
185184
185+
Returns
186+
-------
187+
base : str
188+
stride : int
189+
186190
Examples
187191
--------
188192
_freq_and_stride('5Min') -> 'Min', 5
@@ -201,7 +205,7 @@ cpdef _base_and_stride(freqstr):
201205

202206
base = groups.group(2)
203207

204-
return (base, stride)
208+
return base, stride
205209

206210

207211
cpdef _period_str_to_code(freqstr):

pandas/_libs/tslibs/offsets.pyx

+2-1
Original file line number | Diff line number | Diff line change
@@ -532,7 +532,8 @@ cdef inline int month_add_months(npy_datetimestruct dts, int months) nogil:
532532
New month number after shifting npy_datetimestruct
533533
number of months.
534534
"""
535-
cdef int new_month = (dts.month + months) % 12
535+
cdef:
536+
int new_month = (dts.month + months) % 12
536537
return 12 if new_month == 0 else new_month
537538

538539

pandas/_libs/tslibs/timestamps.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,8 @@ def round_nsint64(values, mode, freq):
177177

178178
# if/elif above should catch all rounding modes defined in enum 'RoundTo':
179179
# if flow of control arrives here, it is a bug
180-
assert False, "round_nsint64 called with an unrecognized rounding mode"
180+
raise AssertionError("round_nsint64 called with an unrecognized "
181+
"rounding mode")
181182

182183

183184
# This is PITA. Because we inherit from datetime, which has very specific

pandas/_libs/writers.pyx

+8-8
Original file line number | Diff line number | Diff line change
@@ -128,16 +128,16 @@ def max_len_string_array(pandas_string[:] arr) -> Py_ssize_t:
128128
""" return the maximum size of elements in a 1-dim string array """
129129
cdef:
130130
Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
131-
pandas_string v
131+
pandas_string val
132132

133133
for i in range(length):
134-
v = arr[i]
135-
if isinstance(v, str):
136-
l = PyString_GET_SIZE(v)
137-
elif isinstance(v, bytes):
138-
l = PyBytes_GET_SIZE(v)
139-
elif isinstance(v, unicode):
140-
l = PyUnicode_GET_SIZE(v)
134+
val = arr[i]
135+
if isinstance(val, str):
136+
l = PyString_GET_SIZE(val)
137+
elif isinstance(val, bytes):
138+
l = PyBytes_GET_SIZE(val)
139+
elif isinstance(val, unicode):
140+
l = PyUnicode_GET_SIZE(val)
141141

142142
if l > m:
143143
m = l

0 commit comments

Comments
 (0)