Skip to content

PERF: cython optimizations #23477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
nleft = len(old)
nright = len(new)
indexer = np.empty(nright, dtype=np.int64)
indexer.fill(-1)
indexer[:] = -1

if limit is None:
lim = nright
Expand Down Expand Up @@ -607,7 +607,7 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
nleft = len(old)
nright = len(new)
indexer = np.empty(nright, dtype=np.int64)
indexer.fill(-1)
indexer[:] = -1

if limit is None:
lim = nright
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/algos_rank_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ Template for each `dtype` helper function for rank
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

#----------------------------------------------------------------------
# ----------------------------------------------------------------------
# rank_1d, rank_2d
#----------------------------------------------------------------------
# ----------------------------------------------------------------------

{{py:

Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def group_any_all(ndarray[uint8_t] out,
else:
raise ValueError("'bool_func' must be either 'any' or 'all'!")

out.fill(1 - flag_val)
out[:] = 1 - flag_val

with nogil:
for i in range(N):
Expand Down
22 changes: 11 additions & 11 deletions pandas/_libs/groupby_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ cdef extern from "numpy/npy_math.h":
double NAN "NPY_NAN"
_int64_max = np.iinfo(np.int64).max

#----------------------------------------------------------------------
# ----------------------------------------------------------------------
# group_add, group_prod, group_var, group_mean, group_ohlc
#----------------------------------------------------------------------
# ----------------------------------------------------------------------

{{py:

Expand Down Expand Up @@ -246,7 +246,7 @@ def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out,
if K > 1:
raise NotImplementedError("Argument 'values' must have only "
"one dimension")
out.fill(np.nan)
out[:] = np.nan

with nogil:
for i in range(N):
Expand Down Expand Up @@ -629,10 +629,10 @@ def group_max(ndarray[groupby_t, ndim=2] out,
maxx = np.empty_like(out)
if groupby_t is int64_t:
# Note: evaluated at compile-time
maxx.fill(-_int64_max)
maxx[:] = -_int64_max
nan_val = iNaT
else:
maxx.fill(-np.inf)
maxx[:] = -np.inf
nan_val = NAN

N, K = (<object>values).shape
Expand Down Expand Up @@ -691,10 +691,10 @@ def group_min(ndarray[groupby_t, ndim=2] out,

minx = np.empty_like(out)
if groupby_t is int64_t:
minx.fill(_int64_max)
minx[:] = _int64_max
nan_val = iNaT
else:
minx.fill(np.inf)
minx[:] = np.inf
nan_val = NAN

N, K = (<object>values).shape
Expand Down Expand Up @@ -747,9 +747,9 @@ def group_cummin(ndarray[groupby_t, ndim=2] out,
N, K = (<object>values).shape
accum = np.empty_like(values)
if groupby_t is int64_t:
accum.fill(_int64_max)
accum[:] = _int64_max
else:
accum.fill(np.inf)
accum[:] = np.inf

with nogil:
for i in range(N):
Expand Down Expand Up @@ -795,9 +795,9 @@ def group_cummax(ndarray[groupby_t, ndim=2] out,
N, K = (<object>values).shape
accum = np.empty_like(values)
if groupby_t is int64_t:
accum.fill(-_int64_max)
accum[:] = -_int64_max
else:
accum.fill(-np.inf)
accum[:] = -np.inf

with nogil:
for i in range(N):
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/join.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def _get_result_indexer(sorter, indexer):
else:
# length-0 case
res = np.empty(len(indexer), dtype=np.int64)
res.fill(-1)
res[:] = -1

return res

Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length):
int64_t idx

rev_indexer = np.empty(length, dtype=np.int64)
rev_indexer.fill(-1)
rev_indexer[:] = -1
for i in range(n):
idx = indexer[i]
if idx != -1:
Expand Down Expand Up @@ -1670,7 +1670,7 @@ cdef class TimedeltaValidator(TemporalValidator):


# TODO: Not used outside of tests; remove?
def is_timedelta_array(values: ndarray) -> bint:
def is_timedelta_array(values: ndarray) -> bool:
cdef:
TimedeltaValidator validator = TimedeltaValidator(len(values),
skipna=True)
Expand All @@ -1683,7 +1683,7 @@ cdef class Timedelta64Validator(TimedeltaValidator):


# TODO: Not used outside of tests; remove?
def is_timedelta64_array(values: ndarray) -> bint:
def is_timedelta64_array(values: ndarray) -> bool:
cdef:
Timedelta64Validator validator = Timedelta64Validator(len(values),
skipna=True)
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -278,14 +278,14 @@ def isnaobj2d_old(ndarray arr):
return result.view(np.bool_)


cpdef bint isposinf_scalar(object val):
def isposinf_scalar(val: object) -> bool:
    """Return True if `val` is a float equal to positive infinity."""
    # Guard clause: anything that is not a float can never be +inf.
    if not util.is_float_object(val):
        return False
    return val == INF


cpdef bint isneginf_scalar(object val):
def isneginf_scalar(val: object) -> bool:
if util.is_float_object(val) and val == NEGINF:
return True
else:
Expand Down
8 changes: 4 additions & 4 deletions pandas/_libs/sparse.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ cdef class IntIndex(SparseIndex):

n = len(indexer)
results = np.empty(n, dtype=np.int32)
results.fill(-1)
results[:] = -1

if self.npoints == 0:
return results
Expand Down Expand Up @@ -250,9 +250,9 @@ cdef class IntIndex(SparseIndex):
sinds = self.indices

result = np.empty(other.npoints, dtype=np.float64)
result.fill(fill_value)
result[:] = fill_value

for 0 <= i < other.npoints:
for i in range(other.npoints):
while oinds[i] > sinds[j] and j < self.npoints:
j += 1

Expand Down Expand Up @@ -582,7 +582,7 @@ cdef class BlockIndex(SparseIndex):

n = len(indexer)
results = np.empty(n, dtype=np.int32)
results.fill(-1)
results[:] = -1

if self.npoints == 0:
return results
Expand Down
18 changes: 10 additions & 8 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -869,10 +869,10 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
"""
cdef:
ndarray[int64_t] trans
int64_t[:] deltas, idx_shifted
int64_t[:] deltas, idx_shifted, idx_shifted_left, idx_shifted_right
ndarray ambiguous_array
Py_ssize_t i, idx, pos, ntrans, n = len(vals)
Py_ssize_t delta_idx_offset, delta_idx
Py_ssize_t delta_idx_offset, delta_idx, pos_left, pos_right
int64_t *tdata
int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins
ndarray[int64_t] result, result_a, result_b, dst_hours
Expand Down Expand Up @@ -920,15 +920,15 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,

trans, deltas, typ = get_dst_info(tz)

tdata = <int64_t*> cnp.PyArray_DATA(trans)
tdata = <int64_t*>cnp.PyArray_DATA(trans)
ntrans = len(trans)

# Determine whether each date lies left of the DST transition (store in
# result_a) or right of the DST transition (store in result_b)
result_a = np.empty(n, dtype=np.int64)
result_b = np.empty(n, dtype=np.int64)
result_a.fill(NPY_NAT)
result_b.fill(NPY_NAT)
result_a[:] = NPY_NAT
result_b[:] = NPY_NAT

idx_shifted_left = (np.maximum(0, trans.searchsorted(
vals - DAY_NS, side='right') - 1)).astype(np.int64)
Expand All @@ -952,7 +952,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,

if infer_dst:
dst_hours = np.empty(n, dtype=np.int64)
dst_hours.fill(NPY_NAT)
dst_hours[:] = NPY_NAT

# Get the ambiguous hours (given the above, these are the hours
# where result_a != result_b and neither of them are NAT)
Expand Down Expand Up @@ -1045,8 +1045,10 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
return result


cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n):
cdef Py_ssize_t pivot, left = 0, right = n
cdef inline Py_ssize_t bisect_right_i8(int64_t *data,
int64_t val, Py_ssize_t n):
cdef:
Py_ssize_t pivot, left = 0, right = n

assert n >= 1

Expand Down
12 changes: 8 additions & 4 deletions pandas/_libs/tslibs/frequencies.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,7 @@ cpdef get_freq_code(freqstr):
freqstr = (freqstr.rule_code, freqstr.n)

if isinstance(freqstr, tuple):
if (is_integer_object(freqstr[0]) and
is_integer_object(freqstr[1])):
if is_integer_object(freqstr[0]) and is_integer_object(freqstr[1]):
# e.g., freqstr = (2000, 1)
return freqstr
else:
Expand All @@ -171,7 +170,7 @@ cpdef get_freq_code(freqstr):
return code, stride

if is_integer_object(freqstr):
return (freqstr, 1)
return freqstr, 1

base, stride = _base_and_stride(freqstr)
code = _period_str_to_code(base)
Expand All @@ -183,6 +182,11 @@ cpdef _base_and_stride(freqstr):
"""
Return base freq and stride info from string representation

Returns
-------
base : str
stride : int

Examples
--------
_freq_and_stride('5Min') -> 'Min', 5
Expand All @@ -201,7 +205,7 @@ cpdef _base_and_stride(freqstr):

base = groups.group(2)

return (base, stride)
return base, stride


cpdef _period_str_to_code(freqstr):
Expand Down
3 changes: 2 additions & 1 deletion pandas/_libs/tslibs/offsets.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,8 @@ cdef inline int month_add_months(npy_datetimestruct dts, int months) nogil:
New month number after shifting npy_datetimestruct
number of months.
"""
cdef int new_month = (dts.month + months) % 12
cdef:
int new_month = (dts.month + months) % 12
return 12 if new_month == 0 else new_month


Expand Down
3 changes: 2 additions & 1 deletion pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,8 @@ def round_nsint64(values, mode, freq):

# if/elif above should catch all rounding modes defined in enum 'RoundTo':
# if flow of control arrives here, it is a bug
assert False, "round_nsint64 called with an unrecognized rounding mode"
raise AssertionError("round_nsint64 called with an unrecognized "
"rounding mode")


# This is PITA. Because we inherit from datetime, which has very specific
Expand Down
16 changes: 8 additions & 8 deletions pandas/_libs/writers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -128,16 +128,16 @@ def max_len_string_array(pandas_string[:] arr) -> Py_ssize_t:
""" return the maximum size of elements in a 1-dim string array """
cdef:
Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
pandas_string v
pandas_string val

for i in range(length):
v = arr[i]
if isinstance(v, str):
l = PyString_GET_SIZE(v)
elif isinstance(v, bytes):
l = PyBytes_GET_SIZE(v)
elif isinstance(v, unicode):
l = PyUnicode_GET_SIZE(v)
val = arr[i]
if isinstance(val, str):
l = PyString_GET_SIZE(val)
elif isinstance(val, bytes):
l = PyBytes_GET_SIZE(val)
elif isinstance(val, unicode):
l = PyUnicode_GET_SIZE(val)

if l > m:
m = l
Expand Down