From de87047686db9ae4e3b32c12f7b2938e3b456843 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 25 Oct 2018 12:31:26 -0700 Subject: [PATCH 01/10] use more memoryviews --- pandas/_libs/algos.pyx | 17 +++++++++-------- pandas/_libs/lib.pyx | 2 +- pandas/_libs/reduction.pyx | 1 + pandas/_libs/sparse.pyx | 19 +++++++++++-------- pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/_libs/tslibs/timestamps.pyx | 1 + 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 3ba4c2375b4e8..cfaccaaa4e40e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -151,7 +151,7 @@ def is_lexsorted(list_of_arrays: list) -> bint: @cython.boundscheck(False) @cython.wraparound(False) -def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups): +def groupsort_indexer(int64_t[:] index, Py_ssize_t ngroups): """ compute a 1-d indexer that is an ordering of the passed index, ordered by the groups. This is a reverse of the label @@ -373,7 +373,7 @@ ctypedef fused algos_t: # TODO: unused; needed? @cython.wraparound(False) @cython.boundscheck(False) -cpdef map_indices(ndarray[algos_t] index): +cpdef map_indices(algos_t[:] index): """ Produce a dict mapping the values of the input array to their respective locations. @@ -397,7 +397,7 @@ cpdef map_indices(ndarray[algos_t] index): @cython.boundscheck(False) @cython.wraparound(False) -def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): +def pad(algos_t[:] old, algos_t[:] new, limit=None): cdef: Py_ssize_t i, j, nleft, nright ndarray[int64_t, ndim=1] indexer @@ -475,8 +475,9 @@ pad_bool = pad["uint8_t"] @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace(ndarray[algos_t] values, +def pad_inplace(algos_t[:] values, ndarray[uint8_t, cast=True] mask, + # TODO: What does the cast=True mean? If unneeded, use bint[:]? 
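+                # (On the TODO above: cast=True relaxes the buffer dtype
+                # check, so an array whose dtype has the same itemsize,
+                # here a bool ndarray, is accepted and reinterpreted as
+                # uint8 without a copy. A plain uint8_t[:] memoryview would
+                # instead require callers to pass mask.view(np.uint8).)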
limit=None): cdef: Py_ssize_t i, N @@ -595,7 +596,7 @@ D @cython.boundscheck(False) @cython.wraparound(False) -def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): +def backfill(algos_t[:] old, algos_t[:] new, limit=None): cdef: Py_ssize_t i, j, nleft, nright ndarray[int64_t, ndim=1] indexer @@ -674,7 +675,7 @@ backfill_bool = backfill["uint8_t"] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace(ndarray[algos_t] values, +def backfill_inplace(algos_t[:] values, ndarray[uint8_t, cast=True] mask, limit=None): cdef: @@ -768,7 +769,7 @@ backfill_2d_inplace_bool = backfill_2d_inplace["uint8_t"] @cython.wraparound(False) @cython.boundscheck(False) -def arrmap(ndarray[algos_t] index, object func): +def arrmap(algos_t[:] index, object func): cdef: Py_ssize_t length = index.shape[0] Py_ssize_t i = 0 @@ -793,7 +794,7 @@ arrmap_bool = arrmap["uint8_t"] @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic(ndarray[algos_t] arr, bint timelike): +def is_monotonic(algos_t[:] arr, bint timelike): """ Returns ------- diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c5d5a431e8139..b931898d1ba46 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1909,7 +1909,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, @cython.boundscheck(False) @cython.wraparound(False) -def maybe_convert_objects(ndarray[object] objects, bint try_float=0, +def maybe_convert_objects(ndarray[object, ndim=1] objects, bint try_float=0, bint safe=0, bint convert_datetime=0, bint convert_timedelta=0): """ diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 681ea2c6295f2..119060bd28a1c 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -438,6 +438,7 @@ cdef inline _extract_result(object res): res = res[0] return res + cdef class Slider: """ Only handles contiguous data for now diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 6b6c442632e4c..8358555c4966b 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -315,6 +315,7 @@ cpdef get_blocks(ndarray[int32_t, ndim=1] indices): lens = lens[:result_indexer] return locs, lens + # ----------------------------------------------------------------------------- # BlockIndex @@ -805,10 +806,11 @@ include "sparse_op_helper.pxi" # Indexing operations def get_reindexer(ndarray[object, ndim=1] values, dict index_map): - cdef object idx - cdef Py_ssize_t i - cdef Py_ssize_t new_length = len(values) - cdef ndarray[int32_t, ndim=1] indexer + cdef: + object idx + Py_ssize_t i + Py_ssize_t new_length = len(values) + ndarray[int32_t, ndim=1] indexer indexer = np.empty(new_length, dtype=np.int32) @@ -861,10 +863,11 @@ def reindex_integer(ndarray[float64_t, ndim=1] values, # SparseArray mask create operations def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value): - cdef object value - cdef Py_ssize_t i - cdef Py_ssize_t new_length = len(arr) - cdef ndarray[int8_t, ndim=1] mask + cdef: + object value + Py_ssize_t i + Py_ssize_t new_length = len(arr) + ndarray[int8_t, ndim=1] mask mask = np.ones(new_length, dtype=np.int8) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index f9c604cd76472..1e4ec740328b8 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -49,10 +49,10 @@ TD_DTYPE = np.dtype('m8[ns]') UTC = pytz.UTC + # ---------------------------------------------------------------------- # Misc Helpers -# TODO: How to declare 
np.datetime64 as the input type? cdef inline int64_t get_datetime64_nanos(object val) except? -1: """ Extract the value and unit from a np.datetime64 object, then convert the diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 08b0c5472549e..51f415f3713f1 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -107,6 +107,7 @@ cdef inline _npdivmod(x1, x2): try: from numpy import divmod as npdivmod except ImportError: + # numpy < 1.13 npdivmod = _npdivmod From 9a5d635e63cd9d2855fada57acdcd192d5390cbb Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 26 Oct 2018 10:19:37 -0700 Subject: [PATCH 02/10] use more memoryviews --- pandas/_libs/algos.pyx | 23 ++++--------- pandas/_libs/algos_common_helper.pxi.in | 14 ++------ pandas/_libs/groupby.pyx | 20 +++++------ pandas/_libs/groupby_helper.pxi.in | 26 +++++++-------- pandas/_libs/lib.pyx | 44 +++++++++++++------------ 5 files changed, 55 insertions(+), 72 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index cfaccaaa4e40e..fefc703e04335 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -77,7 +77,7 @@ class NegInfinity(object): __ge__ = lambda self, other: isinstance(other, NegInfinity) -cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr): +cpdef ndarray[int64_t, ndim=1] unique_deltas(int64_t[:] arr): """ Efficiently find the unique first-differences of the given array. @@ -236,7 +236,7 @@ def nancorr(ndarray[float64_t, ndim=2] mat, bint cov=0, minp=None): Py_ssize_t i, j, xi, yi, N, K bint minpv ndarray[float64_t, ndim=2] result - ndarray[uint8_t, ndim=2] mask + uint8_t[:, :] mask int64_t nobs = 0 float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor @@ -301,7 +301,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): ndarray[float64_t, ndim=2] result ndarray[float64_t, ndim=1] maskedx ndarray[float64_t, ndim=1] maskedy - ndarray[uint8_t, ndim=2] mask + uint8_t[:, :] mask int64_t nobs = 0 float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor @@ -475,10 +475,7 @@ pad_bool = pad["uint8_t"] @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace(algos_t[:] values, - ndarray[uint8_t, cast=True] mask, - # TODO: What does the cast=True mean? If unneeded, use bint[:]? 
- limit=None): +def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): cdef: Py_ssize_t i, N algos_t val @@ -522,9 +519,7 @@ pad_inplace_bool = pad_inplace["uint8_t"] @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace(ndarray[algos_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): +def pad_2d_inplace(algos_t[:, :] values, uint8_t[:, :] mask, limit=None): cdef: Py_ssize_t i, j, N, K algos_t val @@ -675,9 +670,7 @@ backfill_bool = backfill["uint8_t"] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace(algos_t[:] values, - ndarray[uint8_t, cast=True] mask, - limit=None): +def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): cdef: Py_ssize_t i, N algos_t val @@ -721,9 +714,7 @@ backfill_inplace_bool = backfill_inplace["uint8_t"] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace(ndarray[algos_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): +def backfill_2d_inplace(algos_t[:, :] values, uint8_t[:, :] mask, limit=None): cdef: Py_ssize_t i, j, N, K algos_t val diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 518664d70cf06..25b5f43a0bab5 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -1,16 +1,6 @@ """ Template for each `dtype` helper function using 1-d template -# 1-d template -- pad -- pad_1d -- pad_2d -- backfill -- backfill_1d -- backfill_2d -- is_monotonic -- arrmap - WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ @@ -39,7 +29,7 @@ def get_dispatch(dtypes): @cython.boundscheck(False) @cython.wraparound(False) def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr, - ndarray[{{dest_type}}, ndim=2] out, + {{dest_type}}[:, :] out, Py_ssize_t periods, int axis): cdef: Py_ssize_t i, j, sx, sy @@ -83,7 +73,7 @@ def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr, def put2d_{{name}}_{{dest_name}}(ndarray[{{c_type}}, ndim=2, cast=True] values, ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[{{dest_type}}] out): + {{dest_type}}[:, :] out): cdef: Py_ssize_t i, j, k diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d683c93c9b32e..4ae2f3cc713f2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -99,16 +99,16 @@ cdef inline float64_t kth_smallest_c(float64_t* a, @cython.boundscheck(False) @cython.wraparound(False) def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, + int64_t[:] counts, ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels, + int64_t[:] labels, Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts + int64_t[:] _counts ndarray data float64_t* ptr @@ -217,7 +217,7 @@ def group_cumsum(numeric[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, +def group_shift_indexer(int64_t[:] out, int64_t[:] labels, int ngroups, int periods): cdef: Py_ssize_t N, i, j, ii @@ -269,8 +269,8 @@ def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, @cython.wraparound(False) @cython.boundscheck(False) -def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, - ndarray[uint8_t] mask, object direction, +def group_fillna_indexer(int64_t[:] out, ndarray[int64_t] labels, + uint8_t[:] mask, object direction, int64_t limit): """Indexes how to fill values forwards or backwards within 
a group @@ -328,9 +328,9 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, @cython.boundscheck(False) @cython.wraparound(False) def group_any_all(ndarray[uint8_t] out, - ndarray[int64_t] labels, - ndarray[uint8_t] values, - ndarray[uint8_t] mask, + int64_t[:] labels, + uint8_t[:] values, + uint8_t[:] mask, object val_test, bint skipna): """Aggregated boolean values to show truthfulness of group elements @@ -353,7 +353,7 @@ def group_any_all(ndarray[uint8_t] out, The returned values will either be 0 or 1 (False or True, respectively). """ cdef: - Py_ssize_t i, N=len(labels) + Py_ssize_t i, N = len(labels) int64_t lab uint8_t flag_val diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index addbb2b3e8165..128ac3840928b 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -30,9 +30,9 @@ def get_dispatch(dtypes): @cython.wraparound(False) @cython.boundscheck(False) def group_add_{{name}}(ndarray[{{c_type}}, ndim=2] out, - ndarray[int64_t] counts, + int64_t[:] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels, + int64_t[:] labels, Py_ssize_t min_count=0): """ Only aggregates on axis=0 @@ -40,7 +40,7 @@ def group_add_{{name}}(ndarray[{{c_type}}, ndim=2] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{c_type}} val, count - ndarray[{{c_type}}, ndim=2] sumx, nobs + {{c_type}}[:, :] sumx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -77,9 +77,9 @@ def group_add_{{name}}(ndarray[{{c_type}}, ndim=2] out, @cython.wraparound(False) @cython.boundscheck(False) def group_prod_{{name}}(ndarray[{{c_type}}, ndim=2] out, - ndarray[int64_t] counts, + int64_t[:] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels, + int64_t[:] labels, Py_ssize_t min_count=0): """ Only aggregates on axis=0 @@ -87,7 +87,7 @@ def group_prod_{{name}}(ndarray[{{c_type}}, ndim=2] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{c_type}} val, count - ndarray[{{c_type}}, ndim=2] prodx, nobs + {{c_type}}[:, :] prodx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -124,14 +124,14 @@ def group_prod_{{name}}(ndarray[{{c_type}}, ndim=2] out, @cython.boundscheck(False) @cython.cdivision(True) def group_var_{{name}}(ndarray[{{c_type}}, ndim=2] out, - ndarray[int64_t] counts, + int64_t[:] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels, + int64_t[:] labels, Py_ssize_t min_count=-1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{c_type}} val, ct, oldmean - ndarray[{{c_type}}, ndim=2] nobs, mean + {{c_type}}[:, :] nobs, mean assert min_count == -1, "'min_count' only used in add and prod" @@ -176,14 +176,14 @@ def group_var_{{name}}(ndarray[{{c_type}}, ndim=2] out, @cython.wraparound(False) @cython.boundscheck(False) def group_mean_{{name}}(ndarray[{{c_type}}, ndim=2] out, - ndarray[int64_t] counts, + int64_t[:] counts, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, Py_ssize_t min_count=-1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{c_type}} val, count - ndarray[{{c_type}}, ndim=2] sumx, nobs + {{c_type}}[:, :] sumx, nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -221,9 +221,9 @@ def group_mean_{{name}}(ndarray[{{c_type}}, ndim=2] out, @cython.wraparound(False) @cython.boundscheck(False) def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out, - ndarray[int64_t] counts, + int64_t[:] counts, 
ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels, + int64_t[:] labels, Py_ssize_t min_count=-1): """ Only aggregates on axis=0 diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b931898d1ba46..d348a2f0acc8d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -329,7 +329,7 @@ def fast_zip(list ndarrays): return result -def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): +def get_reverse_indexer(int64_t[:] indexer, Py_ssize_t length): """ Reverse indexing operation. @@ -356,7 +356,7 @@ def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): return rev_indexer -def has_infs_f4(ndarray[float32_t] arr) -> bint: +def has_infs_f4(float32_t[:] arr) -> bint: cdef: Py_ssize_t i, n = len(arr) float32_t inf, neginf, val @@ -371,7 +371,7 @@ def has_infs_f4(ndarray[float32_t] arr) -> bint: return False -def has_infs_f8(ndarray[float64_t] arr) -> bint: +def has_infs_f8(float64_t[:] arr) -> bint: cdef: Py_ssize_t i, n = len(arr) float64_t inf, neginf, val @@ -473,7 +473,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bint: return True -def astype_intsafe(ndarray[object] arr, new_dtype): +def astype_intsafe(object[:] arr, new_dtype): cdef: Py_ssize_t i, n = len(arr) object v @@ -494,8 +494,7 @@ def astype_intsafe(ndarray[object] arr, new_dtype): return result -def astype_unicode(arr: ndarray, - skipna: bool=False) -> ndarray[object]: +def astype_unicode(arr: ndarray, skipna: bool=False) -> ndarray[object]: """ Convert all elements in an array to unicode. @@ -528,8 +527,7 @@ def astype_unicode(arr: ndarray, return result -def astype_str(arr: ndarray, - skipna: bool=False) -> ndarray[object]: +def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: """ Convert all elements in an array to string. @@ -605,7 +603,7 @@ def clean_index_list(list obj): # is a general, O(max(len(values), len(binner))) method. @cython.boundscheck(False) @cython.wraparound(False) -def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, +def generate_bins_dt64(ndarray[int64_t] values, int64_t[:] binner, object closed='left', bint hasnans=0): """ Int64 (datetime64) version of generic python version in groupby.py @@ -712,8 +710,7 @@ def row_bool_subset_object(ndarray[object, ndim=2] values, @cython.boundscheck(False) @cython.wraparound(False) -def get_level_sorter(ndarray[int64_t, ndim=1] label, - ndarray[int64_t, ndim=1] starts): +def get_level_sorter(ndarray[int64_t, ndim=1] label, int64_t[:] starts): """ argsort for a single level of a multi-index, keeping the order of higher levels unchanged. 
`starts` points to starts of same-key indices w.r.t @@ -736,7 +733,7 @@ def get_level_sorter(ndarray[int64_t, ndim=1] label, @cython.boundscheck(False) @cython.wraparound(False) def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, - ndarray[int64_t, ndim=1] labels, + int64_t[:] labels, Py_ssize_t max_bin, int axis): cdef: @@ -763,7 +760,7 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, return counts -def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): +def generate_slices(int64_t[:] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, start int64_t lab @@ -792,7 +789,7 @@ def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): return starts, ends -def indices_fast(object index, ndarray[int64_t] labels, list keys, +def indices_fast(object index, int64_t[:] labels, list keys, list sorted_labels): cdef: Py_ssize_t i, j, k, lab, cur, start, n = len(labels) @@ -1909,7 +1906,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, @cython.boundscheck(False) @cython.wraparound(False) -def maybe_convert_objects(ndarray[object, ndim=1] objects, bint try_float=0, +def maybe_convert_objects(object[:] objects, bint try_float=0, bint safe=0, bint convert_datetime=0, bint convert_timedelta=0): """ @@ -1922,12 +1919,18 @@ def maybe_convert_objects(ndarray[object, ndim=1] objects, bint try_float=0, ndarray[int64_t] ints ndarray[uint64_t] uints ndarray[uint8_t] bools - ndarray[int64_t] idatetimes - ndarray[int64_t] itimedeltas + int64_t[:] idatetimes + int64_t[:] itimedeltas Seen seen = Seen() object val float64_t fval, fnan + if objects is None: + # Without explicitly raising, groupby.ops _aggregate_series_pure_python + # can pass None and incorrectly raise an AttributeError when trying + # to access `objects.base` below. 
+ raise TypeError + n = len(objects) floats = np.empty(n, dtype='f8') @@ -2036,7 +2039,7 @@ def maybe_convert_objects(ndarray[object, ndim=1] objects, bint try_float=0, if seen.datetimetz_: if len({getattr(val, 'tzinfo', None) for val in objects}) == 1: from pandas import DatetimeIndex - return DatetimeIndex(objects) + return DatetimeIndex(objects.base) seen.object_ = 1 if not seen.object_: @@ -2101,11 +2104,10 @@ def maybe_convert_objects(ndarray[object, ndim=1] objects, bint try_float=0, elif seen.is_bool: return bools.view(np.bool_) - return objects + return objects.base # `.base` to access underlying np.ndarray -def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, - bint convert=1): +def map_infer_mask(ndarray arr, object f, uint8_t[:] mask, bint convert=1): """ Substitute for np.vectorize with pandas-friendly dtype inference From ca7fb487c0a4d113c4a53799ef7133a9650b1b29 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 26 Oct 2018 15:35:27 -0700 Subject: [PATCH 03/10] memoryviews, avoid built-in names --- pandas/_libs/algos.pyx | 4 +- pandas/_libs/algos_common_helper.pxi.in | 2 +- pandas/_libs/lib.pyx | 30 ++-- pandas/_libs/window.pyx | 228 ++++++++++++------------ 4 files changed, 134 insertions(+), 130 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index fefc703e04335..44ce1589de2c0 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -764,7 +764,7 @@ def arrmap(algos_t[:] index, object func): cdef: Py_ssize_t length = index.shape[0] Py_ssize_t i = 0 - ndarray[object] result = np.empty(length, dtype=np.object_) + object[:] result = np.empty(length, dtype=np.object_) from pandas._libs.lib import maybe_convert_objects @@ -785,7 +785,7 @@ arrmap_bool = arrmap["uint8_t"] @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic(algos_t[:] arr, bint timelike): +def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): """ Returns ------- diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 25b5f43a0bab5..a10f0493166a0 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -72,7 +72,7 @@ def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr, def put2d_{{name}}_{{dest_name}}(ndarray[{{c_type}}, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, + int64_t[:] indexer, Py_ssize_t loc, {{dest_type}}[:, :] out): cdef: Py_ssize_t i, j, k diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d348a2f0acc8d..872d0ffff771e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -18,7 +18,7 @@ PyDateTime_IMPORT import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM, +from numpy cimport (ndarray, PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, flatiter, NPY_OBJECT, int64_t, @@ -74,9 +74,9 @@ cdef bint PY2 = sys.version_info[0] == 2 cdef double nan = np.NaN -def values_from_object(object obj): +def values_from_object(obj: object): """ return my values or the object if we are say an ndarray """ - cdef func # TODO: Does declaring this without a type accomplish anything? 
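+    # (an untyped `cdef` declaration defaults to `object`, so the
+    # annotation below is equivalent at the C level; the change buys
+    # readability rather than speed)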
+ func: object func = getattr(obj, 'get_values', None) if func is not None: @@ -170,7 +170,7 @@ def item_from_zerodim(val: object) -> object: @cython.boundscheck(False) def fast_unique_multiple(list arrays): cdef: - ndarray[object] buf + object[:] buf Py_ssize_t k = len(arrays) Py_ssize_t i, j, n list uniques = [] @@ -586,7 +586,7 @@ def clean_index_list(list obj): return np.asarray(obj, dtype=object), 0 elif inferred in ['integer']: - # TODO: we infer an integer but it *could* be a unint64 + # TODO: we infer an integer but it *could* be a uint64 try: return np.asarray(obj, dtype='int64'), 0 except OverflowError: @@ -688,13 +688,13 @@ def row_bool_subset(ndarray[float64_t, ndim=2] values, @cython.boundscheck(False) @cython.wraparound(False) -def row_bool_subset_object(ndarray[object, ndim=2] values, +def row_bool_subset_object(object[:, :] values, ndarray[uint8_t, cast=True] mask): cdef: Py_ssize_t i, j, n, k, pos = 0 ndarray[object, ndim=2] out - n, k = ( values).shape + n, k = (values).shape assert (n == len(mask)) out = np.empty((mask.sum(), k), dtype=object) @@ -2129,11 +2129,11 @@ def map_infer_mask(ndarray arr, object f, uint8_t[:] mask, bint convert=1): result = np.empty(n, dtype=object) for i in range(n): if mask[i]: - val = util.get_value_at(arr, i) + val = arr[i] else: - val = f(util.get_value_at(arr, i)) + val = f(arr[i]) - if util.is_array(val) and PyArray_NDIM(val) == 0: + if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 # TODO: is there a faster way to unbox? # item_from_zerodim? @@ -2165,15 +2165,15 @@ def map_infer(ndarray arr, object f, bint convert=1): """ cdef: Py_ssize_t i, n - ndarray[object] result + object[:] result object val n = len(arr) result = np.empty(n, dtype=object) for i in range(n): - val = f(util.get_value_at(arr, i)) + val = f(arr[i]) - if util.is_array(val) and PyArray_NDIM(val) == 0: + if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 # TODO: is there a faster way to unbox? # item_from_zerodim? 
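# (A note on the memoryview pattern above, as a minimal illustrative sketch
# rather than part of the patch: binding the array returned by np.empty to a
# typed memoryview keeps the owning ndarray reachable as the view's `.base`,
# so a function can use fast typed element access internally and still hand
# back a real ndarray without copying:
#
#     cdef object[:] result = np.empty(n, dtype=object)
#     result[0] = f(arr[0])    # typed element access
#     return result.base       # the underlying np.ndarray, no copy
# )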
@@ -2187,7 +2187,7 @@ def map_infer(ndarray arr, object f, bint convert=1): convert_datetime=0, convert_timedelta=0) - return result + return result.base # `.base` to access underlying np.ndarray def to_object_array(list rows, int min_width=0): @@ -2284,7 +2284,7 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): cdef: Py_ssize_t i, n = len(keys) object val - ndarray[object] output = np.empty(n, dtype='O') + object[:] output = np.empty(n, dtype='O') if n == 0: # kludge, for Series diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 989dc4dd17a37..770e0d2ef0f09 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -122,14 +122,14 @@ cdef class MockFixedWindowIndexer(WindowIndexer): Parameters ---------- - input: ndarray - input data array + values: ndarray + values data array win: int64_t window size minp: int64_t min number of obs in a window to consider non-NaN index: object - index of the input + index of the values floor: optional unit for flooring left_closed: bint @@ -138,13 +138,13 @@ cdef class MockFixedWindowIndexer(WindowIndexer): right endpoint closedness """ - def __init__(self, ndarray input, int64_t win, int64_t minp, + def __init__(self, ndarray values, int64_t win, int64_t minp, bint left_closed, bint right_closed, object index=None, object floor=None): assert index is None self.is_variable = 0 - self.N = len(input) + self.N = len(values) self.minp = _check_minp(win, minp, self.N, floor=floor) self.start = np.empty(0, dtype='int64') self.end = np.empty(0, dtype='int64') @@ -160,14 +160,14 @@ cdef class FixedWindowIndexer(WindowIndexer): Parameters ---------- - input: ndarray - input data array + values: ndarray + values data array win: int64_t window size minp: int64_t min number of obs in a window to consider non-NaN index: object - index of the input + index of the values floor: optional unit for flooring the unit left_closed: bint @@ -176,14 +176,14 @@ cdef class FixedWindowIndexer(WindowIndexer): right endpoint closedness """ - def __init__(self, ndarray input, int64_t win, int64_t minp, + def __init__(self, ndarray values, int64_t win, int64_t minp, bint left_closed, bint right_closed, object index=None, object floor=None): cdef ndarray start_s, start_e, end_s, end_e assert index is None self.is_variable = 0 - self.N = len(input) + self.N = len(values) self.minp = _check_minp(win, minp, self.N, floor=floor) start_s = np.zeros(win, dtype='int64') @@ -205,14 +205,14 @@ cdef class VariableWindowIndexer(WindowIndexer): Parameters ---------- - input: ndarray - input data array + values: ndarray + values data array win: int64_t window size minp: int64_t min number of obs in a window to consider non-NaN index: ndarray - index of the input + index of the values left_closed: bint left endpoint closedness True if the left endpoint is closed, False if open @@ -222,7 +222,7 @@ cdef class VariableWindowIndexer(WindowIndexer): floor: optional unit for flooring the unit """ - def __init__(self, ndarray input, int64_t win, int64_t minp, + def __init__(self, ndarray values, int64_t win, int64_t minp, bint left_closed, bint right_closed, ndarray index, object floor=None): @@ -241,7 +241,7 @@ cdef class VariableWindowIndexer(WindowIndexer): # max window size self.win = (self.end - self.start).max() - def build(self, ndarray[int64_t] index, int64_t win, bint left_closed, + def build(self, int64_t[:] index, int64_t win, bint left_closed, bint right_closed): cdef: @@ -294,18 +294,18 @@ cdef class VariableWindowIndexer(WindowIndexer): end[i] -= 
1 -def get_window_indexer(input, win, minp, index, closed, +def get_window_indexer(values, win, minp, index, closed, floor=None, use_mock=True): """ return the correct window indexer for the computation Parameters ---------- - input: 1d ndarray + values: 1d ndarray win: integer, window size minp: integer, minimum periods index: 1d ndarray, optional - index to the input array + index to the values array closed: string, default None {'right', 'left', 'both', 'neither'} window endpoint closedness. Defaults to 'right' in @@ -342,13 +342,13 @@ def get_window_indexer(input, win, minp, index, closed, left_closed = True if index is not None: - indexer = VariableWindowIndexer(input, win, minp, left_closed, + indexer = VariableWindowIndexer(values, win, minp, left_closed, right_closed, index, floor) elif use_mock: - indexer = MockFixedWindowIndexer(input, win, minp, left_closed, + indexer = MockFixedWindowIndexer(values, win, minp, left_closed, right_closed, index, floor) else: - indexer = FixedWindowIndexer(input, win, minp, left_closed, + indexer = FixedWindowIndexer(values, win, minp, left_closed, right_closed, index, floor) return indexer.get_data() @@ -357,7 +357,7 @@ def get_window_indexer(input, win, minp, index, closed, # this is only an impl for index not None, IOW, freq aware -def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, +def roll_count(ndarray[double_t] values, int64_t win, int64_t minp, object index, object closed): cdef: double val, count_x = 0.0 @@ -366,7 +366,7 @@ def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, ndarray[int64_t] start, end ndarray[double_t] output - start, end, N, win, minp, _ = get_window_indexer(input, win, + start, end, N, win, minp, _ = get_window_indexer(values, win, minp, index, closed) output = np.empty(N, dtype=float) @@ -381,7 +381,7 @@ def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, # setup count_x = 0.0 for j in range(s, e): - val = input[j] + val = values[j] if notnan(val): count_x += 1.0 @@ -389,13 +389,13 @@ def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, # calculate deletes for j in range(start[i - 1], s): - val = input[j] + val = values[j] if notnan(val): count_x -= 1.0 # calculate adds for j in range(end[i - 1], e): - val = input[j] + val = values[j] if notnan(val): count_x += 1.0 @@ -438,7 +438,7 @@ cdef inline void remove_sum(double val, int64_t *nobs, double *sum_x) nogil: sum_x[0] = sum_x[0] - val -def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, +def roll_sum(ndarray[double_t] values, int64_t win, int64_t minp, object index, object closed): cdef: double val, prev_x, sum_x = 0 @@ -448,7 +448,7 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, ndarray[int64_t] start, end ndarray[double_t] output - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, closed, floor=0) @@ -473,17 +473,17 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, sum_x = 0.0 nobs = 0 for j in range(s, e): - add_sum(input[j], &nobs, &sum_x) + add_sum(values[j], &nobs, &sum_x) else: # calculate deletes for j in range(start[i - 1], s): - remove_sum(input[j], &nobs, &sum_x) + remove_sum(values[j], &nobs, &sum_x) # calculate adds for j in range(end[i - 1], e): - add_sum(input[j], &nobs, &sum_x) + add_sum(values[j], &nobs, &sum_x) output[i] = calc_sum(minp, nobs, sum_x) @@ -496,15 +496,15 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t 
minp, with nogil: for i in range(0, range_endpoint): - add_sum(input[i], &nobs, &sum_x) + add_sum(values[i], &nobs, &sum_x) output[i] = NaN for i in range(range_endpoint, N): - val = input[i] + val = values[i] add_sum(val, &nobs, &sum_x) if i > win - 1: - prev_x = input[i - win] + prev_x = values[i - win] remove_sum(prev_x, &nobs, &sum_x) output[i] = calc_sum(minp, nobs, sum_x) @@ -557,7 +557,7 @@ cdef inline void remove_mean(double val, Py_ssize_t *nobs, double *sum_x, neg_ct[0] = neg_ct[0] - 1 -def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, +def roll_mean(ndarray[double_t] values, int64_t win, int64_t minp, object index, object closed): cdef: double val, prev_x, result, sum_x = 0 @@ -567,7 +567,7 @@ def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, ndarray[int64_t] start, end ndarray[double_t] output - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, closed) output = np.empty(N, dtype=float) @@ -590,19 +590,19 @@ def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, sum_x = 0.0 nobs = 0 for j in range(s, e): - val = input[j] + val = values[j] add_mean(val, &nobs, &sum_x, &neg_ct) else: # calculate deletes for j in range(start[i - 1], s): - val = input[j] + val = values[j] remove_mean(val, &nobs, &sum_x, &neg_ct) # calculate adds for j in range(end[i - 1], e): - val = input[j] + val = values[j] add_mean(val, &nobs, &sum_x, &neg_ct) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) @@ -611,16 +611,16 @@ def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, with nogil: for i in range(minp - 1): - val = input[i] + val = values[i] add_mean(val, &nobs, &sum_x, &neg_ct) output[i] = NaN for i in range(minp - 1, N): - val = input[i] + val = values[i] add_mean(val, &nobs, &sum_x, &neg_ct) if i > win - 1: - prev_x = input[i - win] + prev_x = values[i - win] remove_mean(prev_x, &nobs, &sum_x, &neg_ct) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) @@ -685,7 +685,7 @@ cdef inline void remove_var(double val, double *nobs, double *mean_x, ssqdm_x[0] = 0 -def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, +def roll_var(ndarray[double_t] values, int64_t win, int64_t minp, object index, object closed, int ddof=1): """ Numerically stable implementation using Welford's method. 
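# (On "Welford's method" in the docstring above: it maintains the running
# mean and the sum of squared deviations with O(1) work per added or removed
# observation and good numerical stability. In essence, the add step
# performed by the add_var helper is:
#
#     nobs += 1
#     delta = val - mean_x
#     mean_x += delta / nobs
#     ssqdm_x += ((nobs - 1) * delta ** 2) / nobs
#
# and the reported window variance is then ssqdm_x / (nobs - ddof).)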
@@ -698,7 +698,7 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, ndarray[int64_t] start, end ndarray[double_t] output - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, closed) output = np.empty(N, dtype=float) @@ -724,7 +724,7 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, if i == 0: for j in range(s, e): - add_var(input[j], &nobs, &mean_x, &ssqdm_x) + add_var(values[j], &nobs, &mean_x, &ssqdm_x) else: @@ -733,11 +733,11 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # calculate adds for j in range(end[i - 1], e): - add_var(input[j], &nobs, &mean_x, &ssqdm_x) + add_var(values[j], &nobs, &mean_x, &ssqdm_x) # calculate deletes for j in range(start[i - 1], s): - remove_var(input[j], &nobs, &mean_x, &ssqdm_x) + remove_var(values[j], &nobs, &mean_x, &ssqdm_x) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) @@ -748,7 +748,7 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # Over the first window, observations can only be added, never # removed for i in range(win): - add_var(input[i], &nobs, &mean_x, &ssqdm_x) + add_var(values[i], &nobs, &mean_x, &ssqdm_x) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) # a part of Welford's method for the online variance-calculation @@ -757,8 +757,8 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # After the first window, observations can both be added and # removed for i in range(win, N): - val = input[i] - prev = input[i - win] + val = values[i] + prev = values[i - win] if notnan(val): if prev == prev: @@ -845,7 +845,7 @@ cdef inline void remove_skew(double val, int64_t *nobs, double *x, double *xx, xxx[0] = xxx[0] - val * val * val -def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, +def roll_skew(ndarray[double_t] values, int64_t win, int64_t minp, object index, object closed): cdef: double val, prev @@ -856,7 +856,7 @@ def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, ndarray[int64_t] start, end ndarray[double_t] output - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, closed) output = np.empty(N, dtype=float) @@ -875,7 +875,7 @@ def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, if i == 0: for j in range(s, e): - val = input[j] + val = values[j] add_skew(val, &nobs, &x, &xx, &xxx) else: @@ -885,12 +885,12 @@ def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, # calculate adds for j in range(end[i - 1], e): - val = input[j] + val = values[j] add_skew(val, &nobs, &x, &xx, &xxx) # calculate deletes for j in range(start[i - 1], s): - val = input[j] + val = values[j] remove_skew(val, &nobs, &x, &xx, &xxx) output[i] = calc_skew(minp, nobs, x, xx, xxx) @@ -899,16 +899,16 @@ def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, with nogil: for i in range(minp - 1): - val = input[i] + val = values[i] add_skew(val, &nobs, &x, &xx, &xxx) output[i] = NaN for i in range(minp - 1, N): - val = input[i] + val = values[i] add_skew(val, &nobs, &x, &xx, &xxx) if i > win - 1: - prev = input[i - win] + prev = values[i - win] remove_skew(prev, &nobs, &x, &xx, &xxx) output[i] = calc_skew(minp, nobs, x, xx, xxx) @@ -984,7 +984,7 @@ cdef inline void remove_kurt(double val, int64_t *nobs, double *x, double *xx, xxxx[0] = xxxx[0] - val * val * val * val -def roll_kurt(ndarray[double_t] input, 
int64_t win, int64_t minp, +def roll_kurt(ndarray[double_t] values, int64_t win, int64_t minp, object index, object closed): cdef: double val, prev @@ -995,7 +995,7 @@ def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, ndarray[int64_t] start, end ndarray[double_t] output - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(values, win, minp, index, closed) output = np.empty(N, dtype=float) @@ -1014,7 +1014,7 @@ def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, if i == 0: for j in range(s, e): - add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) else: @@ -1023,11 +1023,11 @@ def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, # calculate adds for j in range(end[i - 1], e): - add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) # calculate deletes for j in range(start[i - 1], s): - remove_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) @@ -1036,14 +1036,14 @@ def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, with nogil: for i in range(minp - 1): - add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) output[i] = NaN for i in range(minp - 1, N): - add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) if i > win - 1: - prev = input[i - win] + prev = values[i - win] remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) @@ -1054,7 +1054,7 @@ def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, # Rolling median, min, max -def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, +def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, object index, object closed): cdef: double val, res, prev @@ -1070,7 +1070,7 @@ def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs start, end, N, win, minp, is_variable = get_window_indexer( - input, win, + values, win, minp, index, closed, use_mock=False) output = np.empty(N, dtype=float) @@ -1088,7 +1088,7 @@ def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, if i == 0: # setup - val = input[i] + val = values[i] if notnan(val): nobs += 1 err = skiplist_insert(sl, val) != 1 @@ -1099,14 +1099,14 @@ def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, # calculate deletes for j in range(start[i - 1], s): - val = input[j] + val = values[j] if notnan(val): skiplist_remove(sl, val) nobs -= 1 # calculate adds for j in range(end[i - 1], e): - val = input[j] + val = values[j] if notnan(val): nobs += 1 err = skiplist_insert(sl, val) != 1 @@ -1180,14 +1180,14 @@ cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, return result -def roll_max(ndarray[numeric] input, int64_t win, int64_t minp, +def roll_max(ndarray[numeric] values, int64_t win, int64_t minp, object index, object closed): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. 
Parameters
     ----------
-    input: numpy array
+    values: numpy array
     window: int, size of rolling window
     minp: if number of observations in window
           is below this, output a NaN
     index: ndarray, optional
        index for window computation
     closed: 'right', 'left', 'both', 'neither'
         make the interval closed on the right, left,
         both or neither endpoints
     """
-    return _roll_min_max(input, win, minp, index, closed=closed, is_max=1)
+    return _roll_min_max(values, win, minp, index, closed=closed, is_max=1)
 
 
-def roll_min(ndarray[numeric] input, int64_t win, int64_t minp,
+def roll_min(ndarray[numeric] values, int64_t win, int64_t minp,
              object index, object closed):
     """
     Moving min of 1d array of any numeric type along axis=0 ignoring NaNs.
 
     Parameters
     ----------
-    input: numpy array
+    values: numpy array
     window: int, size of rolling window
     minp: if number of observations in window
           is below this, output a NaN
     index: ndarray, optional
        index for window computation
     """
-    return _roll_min_max(input, win, minp, index, is_max=0, closed=closed)
+    return _roll_min_max(values, win, minp, index, is_max=0, closed=closed)
 
 
-cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp,
+cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp,
                    object index, object closed, bint is_max):
     """
     Moving min/max of 1d array of any numeric type along axis=0
@@ -1229,17 +1229,17 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp,
         bint is_variable
 
     starti, endi, N, win, minp, is_variable = get_window_indexer(
-        input, win,
+        values, win,
         minp, index, closed)
 
     if is_variable:
-        return _roll_min_max_variable(input, starti, endi, N, win, minp,
+        return _roll_min_max_variable(values, starti, endi, N, win, minp,
                                       is_max)
     else:
-        return _roll_min_max_fixed(input, starti, endi, N, win, minp, is_max)
+        return _roll_min_max_fixed(values, starti, endi, N, win, minp, is_max)
 
 
-cdef _roll_min_max_variable(ndarray[numeric] input,
+cdef _roll_min_max_variable(ndarray[numeric] values,
                             ndarray[int64_t] starti,
                             ndarray[int64_t] endi,
                             int64_t N,
@@ -1266,16 +1266,18 @@ cdef _roll_min_max_variable(ndarray[numeric] input,
         # So the code was optimized for that
 
         for i from starti[0] <= i < endi[0]:
-            ai = init_mm(input[i], &nobs, is_max)
+            ai = init_mm(values[i], &nobs, is_max)
 
             # Discard previous entries if we find new min or max
             if is_max:
-                while not Q.empty() and ((ai >= input[Q.back()]) or
-                                         (input[Q.back()] != input[Q.back()])):
+                while not Q.empty() and (
+                        (ai >= values[Q.back()]) or
+                        (values[Q.back()] != values[Q.back()])):
                     Q.pop_back()
             else:
-                while not Q.empty() and ((ai <= input[Q.back()]) or
-                                         (input[Q.back()] != input[Q.back()])):
+                while not Q.empty() and (
+                        (ai <= values[Q.back()]) or
+                        (values[Q.back()] != values[Q.back()])):
                     Q.pop_back()
             Q.push_back(i)
             W.push_back(i)
@@ -1286,20 +1288,22 @@ cdef _roll_min_max_variable(ndarray[numeric] input,
         for i in range(endi[0], endi[N-1]):
             if not Q.empty():
                 output[i-1+close_offset] = calc_mm(
-                    minp, nobs, input[Q.front()])
+                    minp, nobs, values[Q.front()])
             else:
                 output[i-1+close_offset] = NaN
 
-            ai = init_mm(input[i], &nobs, is_max)
+            ai = init_mm(values[i], &nobs, is_max)
 
             # Discard previous entries if we find new min or max
             if is_max:
-                while not Q.empty() and ((ai >= input[Q.back()]) or
-                                         (input[Q.back()] != input[Q.back()])):
+                while not Q.empty() and (
+                        (ai >= values[Q.back()]) or
+                        (values[Q.back()] != values[Q.back()])):
                     Q.pop_back()
             else:
-                while not Q.empty() and ((ai <= input[Q.back()]) or
-                                         (input[Q.back()] != input[Q.back()])):
+                while not Q.empty() and (
+                        (ai <= 
values[Q.back()]) or + (values[Q.back()] != values[Q.back()])): Q.pop_back() # Maintain window/nobs retention @@ -1307,18 +1311,18 @@ cdef _roll_min_max_variable(ndarray[numeric] input, while not Q.empty() and Q.front() <= i - curr_win_size: Q.pop_front() while not W.empty() and W.front() <= i - curr_win_size: - remove_mm(input[W.front()], &nobs) + remove_mm(values[W.front()], &nobs) W.pop_front() Q.push_back(i) W.push_back(i) - output[N-1] = calc_mm(minp, nobs, input[Q.front()]) + output[N-1] = calc_mm(minp, nobs, values[Q.front()]) return output -cdef _roll_min_max_fixed(ndarray[numeric] input, +cdef _roll_min_max_fixed(ndarray[numeric] values, ndarray[int64_t] starti, ndarray[int64_t] endi, int64_t N, @@ -1345,18 +1349,18 @@ cdef _roll_min_max_fixed(ndarray[numeric] input, end = ring + win last = ring minvalue = ring - ai = input[0] - minvalue[0] = init_mm(input[0], &nobs, is_max) + ai = values[0] + minvalue[0] = init_mm(values[0], &nobs, is_max) death[0] = win nobs = 0 with nogil: for i in range(N): - ai = init_mm(input[i], &nobs, is_max) + ai = init_mm(values[i], &nobs, is_max) if i >= win: - remove_mm(input[i - win], &nobs) + remove_mm(values[i - win], &nobs) if death[minvalue - ring] == i: minvalue = minvalue + 1 @@ -1425,7 +1429,7 @@ interpolation_types = { } -def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, +def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, int64_t minp, object index, object closed, double quantile, str interpolation): """ @@ -1449,13 +1453,13 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, try: interpolation_type = interpolation_types[interpolation] except KeyError: - raise ValueError("Interpolation '{}' is not supported" - .format(interpolation)) + raise ValueError("Interpolation '{interp}' is not supported" + .format(interp=interpolation)) # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs start, end, N, win, minp, is_variable = get_window_indexer( - input, win, + values, win, minp, index, closed, use_mock=False) output = np.empty(N, dtype=float) @@ -1471,7 +1475,7 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, if i == 0: # setup - val = input[i] + val = values[i] if notnan(val): nobs += 1 skiplist_insert(skiplist, val) @@ -1480,14 +1484,14 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, # calculate deletes for j in range(start[i - 1], s): - val = input[j] + val = values[j] if notnan(val): skiplist_remove(skiplist, val) nobs -= 1 # calculate adds for j in range(end[i - 1], e): - val = input[j] + val = values[j] if notnan(val): nobs += 1 skiplist_insert(skiplist, val) @@ -1635,18 +1639,18 @@ def roll_generic(object obj, return output -def roll_window(ndarray[float64_t, ndim=1, cast=True] input, +def roll_window(ndarray[float64_t, ndim=1, cast=True] values, ndarray[float64_t, ndim=1, cast=True] weights, int minp, bint avg=True): """ - Assume len(weights) << len(input) + Assume len(weights) << len(values) """ cdef: ndarray[double_t] output, tot_wgt, counts Py_ssize_t in_i, win_i, win_n, win_k, in_n, in_k float64_t val_in, val_win, c, w - in_n = len(input) + in_n = len(values) win_n = len(weights) output = np.zeros(in_n, dtype=float) counts = np.zeros(in_n, dtype=float) @@ -1662,7 +1666,7 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, continue for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = input[in_i] + val_in = values[in_i] if val_in == val_in: output[in_i + (win_n - 
win_i) - 1] += val_in * val_win counts[in_i + (win_n - win_i) - 1] += 1 @@ -1686,7 +1690,7 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, continue for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = input[in_i] + val_in = values[in_i] if val_in == val_in: output[in_i + (win_n - win_i) - 1] += val_in * val_win From dc14378f5c9fba582e512d463b4ae3ae64e5a0ba Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 27 Oct 2018 07:48:11 -0700 Subject: [PATCH 04/10] cython optimizations and cleanup --- pandas/_libs/algos.pyx | 2 + pandas/_libs/groupby.pyx | 14 +++---- pandas/_libs/groupby_helper.pxi.in | 49 +++++++++++----------- pandas/_libs/hashtable.pyx | 8 ++-- pandas/_libs/hashtable_class_helper.pxi.in | 45 ++++++++++---------- pandas/_libs/hashtable_func_helper.pxi.in | 48 ++++++++++----------- pandas/_libs/join.pyx | 4 +- pandas/_libs/lib.pyx | 26 ++++++++++-- 8 files changed, 107 insertions(+), 89 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 44ce1589de2c0..bcd0eab1dfc5e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -77,6 +77,8 @@ class NegInfinity(object): __ge__ = lambda self, other: isinstance(other, NegInfinity) +@cython.wraparound(False) +@cython.boundscheck(False) cpdef ndarray[int64_t, ndim=1] unique_deltas(int64_t[:] arr): """ Efficiently find the unique first-differences of the given array. diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 4ae2f3cc713f2..aed884e3aa010 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -98,7 +98,7 @@ cdef inline float64_t kth_smallest_c(float64_t* a, @cython.boundscheck(False) @cython.wraparound(False) -def group_median_float64(ndarray[float64_t, ndim=2] out, +def group_median_float64(float64_t[:, :] out, int64_t[:] counts, ndarray[float64_t, ndim=2] values, int64_t[:] labels, @@ -109,7 +109,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, cdef: Py_ssize_t i, j, N, K, ngroups, size int64_t[:] _counts - ndarray data + ndarray[float64_t, ndim=2] data float64_t* ptr assert min_count == -1, "'min_count' only used in add and prod" @@ -291,7 +291,7 @@ def group_fillna_indexer(int64_t[:] out, ndarray[int64_t] labels, """ cdef: Py_ssize_t i, N - ndarray[int64_t] sorted_labels + int64_t[:] sorted_labels int64_t idx, curr_fill_idx=-1, filled_vals=0 N = len(out) @@ -299,8 +299,8 @@ def group_fillna_indexer(int64_t[:] out, ndarray[int64_t] labels, # Make sure all arrays are the same size assert N == len(labels) == len(mask) - sorted_labels = np.argsort(labels, kind='mergesort').astype( - np.int64, copy=False) + sorted_labels = np.argsort(labels, kind='mergesort').astype(np.int64, + copy=False) if direction == 'bfill': sorted_labels = sorted_labels[::-1] @@ -327,7 +327,7 @@ def group_fillna_indexer(int64_t[:] out, ndarray[int64_t] labels, @cython.boundscheck(False) @cython.wraparound(False) -def group_any_all(ndarray[uint8_t] out, +def group_any_all(uint8_t[:] out, int64_t[:] labels, uint8_t[:] values, uint8_t[:] mask, @@ -370,7 +370,7 @@ def group_any_all(ndarray[uint8_t] out, else: raise ValueError("'bool_func' must be either 'any' or 'all'!") - out.fill(1 - flag_val) + out[:] = 1 - flag_val with nogil: for i in range(N): diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 128ac3840928b..b7f47b5a1d69d 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -220,7 +220,7 @@ def group_mean_{{name}}(ndarray[{{c_type}}, ndim=2] out, 
@cython.wraparound(False) @cython.boundscheck(False) -def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out, +def group_ohlc_{{name}}({{c_type}}[:, :] out, int64_t[:] counts, ndarray[{{c_type}}, ndim=2] values, int64_t[:] labels, @@ -246,7 +246,8 @@ def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out, if K > 1: raise NotImplementedError("Argument 'values' must have only " "one dimension") - out.fill(np.nan) + + out[:] = np.nan with nogil: for i in range(N): @@ -304,8 +305,8 @@ def group_last_{{name}}(ndarray[{{c_type}}, ndim=2] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{c_type}} val - ndarray[{{c_type}}, ndim=2] resx - ndarray[int64_t, ndim=2] nobs + {{c_type}}[:, :] resx + int64_t[:, :] nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -313,7 +314,7 @@ def group_last_{{name}}(ndarray[{{c_type}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) - {{if name=='object'}} + {{if name == 'object'}} resx = np.empty(( out).shape, dtype=object) {{else}} resx = np.empty_like(out) @@ -361,8 +362,8 @@ def group_nth_{{name}}(ndarray[{{c_type}}, ndim=2] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{c_type}} val - ndarray[{{c_type}}, ndim=2] resx - ndarray[int64_t, ndim=2] nobs + {{c_type}}[:, :] resx + int64_t[:, :] nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -411,7 +412,7 @@ def group_nth_{{name}}(ndarray[{{c_type}}, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, +def group_rank_{{name}}(float64_t[:, :] out, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, bint is_datetimelike, object ties_method, @@ -453,8 +454,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, TiebreakEnumType tiebreak Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 - ndarray[int64_t] _as - ndarray[float64_t, ndim=2] grp_sizes + int64_t[:] _as + float64_t[:, :] grp_sizes ndarray[{{c_type}}] masked_vals ndarray[uint8_t] mask bint keep_na @@ -617,7 +618,7 @@ def group_max(ndarray[groupby_t, ndim=2] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] maxx, nobs + groupby_t[:, :] maxx, nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -629,10 +630,10 @@ def group_max(ndarray[groupby_t, ndim=2] out, maxx = np.empty_like(out) if groupby_t is int64_t: # Note: evaluated at compile-time - maxx.fill(-_int64_max) + maxx[:] = -_int64_max nan_val = iNaT else: - maxx.fill(-np.inf) + maxx[:] = -np.inf nan_val = NAN N, K = ( values).shape @@ -685,7 +686,7 @@ def group_min(ndarray[groupby_t, ndim=2] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] minx, nobs + groupby_t[:, :] minx, nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -696,10 +697,10 @@ def group_min(ndarray[groupby_t, ndim=2] out, minx = np.empty_like(out) if groupby_t is int64_t: - minx.fill(_int64_max) + minx[:] = _int64_max nan_val = iNaT else: - minx.fill(np.inf) + minx[:] = np.inf nan_val = NAN N, K = ( values).shape @@ -741,7 +742,7 @@ group_min_int64 = group_min["int64_t"] @cython.boundscheck(False) @cython.wraparound(False) -def group_cummin(ndarray[groupby_t, ndim=2] out, +def group_cummin(groupby_t[:, :] out, ndarray[groupby_t, ndim=2] values, ndarray[int64_t] labels, bint 
is_datetimelike): @@ -751,15 +752,15 @@ def group_cummin(ndarray[groupby_t, ndim=2] out, cdef: Py_ssize_t i, j, N, K, size groupby_t val, mval - ndarray[groupby_t, ndim=2] accum + groupby_t[:, :] accum int64_t lab N, K = ( values).shape accum = np.empty_like(values) if groupby_t is int64_t: - accum.fill(_int64_max) + accum[:] = _int64_max else: - accum.fill(np.inf) + accum[:] = np.inf with nogil: for i in range(N): @@ -794,7 +795,7 @@ group_cummin_int64 = group_cummin["int64_t"] @cython.boundscheck(False) @cython.wraparound(False) -def group_cummax(ndarray[groupby_t, ndim=2] out, +def group_cummax(groupby_t[:, :] out, ndarray[groupby_t, ndim=2] values, ndarray[int64_t] labels, bint is_datetimelike): @@ -804,15 +805,15 @@ def group_cummax(ndarray[groupby_t, ndim=2] out, cdef: Py_ssize_t i, j, N, K, size groupby_t val, mval - ndarray[groupby_t, ndim=2] accum + groupby_t[:, :] accum int64_t lab N, K = ( values).shape accum = np.empty_like(values) if groupby_t is int64_t: - accum.fill(-_int64_max) + accum[:] = -_int64_max else: - accum.fill(-np.inf) + accum[:] = -np.inf with nogil: for i in range(N): diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 2ced98198afc6..d38b72ccebbb2 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -2,10 +2,8 @@ cimport cython -from cpython cimport (PyObject, Py_INCREF, PyList_Check, PyTuple_Check, - PyMem_Malloc, PyMem_Realloc, PyMem_Free, - PyString_Check, PyBytes_Check, - PyUnicode_Check) +from cpython cimport (PyObject, Py_INCREF, + PyMem_Malloc, PyMem_Realloc, PyMem_Free) from libc.stdlib cimport malloc, free @@ -153,7 +151,7 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels): cdef: int ret = 0 Py_ssize_t i, n = len(labels) - kh_int64_t * table = kh_init_int64() + kh_int64_t *table = kh_init_int64() Int64Vector idx = Int64Vector() ndarray[int64_t, ndim=1] arr Int64VectorData *ud = idx.data diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c061102fbaddc..b321efb6d0c6f 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -5,9 +5,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # VectorData -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: @@ -86,12 +86,12 @@ cdef class {{name}}Vector: self.data.n = 0 self.data.m = _INIT_VEC_CAP self.ao = np.empty(self.data.m, dtype={{idtype}}) - self.data.data = <{{arg}}*> self.ao.data + self.data.data = <{{arg}}*>self.ao.data cdef resize(self): self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) self.ao.resize(self.data.m, refcheck=False) - self.data.data = <{{arg}}*> self.ao.data + self.data.data = <{{arg}}*>self.ao.data def __dealloc__(self): if self.data is not NULL: @@ -134,14 +134,13 @@ cdef class StringVector: bint external_view_exists def __cinit__(self): - self.data = PyMem_Malloc( - sizeof(StringVectorData)) + self.data = PyMem_Malloc(sizeof(StringVectorData)) if not self.data: raise MemoryError() self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP - self.data.data = malloc(self.data.m * sizeof(char *)) + self.data.data = malloc(self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() @@ -154,7 +153,7 @@ cdef class StringVector: 
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) orig_data = self.data.data - self.data.data = <char **> malloc(self.data.m * sizeof(char *)) + self.data.data = <char **>malloc(self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() for i in range(m): @@ -184,7 +183,7 @@ cdef class StringVector: self.data.m = self.data.n return ao - cdef inline void append(self, char * x): + cdef inline void append(self, char *x): if needs_resize(self.data): self.resize() @@ -209,22 +208,22 @@ cdef class ObjectVector: self.n = 0 self.m = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) - self.data = <PyObject**> self.ao.data + self.data = <PyObject**>self.ao.data def __len__(self): return self.n - cdef inline append(self, object o): if self.n == self.m: if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") self.m = max(self.m * 2, _INIT_VEC_CAP) self.ao.resize(self.m, refcheck=False) - self.data = <PyObject**> self.ao.data + cdef inline append(self, object obj): if self.n == self.m: if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") self.m = max(self.m * 2, _INIT_VEC_CAP) self.ao.resize(self.m, refcheck=False) + self.data = <PyObject**>self.ao.data - Py_INCREF(o) - self.data[self.n] = <PyObject*> o + Py_INCREF(obj) + self.data[self.n] = <PyObject*>obj self.n += 1 def to_array(self): @@ -283,9 +282,9 @@ cdef class {{name}}HashTable(HashTable): def sizeof(self, deep=False): """ return the size of my table in bytes """ - return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys - sizeof(Py_ssize_t) + # vals - sizeof(uint32_t)) # flags + return self.table.n_buckets * (sizeof({{dtype}}_t) +  # keys + sizeof(Py_ssize_t) +  # vals + sizeof(uint32_t))  # flags cpdef get_item(self, {{dtype}}_t val): cdef khiter_t k @@ -319,10 +318,10 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): key = keys[i] k = kh_put_{{dtype}}(self.table, key, &ret) - self.table.vals[k] = <Py_ssize_t> values[i] + self.table.vals[k] = <Py_ssize_t>values[i] @cython.boundscheck(False) - def map_locations(self, ndarray[{{dtype}}_t, ndim=1] values): + def map_locations(self, const {{dtype}}_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -591,7 +590,7 @@ cdef class StringHashTable(HashTable): cdef: Py_ssize_t i, n = len(values) ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - int64_t *resbuf = <int64_t*> labels.data + int64_t *resbuf = <int64_t*>labels.data khiter_t k kh_str_t *table = self.table const char *v @@ -679,7 +678,7 @@ cdef class StringHashTable(HashTable): for i in range(n): val = values[i] - if PyUnicode_Check(val) or PyString_Check(val): + if isinstance(val, (str, unicode)): v = util.get_c_string(val) else: v = util.get_c_string(self.na_string_sentinel) @@ -712,7 +711,7 @@ cdef class StringHashTable(HashTable): for i in range(n): val = values[i] - if PyUnicode_Check(val) or PyString_Check(val): + if isinstance(val, (str, unicode)): v = util.get_c_string(val) else: v = util.get_c_string(self.na_string_sentinel) @@ -773,7 +772,7 @@ cdef class StringHashTable(HashTable): for i in range(n): val = values[i] - if ((PyUnicode_Check(val) or PyString_Check(val)) + if (isinstance(val, (str, unicode)) and not (use_na_value and val == na_value)): v = util.get_c_string(val) vecs[i] = v diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 3d35e7014b408..70f76c7edc7d5 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -4,9 +4,9 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- #
VectorData -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- {{py: @@ -45,11 +45,11 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, val = values[i] if not checknull(val) or not dropna: - k = kh_get_{{ttype}}(table, <PyObject*> val) + k = kh_get_{{ttype}}(table, <PyObject*>val) if k != table.n_buckets: table.vals[k] += 1 else: - k = kh_put_{{ttype}}(table, <PyObject*> val, &ret) + k = kh_put_{{ttype}}(table, <PyObject*>val, &ret) table.vals[k] = 1 {{else}} with nogil: @@ -80,7 +80,7 @@ cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna): {{endif}} cdef: - Py_ssize_t i=0 + Py_ssize_t i = 0 kh_{{ttype}}_t *table {{if dtype != 'object'}} @@ -103,7 +103,7 @@ cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna): {{if dtype == 'object'}} for k in range(table.n_buckets): if kh_exist_{{ttype}}(table, k): - result_keys[i] = <{{dtype}}> table.keys[k] + result_keys[i] = <{{dtype}}>table.keys[k] result_counts[i] = table.vals[k] i += 1 {{else}} @@ -141,7 +141,7 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): {{dtype}}_t value {{endif}} Py_ssize_t k, i, n = len(values) - kh_{{ttype}}_t * table = kh_init_{{ttype}}() + kh_{{ttype}}_t *table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') kh_resize_{{ttype}}(table, min(n, _SIZE_HINT_LIMIT)) @@ -152,7 +152,7 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): if keep == 'last': {{if dtype == 'object'}} for i from n > i >= 0: - kh_put_{{ttype}}(table, <PyObject*> values[i], &ret) + kh_put_{{ttype}}(table, <PyObject*>values[i], &ret) out[i] = ret == 0 {{else}} with nogil: @@ -163,7 +163,7 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): elif keep == 'first': {{if dtype == 'object'}} for i in range(n): - kh_put_{{ttype}}(table, <PyObject*> values[i], &ret) + kh_put_{{ttype}}(table, <PyObject*>values[i], &ret) out[i] = ret == 0 {{else}} with nogil: @@ -175,13 +175,13 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): {{if dtype == 'object'}} for i in range(n): value = values[i] - k = kh_get_{{ttype}}(table, <PyObject*> value) + k = kh_get_{{ttype}}(table, <PyObject*>value) if k != table.n_buckets: out[table.vals[k]] = 1 out[i] = 1 else: - k = kh_put_{{ttype}}(table, <PyObject*> value, &ret) - table.keys[k] = <PyObject*> value + k = kh_put_{{ttype}}(table, <PyObject*>value, &ret) + table.keys[k] = <PyObject*>value table.vals[k] = i out[i] = 0 {{else}} @@ -202,9 +202,9 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): return out -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Membership -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- @cython.wraparound(False) @@ -237,7 +237,7 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): int ret = 0 ndarray[uint8_t] result {{scalar}} val - kh_{{ttype}}_t * table = kh_init_{{ttype}}() + kh_{{ttype}}_t *table = kh_init_{{ttype}}() # construct the table n = len(values) @@ -245,7 +245,7 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): {{if dtype == 'object'}} for i in range(n): - kh_put_{{ttype}}(table, <PyObject*> values[i], &ret) + kh_put_{{ttype}}(table, <PyObject*>values[i], &ret) {{else}} with nogil: for i in range(n): @@ -259,7 +259,7 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): {{if dtype == 'object'}} for
i in range(n): val = arr[i] - k = kh_get_{{ttype}}(table, <PyObject*> val) + k = kh_get_{{ttype}}(table, <PyObject*>val) result[i] = (k != table.n_buckets) {{else}} with nogil: @@ -305,17 +305,13 @@ def mode_{{dtype}}({{ctype}}[:] values, bint dropna): {{endif}} cdef: int count, max_count = 1 - int j = -1 # so you can do += + int j = -1  # so you can do += Py_ssize_t k kh_{{table_type}}_t *table - ndarray[{{ctype}}] modes + {{ctype}}[:] modes table = kh_init_{{table_type}}() - {{if dtype == 'object'}} - build_count_table_{{dtype}}(values, table, dropna) - {{else}} build_count_table_{{dtype}}(values, table, dropna) - {{endif}} modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}}) @@ -346,11 +342,13 @@ def mode_{{dtype}}({{ctype}}[:] values, bint dropna): else: continue - modes[j] = <object> table.keys[k] + modes[j] = <object>table.keys[k] {{endif}} kh_destroy_{{table_type}}(table) - return modes[:j + 1] + # Note: For reasons unknown, slicing modes.base works but modes[:j+1].base + # returns an object with an incorrect length + return modes.base[:j + 1]  # `.base` to access underlying np.ndarray {{endfor}} diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index c6afeda6a37dc..c92e0a4a7aa23 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -cimport cython -from cython cimport Py_ssize_t +import cython +from cython import Py_ssize_t import numpy as np cimport numpy as cnp diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 872d0ffff771e..e16eb20d917cf 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -195,7 +195,7 @@ def fast_unique_multiple(list arrays): @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple_list(list lists, bint sort=True): +def fast_unique_multiple_list(lists: list, sort: bint = True) -> list: cdef: list buf Py_ssize_t k = len(lists) @@ -356,6 +356,8 @@ def get_reverse_indexer(int64_t[:] indexer, Py_ssize_t length): return rev_indexer +@cython.wraparound(False) +@cython.boundscheck(False) def has_infs_f4(float32_t[:] arr) -> bint: cdef: Py_ssize_t i, n = len(arr) @@ -371,6 +373,8 @@ def has_infs_f4(float32_t[:] arr) -> bint: return False +@cython.wraparound(False) +@cython.boundscheck(False) def has_infs_f8(float64_t[:] arr) -> bint: cdef: Py_ssize_t i, n = len(arr) @@ -423,6 +427,8 @@ def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len): return slice(vstart, vlast - 1, k) +@cython.wraparound(False) +@cython.boundscheck(False) def maybe_booleans_to_slice(ndarray[uint8_t] mask): cdef: Py_ssize_t i, n = len(mask) @@ -473,6 +479,8 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bint: return True +@cython.wraparound(False) +@cython.boundscheck(False) def astype_intsafe(object[:] arr, new_dtype): cdef: Py_ssize_t i, n = len(arr) @@ -494,6 +502,8 @@ def astype_intsafe(object[:] arr, new_dtype): return result +@cython.wraparound(False) +@cython.boundscheck(False) def astype_unicode(arr: ndarray, skipna: bool=False) -> ndarray[object]: """ Convert all elements in an array to unicode. @@ -527,7 +537,9 @@ def astype_unicode(arr: ndarray, skipna: bool=False) -> ndarray[object]: return result -def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: +@cython.wraparound(False) +@cython.boundscheck(False) +def astype_str(arr: ndarray, skipna: bool = False) -> ndarray[object]: """ Convert all elements in an array to string.
@@ -560,6 +572,8 @@ def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: return result +@cython.wraparound(False) +@cython.boundscheck(False) def clean_index_list(list obj): """ Utility used in pandas.core.index.ensure_index @@ -1037,7 +1051,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(object value, bint skipna=False): +def infer_dtype(value: object, skipna: bint = False) -> bint: """ Efficiently infer the type of a passed val, or list-like array of values. Return a string describing the type. @@ -1617,6 +1631,8 @@ cpdef bint is_datetime64_array(ndarray values): return validator.validate(values) +@cython.wraparound(False) +@cython.boundscheck(False) def is_datetime_with_singletz_array(values: ndarray) -> bint: """ Check values have the same tzinfo attribute. @@ -2150,6 +2166,8 @@ def map_infer_mask(ndarray arr, object f, uint8_t[:] mask, bint convert=1): return result +@cython.wraparound(False) +@cython.boundscheck(False) def map_infer(ndarray arr, object f, bint convert=1): """ Substitute for np.vectorize with pandas-friendly dtype inference @@ -2280,6 +2298,8 @@ def to_object_array_tuples(list rows): return result +@cython.wraparound(False) +@cython.boundscheck(False) def fast_multiget(dict mapping, ndarray keys, default=np.nan): cdef: Py_ssize_t i, n = len(keys) From 43c085bc42264f4c546973fa0ffaf627e1087f7c Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 27 Oct 2018 10:36:49 -0700 Subject: [PATCH 05/10] fixup typo --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e16eb20d917cf..7e58b8211dfc3 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1051,7 +1051,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(value: object, skipna: bint = False) -> bint: +def infer_dtype(value: object, skipna: bint = False) -> str: """ Efficiently infer the type of a passed val, or list-like array of values. Return a string describing the type. 
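A note on the annotation corrected just above: as its docstring says, `infer_dtype` reports the inferred type as a string label such as 'integer' or 'string', never a boolean, which is what the new `-> str` return annotation reflects. A minimal usage sketch through the public `pandas.api.types` re-export (illustrative only, not part of the patch):

    >>> import numpy as np
    >>> from pandas.api.types import infer_dtype
    >>> infer_dtype([1, 2, 3])
    'integer'
    >>> infer_dtype(['a', np.nan], skipna=True)
    'string'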
From 99c7e276ef3900e5ef1001a0555c345a174f52df Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 27 Oct 2018 12:00:17 -0700 Subject: [PATCH 06/10] types, cleanup, let cython handle dtype dispatch --- pandas/_libs/groupby_helper.pxi.in | 15 ----- pandas/_libs/lib.pyx | 90 +++++++++++++++--------------- 2 files changed, 44 insertions(+), 61 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b7f47b5a1d69d..755e223443dee 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -735,11 +735,6 @@ def group_min(ndarray[groupby_t, ndim=2] out, out[i, j] = minx[i, j] -group_min_float64 = group_min["float64_t"] -group_min_float32 = group_min["float32_t"] -group_min_int64 = group_min["int64_t"] - - @cython.boundscheck(False) @cython.wraparound(False) def group_cummin(groupby_t[:, :] out, @@ -788,11 +783,6 @@ def group_cummin(groupby_t[:, :] out, out[i, j] = mval -group_cummin_float64 = group_cummin["float64_t"] -group_cummin_float32 = group_cummin["float32_t"] -group_cummin_int64 = group_cummin["int64_t"] - - @cython.boundscheck(False) @cython.wraparound(False) def group_cummax(groupby_t[:, :] out, @@ -838,8 +828,3 @@ def group_cummax(groupby_t[:, :] out, if val > mval: accum[lab, j] = mval = val out[i, j] = mval - - -group_cummax_float64 = group_cummax["float64_t"] -group_cummax_float32 = group_cummax["float32_t"] -group_cummax_int64 = group_cummax["int64_t"] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7e58b8211dfc3..4a9e294891039 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -263,10 +263,10 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): @cython.wraparound(False) @cython.boundscheck(False) -def dicts_to_array(list dicts, list columns): +def dicts_to_array(dicts: list, columns: list): cdef: Py_ssize_t i, j, k, n - ndarray[object, ndim=2] result + object[:, :] result dict row object col, onan = np.nan @@ -284,7 +284,7 @@ def dicts_to_array(list dicts, list columns): else: result[i, j] = onan - return result + return result.base # `.base` to access underlying np.ndarray def fast_zip(list ndarrays): @@ -343,17 +343,17 @@ def get_reverse_indexer(int64_t[:] indexer, Py_ssize_t length): cdef: Py_ssize_t i, n = len(indexer) - ndarray[int64_t] rev_indexer + int64_t[:] rev_indexer int64_t idx rev_indexer = np.empty(length, dtype=np.int64) - rev_indexer.fill(-1) + rev_indexer[:] = -1 for i in range(n): idx = indexer[i] if idx != -1: rev_indexer[idx] = i - return rev_indexer + return rev_indexer.base # `.base` to access underlying np.ndarray @cython.wraparound(False) @@ -460,7 +460,7 @@ def maybe_booleans_to_slice(ndarray[uint8_t] mask): @cython.wraparound(False) @cython.boundscheck(False) -def array_equivalent_object(left: object[:], right: object[:]) -> bint: +def array_equivalent_object(left: object[:], right: object[:]) -> bool: """ perform an element by element comparion on 1-d object arrays taking into account nan positions """ cdef: @@ -484,7 +484,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bint: def astype_intsafe(object[:] arr, new_dtype): cdef: Py_ssize_t i, n = len(arr) - object v + object val bint is_datelike ndarray result @@ -493,11 +493,11 @@ def astype_intsafe(object[:] arr, new_dtype): result = np.empty(n, dtype=new_dtype) for i in range(n): - v = arr[i] - if is_datelike and checknull(v): + val = arr[i] + if is_datelike and checknull(val): result[i] = NPY_NAT else: - result[i] = v + result[i] = val return result @@ -524,7 
+524,7 @@ def astype_unicode(arr: ndarray, skipna: bool=False) -> ndarray[object]: cdef: object arr_i Py_ssize_t i, n = arr.size - ndarray[object] result = np.empty(n, dtype=object) + object[:] result = np.empty(n, dtype=object) for i in range(n): arr_i = arr[i] @@ -534,7 +534,7 @@ def astype_unicode(arr: ndarray, skipna: bool=False) -> ndarray[object]: result[i] = arr_i - return result + return result.base # `.base` to access underlying np.ndarray @cython.wraparound(False) @@ -559,7 +559,7 @@ def astype_str(arr: ndarray, skipna: bool = False) -> ndarray[object]: cdef: object arr_i Py_ssize_t i, n = arr.size - ndarray[object] result = np.empty(n, dtype=object) + object[:] result = np.empty(n, dtype=object) for i in range(n): arr_i = arr[i] @@ -569,24 +569,24 @@ def astype_str(arr: ndarray, skipna: bool = False) -> ndarray[object]: result[i] = arr_i - return result + return result.base # `.base` to access underlying np.ndarray @cython.wraparound(False) @cython.boundscheck(False) -def clean_index_list(list obj): +def clean_index_list(obj: list): """ Utility used in pandas.core.index.ensure_index """ cdef: Py_ssize_t i, n = len(obj) - object v + object val bint all_arrays = 1 for i in range(n): - v = obj[i] - if not (isinstance(v, list) or - util.is_array(v) or hasattr(v, '_data')): + val = obj[i] + if not (isinstance(val, list) or + util.is_array(val) or hasattr(val, '_data')): all_arrays = 0 break @@ -595,11 +595,9 @@ def clean_index_list(list obj): # don't force numpy coerce with nan's inferred = infer_dtype(obj) - if inferred in ['string', 'bytes', 'unicode', - 'mixed', 'mixed-integer']: + if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']: return np.asarray(obj, dtype=object), 0 elif inferred in ['integer']: - # TODO: we infer an integer but it *could* be a uint64 try: return np.asarray(obj, dtype='int64'), 0 @@ -680,13 +678,13 @@ def generate_bins_dt64(ndarray[int64_t] values, int64_t[:] binner, @cython.boundscheck(False) @cython.wraparound(False) -def row_bool_subset(ndarray[float64_t, ndim=2] values, +def row_bool_subset(float64_t[:, :] values, ndarray[uint8_t, cast=True] mask): cdef: Py_ssize_t i, j, n, k, pos = 0 - ndarray[float64_t, ndim=2] out + float64_t[:, :] out - n, k = (<object> values).shape + n, k = (<object>values).shape assert (n == len(mask)) out = np.empty((mask.sum(), k), dtype=np.float64) @@ -697,7 +695,7 @@ def row_bool_subset(ndarray[float64_t, ndim=2] values, out[pos, j] = values[i, j] pos += 1 - return out + return out.base # `.base` to access underlying np.ndarray @cython.boundscheck(False) @@ -706,7 +704,7 @@ def row_bool_subset_object(object[:, :] values, ndarray[uint8_t, cast=True] mask): cdef: Py_ssize_t i, j, n, k, pos = 0 - ndarray[object, ndim=2] out + object[:, :] out n, k = (<object>values).shape assert (n == len(mask)) @@ -719,7 +717,7 @@ def row_bool_subset_object(object[:, :] values, out[pos, j] = values[i, j] pos += 1 - return out + return out.base # `.base` to access underlying np.ndarray @cython.boundscheck(False) @@ -846,19 +844,19 @@ def indices_fast(object index, int64_t[:] labels, list keys, # core.common import for fast inference checks -def is_float(obj: object) -> bint: +def is_float(obj: object) -> bool: return util.is_float_object(obj) -def is_integer(obj: object) -> bint: +def is_integer(obj: object) -> bool: return util.is_integer_object(obj) -def is_bool(obj: object) -> bint: +def is_bool(obj: object) -> bool: return util.is_bool_object(obj) -def is_complex(obj: object) -> bint: +def is_complex(obj: object) -> bool: return
util.is_complex_object(obj) @@ -870,7 +868,7 @@ cpdef bint is_interval(object obj): return getattr(obj, '_typ', '_typ') == 'interval' -def is_period(val: object) -> bint: +def is_period(val: object) -> bool: """ Return a boolean if this is a Period object """ return util.is_period_object(val) @@ -1352,7 +1350,7 @@ def infer_datetimelike_array(arr: object) -> object: seen_datetime = 1 elif PyDate_Check(v): seen_date = 1 - elif is_timedelta(v) or util.is_timedelta64_object(v): + elif is_timedelta(v): # timedelta, or timedelta64 seen_timedelta = 1 else: @@ -1633,7 +1631,7 @@ cpdef bint is_datetime64_array(ndarray values): @cython.wraparound(False) @cython.boundscheck(False) -def is_datetime_with_singletz_array(values: ndarray) -> bint: +def is_datetime_with_singletz_array(values: ndarray) -> bool: """ Check values have the same tzinfo attribute. Doesn't check values are datetime-like types. @@ -2138,7 +2136,7 @@ def map_infer_mask(ndarray arr, object f, uint8_t[:] mask, bint convert=1): """ cdef: Py_ssize_t i, n - ndarray[object] result + object[:] result object val n = len(arr) @@ -2163,7 +2161,7 @@ def map_infer_mask(ndarray arr, object f, uint8_t[:] mask, bint convert=1): convert_datetime=0, convert_timedelta=0) - return result + return result.base # `.base` to access underlying np.ndarray @cython.wraparound(False) @@ -2208,7 +2206,7 @@ def map_infer(ndarray arr, object f, bint convert=1): return result.base # `.base` to access underlying np.ndarray -def to_object_array(list rows, int min_width=0): +def to_object_array(rows: list, min_width: int = 0): """ Convert a list of lists into an object array. @@ -2228,7 +2226,7 @@ def to_object_array(list rows, int min_width=0): """ cdef: Py_ssize_t i, j, n, k, tmp - ndarray[object, ndim=2] result + object[:, :] result list row n = len(rows) @@ -2247,13 +2245,13 @@ def to_object_array(list rows, int min_width=0): for j in range(len(row)): result[i, j] = row[j] - return result + return result.base # `.base` to access underlying np.ndarray def tuples_to_object_array(ndarray[object] tuples): cdef: Py_ssize_t i, j, n, k, tmp - ndarray[object, ndim=2] result + object[:, :] result tuple tup n = len(tuples) @@ -2264,13 +2262,13 @@ def tuples_to_object_array(ndarray[object] tuples): for j in range(k): result[i, j] = tup[j] - return result + return result.base # `.base` to access underlying np.ndarray -def to_object_array_tuples(list rows): +def to_object_array_tuples(rows: list): cdef: Py_ssize_t i, j, n, k, tmp - ndarray[object, ndim=2] result + object[:, :] result tuple row n = len(rows) @@ -2295,7 +2293,7 @@ def to_object_array_tuples(list rows): for j in range(len(row)): result[i, j] = row[j] - return result + return result.base # `.base` to access underlying np.ndarray @cython.wraparound(False) From 0f45833905a69fdafe0e312f86074e18d98ebc9e Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 27 Oct 2018 12:05:00 -0700 Subject: [PATCH 07/10] remove unnecessary specialization --- pandas/_libs/groupby_helper.pxi.in | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 755e223443dee..70c1d8868461a 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -668,11 +668,6 @@ def group_max(ndarray[groupby_t, ndim=2] out, out[i, j] = maxx[i, j] -group_max_float64 = group_max["float64_t"] -group_max_float32 = group_max["float32_t"] -group_max_int64 = group_max["int64_t"] - - @cython.wraparound(False) @cython.boundscheck(False) def 
group_min(ndarray[groupby_t, ndim=2] out, From 79137e3bf516199a031567930b907055f3ffc23d Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sun, 28 Oct 2018 20:14:12 +0000 Subject: [PATCH 08/10] unpin openpyxl (#23361) --- ci/azure-36-locale_slow.yaml | 2 +- ci/azure-37-locale.yaml | 2 +- ci/azure-macos-35.yaml | 2 +- ci/azure-windows-27.yaml | 2 +- ci/azure-windows-36.yaml | 2 +- ci/circle-36-locale.yaml | 2 +- ci/requirements-optional-conda.txt | 2 +- ci/requirements-optional-pip.txt | 2 +- ci/travis-36-doc.yaml | 2 +- ci/travis-36-slow.yaml | 2 +- ci/travis-36.yaml | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ci/azure-36-locale_slow.yaml b/ci/azure-36-locale_slow.yaml index 14b23dd6f3e4c..7e40bd1a9979e 100644 --- a/ci/azure-36-locale_slow.yaml +++ b/ci/azure-36-locale_slow.yaml @@ -14,7 +14,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl=2.5.5 + - openpyxl - psycopg2 - pymysql - pytables diff --git a/ci/azure-37-locale.yaml b/ci/azure-37-locale.yaml index ef97b85406709..59c8818eaef1e 100644 --- a/ci/azure-37-locale.yaml +++ b/ci/azure-37-locale.yaml @@ -13,7 +13,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl=2.5.5 + - openpyxl - psycopg2 - pymysql - pytables diff --git a/ci/azure-macos-35.yaml b/ci/azure-macos-35.yaml index 6ccdc79d11b27..065deb914dae6 100644 --- a/ci/azure-macos-35.yaml +++ b/ci/azure-macos-35.yaml @@ -12,7 +12,7 @@ dependencies: - nomkl - numexpr - numpy=1.12.0 - - openpyxl=2.5.5 + - openpyxl - pytables - python=3.5* - pytz diff --git a/ci/azure-windows-27.yaml b/ci/azure-windows-27.yaml index d48a9ba986a93..dc68129a5e6d3 100644 --- a/ci/azure-windows-27.yaml +++ b/ci/azure-windows-27.yaml @@ -13,7 +13,7 @@ dependencies: - matplotlib=2.0.1 - numexpr - numpy=1.12* - - openpyxl=2.5.5 + - openpyxl - pytables - python=2.7.* - pytz diff --git a/ci/azure-windows-36.yaml b/ci/azure-windows-36.yaml index d03a6cbbd662c..979443661f99b 100644 --- a/ci/azure-windows-36.yaml +++ b/ci/azure-windows-36.yaml @@ -11,7 +11,7 @@ dependencies: - matplotlib - numexpr - numpy=1.14* - - openpyxl=2.5.5 + - openpyxl - parquet-cpp - pyarrow - pytables diff --git a/ci/circle-36-locale.yaml b/ci/circle-36-locale.yaml index ef97b85406709..59c8818eaef1e 100644 --- a/ci/circle-36-locale.yaml +++ b/ci/circle-36-locale.yaml @@ -13,7 +13,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl=2.5.5 + - openpyxl - psycopg2 - pymysql - pytables diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index e9afd7a551b6e..04abfede67163 100644 --- a/ci/requirements-optional-conda.txt +++ b/ci/requirements-optional-conda.txt @@ -12,7 +12,7 @@ lxml matplotlib>=2.0.0 nbsphinx numexpr>=2.6.1 -openpyxl=2.5.5 +openpyxl pyarrow pymysql pytables>=3.4.2 diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index ebe0c4ca88ee6..0153bdb6edf04 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -14,7 +14,7 @@ lxml matplotlib>=2.0.0 nbsphinx numexpr>=2.6.1 -openpyxl==2.5.5 +openpyxl pyarrow pymysql tables diff --git a/ci/travis-36-doc.yaml b/ci/travis-36-doc.yaml index 8353659e7b9a9..f1f64546374af 100644 --- a/ci/travis-36-doc.yaml +++ b/ci/travis-36-doc.yaml @@ -22,7 +22,7 @@ dependencies: - notebook - numexpr - numpy=1.13* - - openpyxl=2.5.5 + - openpyxl - pandoc - pyqt - pytables diff --git a/ci/travis-36-slow.yaml b/ci/travis-36-slow.yaml index 1a7bc53e1b74b..3157ecac3a902 100644 --- a/ci/travis-36-slow.yaml +++ b/ci/travis-36-slow.yaml @@ -10,7 +10,7 @@ dependencies: - 
matplotlib - numexpr - numpy - - openpyxl=2.5.5 + - openpyxl - patsy - psycopg2 - pymysql diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml index 7aa27beacf976..257f830ec6c48 100644 --- a/ci/travis-36.yaml +++ b/ci/travis-36.yaml @@ -21,7 +21,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl=2.5.5 + - openpyxl - psycopg2 - pyarrow - pymysql From beb404843f7e94170b599fa6e67b933429436e0a Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 1 Nov 2018 08:38:46 -0700 Subject: [PATCH 09/10] use np.asarray, explicitly require not-none --- pandas/_libs/algos.pyx | 26 +++++++++++++++++ pandas/_libs/groupby.pyx | 15 ++++++++-- pandas/_libs/groupby_helper.pxi.in | 7 +++++ pandas/_libs/hashtable_class_helper.pxi.in | 2 ++ pandas/_libs/lib.pyx | 34 ++++++++++------------ pandas/_libs/tslibs/util.pxd | 9 ++++++ pandas/_libs/window.pyx | 2 ++ 7 files changed, 74 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index bcd0eab1dfc5e..2b60b0dc01d19 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -100,6 +100,8 @@ cpdef ndarray[int64_t, ndim=1] unique_deltas(int64_t[:] arr): int ret = 0 list uniques = [] + util.require_not_none(arr) + table = kh_init_int64() kh_resize_int64(table, 10) for i in range(n - 1): @@ -173,6 +175,8 @@ def groupsort_indexer(int64_t[:] index, Py_ssize_t ngroups): Py_ssize_t i, loc, label, n ndarray[int64_t] counts, where, result + util.require_not_none(index) + counts = np.zeros(ngroups + 1, dtype=np.int64) n = len(index) result = np.zeros(n, dtype=np.int64) @@ -389,6 +393,8 @@ cpdef map_indices(algos_t[:] index): Py_ssize_t i, length dict result = {} + util.require_not_none(index) + length = len(index) for i in range(length): @@ -406,6 +412,9 @@ def pad(algos_t[:] old, algos_t[:] new, limit=None): algos_t cur, next int lim, fill_count = 0 + util.require_not_none(old) + util.require_not_none(new) + nleft = len(old) nright = len(new) indexer = np.empty(nright, dtype=np.int64) @@ -483,6 +492,9 @@ def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): algos_t val int lim, fill_count = 0 + util.require_not_none(values) + util.require_not_none(mask) + N = len(values) # GH#2778 @@ -527,6 +539,9 @@ def pad_2d_inplace(algos_t[:, :] values, uint8_t[:, :] mask, limit=None): algos_t val int lim, fill_count = 0 + util.require_not_none(values) + util.require_not_none(mask) + K, N = (<object> values).shape # GH#2778 @@ -600,6 +615,9 @@ def backfill(algos_t[:] old, algos_t[:] new, limit=None): algos_t cur, prev int lim, fill_count = 0 + util.require_not_none(old) + util.require_not_none(new) + nleft = len(old) nright = len(new) indexer = np.empty(nright, dtype=np.int64) @@ -678,6 +696,9 @@ def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): algos_t val int lim, fill_count = 0 + util.require_not_none(values) + util.require_not_none(mask) + N = len(values) # GH#2778 @@ -722,6 +743,9 @@ def backfill_2d_inplace(algos_t[:, :] values, uint8_t[:, :] mask, limit=None): algos_t val int lim, fill_count = 0 + util.require_not_none(values) + util.require_not_none(mask) + K, N = (<object> values).shape # GH#2778 @@ -770,6 +794,8 @@ def arrmap(algos_t[:] index, object func): from pandas._libs.lib import maybe_convert_objects + util.require_not_none(index) + for i in range(length): result[i] = func(index[i]) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index aed884e3aa010..45d9c3f111b5d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -14,7 +14,7 @@ from numpy cimport (ndarray,
cnp.import_array() -from util cimport numeric, get_nat +from util cimport numeric, get_nat, require_not_none from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE) @@ -113,6 +113,9 @@ def group_median_float64(float64_t[:, :] out, float64_t* ptr assert min_count == -1, "'min_count' only used in add and prod" + require_not_none(counts) + require_not_none(out) + require_not_none(labels) ngroups = len(counts) N, K = (<object> values).shape @@ -299,8 +302,9 @@ def group_fillna_indexer(int64_t[:] out, ndarray[int64_t] labels, # Make sure all arrays are the same size assert N == len(labels) == len(mask) - sorted_labels = np.argsort(labels, kind='mergesort').astype(np.int64, - copy=False) + sorted_labels = np.argsort(labels, kind='mergesort').astype( + np.int64, copy=False) + if direction == 'bfill': sorted_labels = sorted_labels[::-1] @@ -357,6 +361,11 @@ def group_any_all(uint8_t[:] out, int64_t lab uint8_t flag_val + require_not_none(out) + require_not_none(labels) + require_not_none(values) + require_not_none(mask) + if val_test == 'all': # Because the 'all' value of an empty iterable in Python is True we can # start with an array full of ones and set to zero when a False value diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 755e223443dee..b626d4cf0faf1 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -42,6 +42,8 @@ def group_add_{{name}}(ndarray[{{c_type}}, ndim=2] out, {{c_type}} val, count {{c_type}}[:, :] sumx, nobs + require_not_none(counts) + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -89,6 +91,8 @@ def group_prod_{{name}}(ndarray[{{c_type}}, ndim=2] out, {{c_type}} val, count {{c_type}}[:, :] prodx, nobs + require_not_none(counts) + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -133,6 +137,7 @@ def group_var_{{name}}(ndarray[{{c_type}}, ndim=2] out, {{c_type}} val, ct, oldmean {{c_type}}[:, :] nobs, mean + require_not_none(counts) assert min_count == -1, "'min_count' only used in add and prod" if not len(values) == len(labels): @@ -185,6 +190,7 @@ def group_mean_{{name}}(ndarray[{{c_type}}, ndim=2] out, {{c_type}} val, count {{c_type}}[:, :] sumx, nobs + require_not_none(counts) assert min_count == -1, "'min_count' only used in add and prod" if not len(values) == len(labels): @@ -233,6 +239,7 @@ def group_ohlc_{{name}}({{c_type}}[:, :] out, {{c_type}} val, count Py_ssize_t ngroups = len(counts) + require_not_none(counts) assert min_count == -1, "'min_count' only used in add and prod" if len(labels) == 0: diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b321efb6d0c6f..0342ec16d9841 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -328,6 +328,8 @@ cdef class {{name}}HashTable(HashTable): {{dtype}}_t val khiter_t k + util.require_not_none(values) + with nogil: for i in range(n): val = values[i] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4a9e294891039..76a890da8e91f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -284,7 +284,7 @@ def dicts_to_array(dicts: list, columns: list): else: result[i, j] = onan - return result.base # `.base` to access underlying np.ndarray + return np.asarray(result) def fast_zip(list ndarrays): @@ -346,6 +346,8 @@ def get_reverse_indexer(int64_t[:] indexer, Py_ssize_t length): int64_t[:]
rev_indexer int64_t idx + util.require_not_none(indexer) + rev_indexer = np.empty(length, dtype=np.int64) rev_indexer[:] = -1 for i in range(n): @@ -353,7 +355,7 @@ def get_reverse_indexer(int64_t[:] indexer, Py_ssize_t length): if idx != -1: rev_indexer[idx] = i - return rev_indexer.base # `.base` to access underlying np.ndarray + return np.asarray(rev_indexer) @cython.wraparound(False) @@ -534,7 +536,7 @@ def astype_unicode(arr: ndarray, skipna: bool=False) -> ndarray[object]: result[i] = arr_i - return result.base # `.base` to access underlying np.ndarray + return np.asarray(result) @cython.wraparound(False) @@ -569,7 +571,7 @@ def astype_str(arr: ndarray, skipna: bool = False) -> ndarray[object]: result[i] = arr_i - return result.base # `.base` to access underlying np.ndarray + return np.asarray(result) @cython.wraparound(False) @@ -695,7 +697,7 @@ def row_bool_subset(float64_t[:, :] values, out[pos, j] = values[i, j] pos += 1 - return out.base # `.base` to access underlying np.ndarray + return np.asarray(out) @cython.boundscheck(False) @@ -717,7 +719,7 @@ def row_bool_subset_object(object[:, :] values, out[pos, j] = values[i, j] pos += 1 - return out.base # `.base` to access underlying np.ndarray + return np.asarray(out) @cython.boundscheck(False) @@ -1939,11 +1941,7 @@ def maybe_convert_objects(object[:] objects, bint try_float=0, object val float64_t fval, fnan - if objects is None: - # Without explicitly raising, groupby.ops _aggregate_series_pure_python - # can pass None and incorrectly raise an AttributeError when trying - # to access `objects.base` below. - raise TypeError + util.require_not_none(objects) n = len(objects) @@ -2053,7 +2051,7 @@ def maybe_convert_objects(object[:] objects, bint try_float=0, if seen.datetimetz_: if len({getattr(val, 'tzinfo', None) for val in objects}) == 1: from pandas import DatetimeIndex - return DatetimeIndex(objects.base) + return DatetimeIndex(np.asarray(objects)) seen.object_ = 1 if not seen.object_: @@ -2118,7 +2116,7 @@ def maybe_convert_objects(object[:] objects, bint try_float=0, elif seen.is_bool: return bools.view(np.bool_) - return objects.base # `.base` to access underlying np.ndarray + return np.asarray(objects) def map_infer_mask(ndarray arr, object f, uint8_t[:] mask, bint convert=1): @@ -2161,7 +2159,7 @@ def map_infer_mask(ndarray arr, object f, uint8_t[:] mask, bint convert=1): convert_datetime=0, convert_timedelta=0) - return result.base # `.base` to access underlying np.ndarray + return np.asarray(result) @cython.wraparound(False) @@ -2203,7 +2201,7 @@ def map_infer(ndarray arr, object f, bint convert=1): convert_datetime=0, convert_timedelta=0) - return result.base # `.base` to access underlying np.ndarray + return np.asarray(result) def to_object_array(rows: list, min_width: int = 0): @@ -2245,7 +2243,7 @@ def to_object_array(rows: list, min_width: int = 0): for j in range(len(row)): result[i, j] = row[j] - return result.base # `.base` to access underlying np.ndarray + return np.asarray(result) def tuples_to_object_array(ndarray[object] tuples): @@ -2262,7 +2260,7 @@ def tuples_to_object_array(ndarray[object] tuples): for j in range(k): result[i, j] = tup[j] - return result.base # `.base` to access underlying np.ndarray + return np.asarray(result) def to_object_array_tuples(rows: list): @@ -2293,7 +2291,7 @@ def to_object_array_tuples(rows: list): for j in range(len(row)): result[i, j] = row[j] - return result.base # `.base` to access underlying np.ndarray + return np.asarray(result) @cython.wraparound(False) diff --git 
a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 0ba61fcc58f46..6309cb1fcf8b2 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -51,6 +51,15 @@ cdef inline int import_array() except -1: _import_array() +cdef inline require_not_none(obj): + """ + Functions accepting cython memoryviews will also accept None. In order to + avoid silently returning incorrect answers, we explicitly check for None. + """ + if obj is None: + raise ValueError("An array or memoryview is required, not None.") + + # -------------------------------------------------------------------- # Type Checking diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 770e0d2ef0f09..c3b8673823975 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -249,6 +249,8 @@ cdef class VariableWindowIndexer(WindowIndexer): int64_t start_bound, end_bound, N Py_ssize_t i, j + util.require_not_none(index) + start = self.start end = self.end N = self.N From 75cfc6cf0370b58e0e2e33b244febdbe04f9f634 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 1 Nov 2018 10:31:02 -0700 Subject: [PATCH 10/10] fix ValueError-->TypeError --- pandas/_libs/tslibs/util.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 6309cb1fcf8b2..8cb313513f7b7 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -57,7 +57,7 @@ cdef inline require_not_none(obj): avoid silently returning incorrect answers, we explicitly check for None. """ if obj is None: - raise ValueError("An array or memoryview is required, not None.") + raise TypeError("An array or memoryview is required, not None.") # --------------------------------------------------------------------
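A closing note on the helper this series ends with: a typed-memoryview argument in Cython accepts None by default, so without a guard the failure surfaces only at the first `arr[...]` access, as an opaque error deep inside the function. A standalone sketch contrasting the explicit check used here with Cython's built-in `not None` clause (hypothetical demo module, not pandas code):

    # demo.pyx -- illustrative only
    from libc.stdint cimport int64_t

    def first_checked(int64_t[:] arr):
        # Mirrors require_not_none: fail fast with a clear TypeError
        # instead of an opaque error at the first element access.
        if arr is None:
            raise TypeError("An array or memoryview is required, not None.")
        return arr[0]

    def first_strict(int64_t[:] arr not None):
        # `not None` makes Cython itself raise TypeError at call time,
        # before the function body runs.
        return arr[0]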