From 19dfc0b09d7b47522037c0d0da82c2e894633672 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 17:49:29 -0700 Subject: [PATCH 1/3] CLN: cython for loops, take_helper params --- pandas/_libs/algos_common_helper.pxi.in | 5 +- pandas/_libs/algos_take_helper.pxi.in | 56 +++++++++++++---------- pandas/_libs/hashtable_func_helper.pxi.in | 6 ++- pandas/_libs/join.pyx | 24 ++++------ pandas/_libs/sparse_op_helper.pxi.in | 2 +- pandas/_libs/window.pyx | 8 ++-- 6 files changed, 53 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 91599fa223b57..eb6d689899073 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -17,12 +17,11 @@ dtypes = [('float64', 'float64_t', 'float64_t'), def get_dispatch(dtypes): for name, c_type, dest_type, in dtypes: - dest_name = dest_type[:-2] # i.e. strip "_t" - yield name, c_type, dest_type, dest_name + yield name, c_type, dest_type }} -{{for name, c_type, dest_type, dest_name +{{for name, c_type, dest_type in get_dispatch(dtypes)}} diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index e7ee212065c5b..7e1713880fb8a 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -10,28 +10,28 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil +# c_type_in, c_type_out, preval, postval dtypes = [ - ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True), - ('bool', 'object', 'uint8_t', 'object', - 'True if ', ' > 0 else False', False), - ('int8', 'int8', 'int8_t', 'int8_t', '', '', True), - ('int8', 'int32', 'int8_t', 'int32_t', '', '', False), - ('int8', 'int64', 'int8_t', 'int64_t', '', '', False), - ('int8', 'float64', 'int8_t', 'float64_t', '', '', False), - ('int16', 'int16', 'int16_t', 'int16_t', '', '', True), - ('int16', 'int32', 'int16_t', 'int32_t', '', '', False), - ('int16', 'int64', 'int16_t', 'int64_t', '', '', False), - ('int16', 'float64', 'int16_t', 'float64_t', '', '', False), - ('int32', 'int32', 'int32_t', 'int32_t', '', '', True), - ('int32', 'int64', 'int32_t', 'int64_t', '', '', False), - ('int32', 'float64', 'int32_t', 'float64_t', '', '', False), - ('int64', 'int64', 'int64_t', 'int64_t', '', '', True), - ('int64', 'float64', 'int64_t', 'float64_t', '', '', False), - ('float32', 'float32', 'float32_t', 'float32_t', '', '', True), - ('float32', 'float64', 'float32_t', 'float64_t', '', '', False), - ('float64', 'float64', 'float64_t', 'float64_t', '', '', True), - ('object', 'object', 'object', 'object', '', '', False)] + ('uint8_t', 'uint8_t', '', ''), + ('uint8_t', 'object', 'True if ', ' > 0 else False'), + ('int8_t', 'int8_t', '', ''), + ('int8_t', 'int32_t', '', ''), + ('int8_t', 'int64_t', '', ''), + ('int8_t', 'float64_t', '', ''), + ('int16_t', 'int16_t', '', ''), + ('int16_t', 'int32_t', '', ''), + ('int16_t', 'int64_t', '', ''), + ('int16_t', 'float64_t', '', ''), + ('int32_t', 'int32_t', '', ''), + ('int32_t', 'int64_t', '', ''), + ('int32_t', 'float64_t', '', ''), + ('int64_t', 'int64_t', '', ''), + ('int64_t', 'float64_t', '', ''), + ('float32_t', 'float32_t', '', ''), + ('float32_t', 'float64_t', '', ''), + ('float64_t', 'float64_t', '', ''), + ('object', 'object', '', ''), +] def get_dispatch(dtypes): @@ -117,9 +117,9 @@ def get_dispatch(dtypes): out[i, j] = %(preval)svalues[i, idx]%(postval)s """ - for (name, dest, c_type_in, c_type_out, preval, postval, - can_copy) in dtypes: + for (c_type_in, c_type_out, preval, postval) in dtypes: + can_copy = ctype_in == c_type_out != "object" nogil = c_type_out != "object" if nogil: nogil_str = "with nogil:" @@ -128,6 +128,16 @@ def get_dispatch(dtypes): nogil_str = '' tab = '' + def get_name(dtype_name): + if dtype_name == "object": + return "object" + if dtype_name == "uint8_t": + return "bool" + return dtype_name[:-2] + + name = get_name(c_type_in) + dest = get_name(c_type_out) + args = dict(name=name, dest=dest, c_type_in=c_type_in, c_type_out=c_type_out, preval=preval, postval=postval, can_copy=can_copy, nogil_str=nogil_str, tab=tab) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index e400ec0e608f0..f6af93f85bd5a 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -151,12 +151,14 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): if keep == 'last': {{if dtype == 'object'}} - for i from n > i >= 0: + for i in range(n - 1, -1, -1): + # equivalent: range(n)[::-1], which cython doesnt like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{else}} with nogil: - for i from n > i >= 0: + for i in range(n - 1, -1, -1): + # equivalent: range(n)[::-1], which cython doesnt like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{endif}} diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 238bfd0be0aa7..caf730389008a 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -13,6 +13,7 @@ from pandas._libs.algos import ( ) +@cython.boundscheck(False) def inner_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups): cdef: @@ -20,6 +21,8 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, ndarray[int64_t] left_count, right_count, left_sorter, right_sorter ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t offset # NA group in location 0 @@ -34,11 +37,6 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, if rc > 0 and lc > 0: count += lc * rc - # group 0 is the NA group - cdef: - Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 - Py_ssize_t offset - # exclude the NA group left_pos = left_count[0] right_pos = right_count[0] @@ -64,6 +62,7 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, _get_result_indexer(right_sorter, right_indexer)) +@cython.boundscheck(False) def left_outer_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups, sort=True): cdef: @@ -72,6 +71,8 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, ndarray rev ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t offset # NA group in location 0 @@ -85,11 +86,6 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, else: count += left_count[i] - # group 0 is the NA group - cdef: - Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 - Py_ssize_t offset - # exclude the NA group left_pos = left_count[0] right_pos = right_count[0] @@ -137,6 +133,7 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, return left_indexer, right_indexer +@cython.boundscheck(False) def full_outer_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups): cdef: @@ -144,6 +141,8 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, ndarray[int64_t] left_count, right_count, left_sorter, right_sorter ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc + int64_t left_pos = 0, right_pos = 0 + Py_ssize_t offset, position = 0 # NA group in location 0 @@ -160,11 +159,6 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, else: count += lc + rc - # group 0 is the NA group - cdef: - int64_t left_pos = 0, right_pos = 0 - Py_ssize_t offset, position = 0 - # exclude the NA group left_pos = left_count[0] right_pos = right_count[0] diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in index 5949a3fd0ed81..62ea477167b72 100644 --- a/pandas/_libs/sparse_op_helper.pxi.in +++ b/pandas/_libs/sparse_op_helper.pxi.in @@ -86,7 +86,7 @@ def get_op(tup): 'and': '{0} & {1}', # logical op 'or': '{0} | {1}'} - return ops_dict[opname].format(lval, rval, dtype) + return ops_dict[opname].format(lval, rval) def get_dispatch(dtypes): diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 8de593ce36c86..a2096d389823f 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1296,7 +1296,7 @@ cdef _roll_min_max_variable(ndarray[numeric] values, # The original impl didn't deal with variable window sizes # So the code was optimized for that - for i from starti[0] <= i < endi[0]: + for i in range(starti[0], endi[0]): ai = init_mm(values[i], &nobs, is_max) # Discard previous entries if we find new min or max @@ -1644,7 +1644,7 @@ def roll_generic(object obj, else: # truncated windows at the beginning, through first full-length window - for i from 0 <= i < (int_min(win, N) - offset): + for i in range((int_min(win, N) - offset)): if counts[i] >= minp: output[i] = func(arr[0: (i + offset + 1)], *args, **kwargs) else: @@ -1654,7 +1654,7 @@ def roll_generic(object obj, buf = arr.data bufarr = np.empty(win, dtype=float) oldbuf = bufarr.data - for i from (win - offset) <= i < (N - offset): + for i in range((win - offset), (N - offset)): buf = buf + 1 bufarr.data = buf if counts[i] >= minp: @@ -1664,7 +1664,7 @@ def roll_generic(object obj, bufarr.data = oldbuf # truncated windows at the end - for i from int_max(N - offset, 0) <= i < N: + for i in range(int_max(N - offset, 0), N): if counts[i] >= minp: output[i] = func(arr[int_max(i + offset - win + 1, 0): N], *args, From 04340438dee0cec51880aea1be16dfc2b78b2e6a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 19:26:46 -0700 Subject: [PATCH 2/3] remove _take_2d specialized --- pandas/_libs/algos_take_helper.pxi.in | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 7e1713880fb8a..bd5a488722f6d 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -119,7 +119,7 @@ def get_dispatch(dtypes): for (c_type_in, c_type_out, preval, postval) in dtypes: - can_copy = ctype_in == c_type_out != "object" + can_copy = c_type_in == c_type_out != "object" nogil = c_type_out != "object" if nogil: nogil_str = "with nogil:" @@ -301,9 +301,3 @@ cdef _take_2d(ndarray[take_t, ndim=2] values, object idx): for j in range(K): result[i, j] = values[i, indexer[i, j]] return result - - -_take_2d_object = _take_2d[object] -_take_2d_float64 = _take_2d[float64_t] -_take_2d_int64 = _take_2d[int64_t] -_take_2d_uint64 = _take_2d[uint64_t] From edbd691561154fd9fb084699c6d4a66eb78f14f8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 20:41:38 -0700 Subject: [PATCH 3/3] docstring fixups --- pandas/core/arrays/base.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/core/indexes/base.py | 5 ++--- pandas/core/indexes/datetimes.py | 3 +-- pandas/core/indexes/timedeltas.py | 4 +--- pandas/core/missing.py | 2 +- pandas/core/ops/docstrings.py | 4 ++-- pandas/core/strings.py | 2 +- pandas/io/pytables.py | 2 +- pandas/tests/plotting/common.py | 4 ++-- 11 files changed, 14 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 53755695c97e3..08901df963f20 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1104,7 +1104,7 @@ def _create_method(cls, op, coerce_to_dtype=True): ---------- op : function An operator that takes arguments op(a, b) - coerce_to_dtype : bool, default True + coerce_to_dtype : bool, default True boolean indicating whether to attempt to convert the result to the underlying ExtensionArray dtype. If it's not possible to create a new ExtensionArray with the diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e3e59639de56b..a300748ee5bc8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2481,7 +2481,7 @@ def to_hdf(self, path_or_buf, key, **kwargs): like searching / selecting subsets of the data. append : bool, default False For Table formats, append the input data to the existing. - data_columns : list of columns or True, optional + data_columns : list of columns or True, optional List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See :ref:`io.hdf5-query-data-columns`. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b27d5bb05ee8f..f622480cfe4b7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2080,7 +2080,7 @@ def rank( * dense: like 'min', but rank always increases by 1 between groups ascending : bool, default True False for ranks by high (1) to low (N). - na_option : {'keep', 'top', 'bottom'}, default 'keep' + na_option : {'keep', 'top', 'bottom'}, default 'keep' * keep: leave NA values where they are * top: smallest rank if ascending * bottom: smallest rank if descending diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1a08609ccd99a..9d6487f7a8ae4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2031,7 +2031,7 @@ def fillna(self, value=None, downcast=None): Parameters ---------- - how : {'any', 'all'}, default 'any' + how : {'any', 'all'}, default 'any' If the Index is a MultiIndex, drop the value when any or all levels are NaN. @@ -5016,12 +5016,11 @@ def _validate_indexer(self, form, key, kind): Returns ------- - label : object + label : object Notes ----- Value of `side` parameter should be validated in caller. - """ @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 477525d7ab272..49c11c5505d00 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1079,12 +1079,11 @@ def _maybe_cast_slice_bound(self, label, side, kind): Returns ------- - label : object + label : object Notes ----- Value of `side` parameter should be validated in caller. - """ assert kind in ["ix", "loc", "getitem", None] diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 62a74fefa6577..c404e205e603c 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -550,7 +550,6 @@ def _maybe_cast_slice_bound(self, label, side, kind): """ If label is a string, cast it to timedelta according to resolution. - Parameters ---------- label : object @@ -559,8 +558,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): Returns ------- - label : object - + label : object """ assert kind in ["ix", "loc", "getitem", None] diff --git a/pandas/core/missing.py b/pandas/core/missing.py index bc81fbb7e1ce0..f2655c126b9e5 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -420,7 +420,7 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): ---------- xi : array_like A sorted list of x-coordinates, of length N. - yi : array_like + yi : array_like A 1-D array of real values. `yi`'s length along the interpolation axis must be equal to the length of `xi`. If N-D array, use axis parameter to select correct axis. diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 93f197366cf32..5d3f9cd92aa1a 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -387,7 +387,7 @@ def _make_flex_doc(op_name, typ): ---------- other : scalar, sequence, Series, or DataFrame Any single or multiple element data structure, or list-like object. -axis : {{0 or 'index', 1 or 'columns'}} +axis : {{0 or 'index', 1 or 'columns'}} Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). For Series input, axis to match Series index on. level : int or label @@ -541,7 +541,7 @@ def _make_flex_doc(op_name, typ): ---------- other : scalar, sequence, Series, or DataFrame Any single or multiple element data structure, or list-like object. -axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' +axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). level : int or label diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2f2e7234999f2..e50da168af4d2 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1343,7 +1343,7 @@ def str_pad(arr, width, side="left", fillchar=" "): character. Equivalent to ``Series.str.pad(side='right')``. Series.str.center : Fills boths sides of strings with an arbitrary character. Equivalent to ``Series.str.pad(side='both')``. - Series.str.zfill : Pad strings in the Series/Index by prepending '0' + Series.str.zfill : Pad strings in the Series/Index by prepending '0' character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. Examples diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c87cad5472bd9..94f863d8970f1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1027,7 +1027,7 @@ def append( / selecting subsets of the data append : bool, default True Append the input data to the existing. - data_columns : list of columns, or True, default None + data_columns : list of columns, or True, default None List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See `here diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 65d0c3d9fb17d..f0ba5f14d59c6 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -311,7 +311,7 @@ def _check_ax_scales(self, axes, xaxis="linear", yaxis="linear"): axes : matplotlib Axes object, or its list-like xaxis : {'linear', 'log'} expected xaxis scale - yaxis : {'linear', 'log'} + yaxis : {'linear', 'log'} expected yaxis scale """ axes = self._flatten_visible(axes) @@ -329,7 +329,7 @@ def _check_axes_shape(self, axes, axes_num=None, layout=None, figsize=None): axes_num : number expected number of axes. Unnecessary axes should be set to invisible. - layout : tuple + layout : tuple expected layout, (expected number of rows , columns) figsize : tuple expected figsize. default is matplotlib default