From e3e763d46a6a08949d71ff98595863753806372e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 4 Nov 2019 15:25:44 -0800 Subject: [PATCH 1/2] CLN: assorted cleanups --- pandas/_libs/sparse.pyx | 3 -- pandas/core/groupby/generic.py | 4 --- pandas/core/groupby/groupby.py | 51 ++++++++++++--------------- pandas/core/groupby/ops.py | 16 +++------ pandas/core/indexes/base.py | 5 ++- pandas/core/indexes/multi.py | 8 +++-- pandas/core/internals/construction.py | 1 + pandas/core/reshape/concat.py | 2 +- pandas/core/reshape/merge.py | 4 +-- pandas/core/reshape/pivot.py | 4 ++- pandas/core/reshape/reshape.py | 4 +-- 11 files changed, 43 insertions(+), 59 deletions(-) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 4906e45c884e9..3aa1f91c49f28 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -746,9 +746,6 @@ cdef class BlockUnion(BlockMerge): nend = xend[xi] - # print 'here xi=%d, yi=%d, mode=%d, nend=%d' % (self.xi, self.yi, - # mode, nend) - # done with y? if yi == ynblocks: self._set_current_indices(xi + 1, yi, mode) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 009e83b861523..1e38dde2096ba 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1124,10 +1124,6 @@ def _decide_output_index(self, output, labels): output_keys = labels else: output_keys = sorted(output) - try: - output_keys.sort() - except TypeError: - pass if isinstance(labels, MultiIndex): output_keys = MultiIndex.from_tuples(output_keys, names=labels.names) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 642b1e93a057a..59b118431cfc9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1092,9 +1092,8 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: return self._get_cythonized_result( "group_any_all", - self.grouper, aggregate=True, - cython_dtype=np.uint8, + cython_dtype=np.dtype(np.uint8), needs_values=True, needs_mask=True, pre_processing=objs_to_bool, @@ -1305,7 +1304,7 @@ def size(self): result = self.grouper.size() if isinstance(self.obj, Series): - result.name = getattr(self.obj, "name", None) + result.name = self.obj.name return result @classmethod @@ -1586,9 +1585,8 @@ def _fill(self, direction, limit=None): return self._get_cythonized_result( "group_fillna_indexer", - self.grouper, needs_mask=True, - cython_dtype=np.int64, + cython_dtype=np.dtype(np.int64), result_is_index=True, direction=direction, limit=limit, @@ -1882,11 +1880,10 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: if is_scalar(q): return self._get_cythonized_result( "group_quantile", - self.grouper, aggregate=True, needs_values=True, needs_mask=True, - cython_dtype=np.float64, + cython_dtype=np.dtype(np.float64), pre_processing=pre_processor, post_processing=post_processor, q=q, @@ -1896,11 +1893,10 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: results = [ self._get_cythonized_result( "group_quantile", - self.grouper, aggregate=True, needs_values=True, needs_mask=True, - cython_dtype=np.float64, + cython_dtype=np.dtype(np.float64), pre_processing=pre_processor, post_processing=post_processor, q=qi, @@ -2167,14 +2163,13 @@ def cummax(self, axis=0, **kwargs): def _get_cythonized_result( self, - how, - grouper, - aggregate=False, - cython_dtype=None, - needs_values=False, - needs_mask=False, - needs_ngroups=False, - result_is_index=False, + how: str, + cython_dtype: np.dtype, + aggregate: bool = False, + needs_values: bool = False, + needs_mask: bool = False, + needs_ngroups: bool = False, + result_is_index: bool = False, pre_processing=None, post_processing=None, **kwargs @@ -2185,13 +2180,11 @@ def _get_cythonized_result( Parameters ---------- how : str, Cythonized function name to be called - grouper : Grouper object containing pertinent group info + cython_dtype : np.dtype + Type of the array that will be modified by the Cython call. aggregate : bool, default False Whether the result should be aggregated to match the number of groups - cython_dtype : default None - Type of the array that will be modified by the Cython call. If - `None`, the type will be inferred from the values of each slice needs_values : bool, default False Whether the values should be a part of the Cython call signature @@ -2234,8 +2227,10 @@ def _get_cythonized_result( "Cannot use 'pre_processing' without specifying 'needs_values'!" ) + grouper = self.grouper + labels, _, ngroups = grouper.group_info - output = collections.OrderedDict() + output = collections.OrderedDict() # type: dict base_func = getattr(libgroupby, how) for name, obj in self._iterate_slices(): @@ -2246,9 +2241,6 @@ def _get_cythonized_result( else: result_sz = len(values) - if not cython_dtype: - cython_dtype = values.dtype - result = np.zeros(result_sz, dtype=cython_dtype) func = partial(base_func, result, labels) inferences = None @@ -2308,8 +2300,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): return self._get_cythonized_result( "group_shift_indexer", - self.grouper, - cython_dtype=np.int64, + cython_dtype=np.dtype(np.int64), needs_ngroups=True, result_is_index=True, periods=periods, @@ -2478,11 +2469,13 @@ def _reindex_output(self, output): @Appender(GroupBy.__doc__) -def groupby(obj, by, **kwds): +def groupby(obj: NDFrame, by, **kwds): if isinstance(obj, Series): from pandas.core.groupby.generic import SeriesGroupBy - klass = SeriesGroupBy + klass = ( + SeriesGroupBy + ) # type: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]] elif isinstance(obj, DataFrame): from pandas.core.groupby.generic import DataFrameGroupBy diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7918e463c73ac..9bbe73c1851b5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -592,13 +592,10 @@ def agg_series(self, obj, func): return self._aggregate_series_pure_python(obj, func) def _aggregate_series_fast(self, obj, func): + # At this point we have already checked that obj.index is not a MultiIndex + # and that obj is backed by an ndarray, not ExtensionArray func = self._is_builtin_func(func) - # TODO: pre-empt this, also pre-empt get_result raising TypError if we pass a EA - # for EAs backed by ndarray we may have a performant workaround - if obj.index._has_complex_internals: - raise TypeError("Incompatible index for Cython grouper") - group_index, _, ngroups = self.group_info # avoids object / Series creation overhead @@ -842,15 +839,12 @@ def __iter__(self): def _get_sorted_data(self): return self.data.take(self.sort_idx, axis=self.axis) - def _chop(self, sdata, slice_obj): - raise AbstractMethodError(self) - - def apply(self, f): + def _chop(self, sdata, slice_obj: slice): raise AbstractMethodError(self) class SeriesSplitter(DataSplitter): - def _chop(self, sdata, slice_obj): + def _chop(self, sdata, slice_obj: slice): return sdata._get_values(slice_obj) @@ -862,7 +856,7 @@ def fast_apply(self, f, names): sdata = self._get_sorted_data() return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) - def _chop(self, sdata, slice_obj): + def _chop(self, sdata, slice_obj: slice): if self.axis == 0: return sdata.iloc[slice_obj] else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5751ce6ea730e..c9697c530628a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4747,10 +4747,9 @@ def get_indexer_for(self, target, **kwargs): def _maybe_promote(self, other): # A hack, but it works - from pandas import DatetimeIndex - if self.inferred_type == "date" and isinstance(other, DatetimeIndex): - return DatetimeIndex(self), other + if self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex): + return type(other)(self), other elif self.inferred_type == "boolean": if not is_object_dtype(self.dtype): return self.astype("object"), other.astype("object") diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index caaf55546189c..730c9af0149f5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2172,14 +2172,16 @@ def drop(self, codes, level=None, errors="raise"): if level is not None: return self._drop_from_level(codes, level) + if not isinstance(codes, (np.ndarray, Index)): + codes = com.index_labels_to_array(codes) try: - if not isinstance(codes, (np.ndarray, Index)): - codes = com.index_labels_to_array(codes) indexer = self.get_indexer(codes) mask = indexer == -1 if mask.any(): if errors != "ignore": - raise ValueError("codes %s not contained in axis" % codes[mask]) + raise ValueError( + "codes {codes} not contained in axis".format(codes=codes[mask]) + ) except Exception: pass diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 4a8216cc73264..05a2803b3fc2f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -167,6 +167,7 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): try: values = values.astype(dtype) except Exception as orig: + # e.g. ValueError when trying to cast object dtype to float64 raise ValueError( "failed to cast to '{dtype}' (Exception " "was: {orig})".format(dtype=dtype, orig=orig) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index c11915c00c59d..39e00047ea968 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -478,7 +478,7 @@ def get_result(self): self, method="concat" ) - def _get_result_dim(self): + def _get_result_dim(self) -> int: if self._is_series and self.axis == 1: return 2 else: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9845c570ca704..6ef13a62ee366 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1948,13 +1948,13 @@ def _get_join_keys(llab, rlab, shape, sort): return _get_join_keys(llab, rlab, shape, sort) -def _should_fill(lname, rname): +def _should_fill(lname, rname) -> bool: if not isinstance(lname, str) or not isinstance(rname, str): return True return lname == rname -def _any(x): +def _any(x) -> bool: return x is not None and com.any_not_none(*x) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index d653dd87308cf..404292fe4d539 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -620,7 +620,9 @@ def _normalize(table, normalize, margins, margins_name="All"): if (margins_name not in table.iloc[-1, :].name) | ( margins_name != table.iloc[:, -1].name ): - raise ValueError("{} not in pivoted DataFrame".format(margins_name)) + raise ValueError( + "{mname} not in pivoted DataFrame".format(mname=margins_name) + ) column_margin = table.iloc[:-1, -1] index_margin = table.iloc[-1, :-1] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d7eae1c543804..7537dd0ac2065 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -88,7 +88,7 @@ class _Unstacker: def __init__( self, - values, + values: np.ndarray, index, level=-1, value_columns=None, @@ -985,7 +985,7 @@ def get_empty_frame(data): else: # PY2 embedded unicode, gh-22084 - def _make_col_name(prefix, prefix_sep, level): + def _make_col_name(prefix, prefix_sep, level) -> str: fstr = "{prefix}{prefix_sep}{level}" return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) From a4d4ef997ce3fb9e2369d20b02496c55917bc213 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 4 Nov 2019 16:56:16 -0800 Subject: [PATCH 2/2] revert --- pandas/_libs/sparse.pyx | 2 +- pandas/core/indexes/multi.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 3aa1f91c49f28..6abaaca010b00 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -597,7 +597,7 @@ cdef class BlockIndex(SparseIndex): result = np.empty(other.npoints, dtype=np.float64) - for 0 <= i < other.nblocks: + for i in range(other.nblocks): ocur = olocs[i] ocurlen = olens[i] diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 730c9af0149f5..2e3f440573a0f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2172,9 +2172,9 @@ def drop(self, codes, level=None, errors="raise"): if level is not None: return self._drop_from_level(codes, level) - if not isinstance(codes, (np.ndarray, Index)): - codes = com.index_labels_to_array(codes) try: + if not isinstance(codes, (np.ndarray, Index)): + codes = com.index_labels_to_array(codes) indexer = self.get_indexer(codes) mask = indexer == -1 if mask.any():