CLN: assorted cleanups (pandas-dev#29406)

jbrockmendel · Mateusz Górski · commit 27d0735b5856 · 2019-11-18T15:09:57.000+01:00
diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
@@ -597,7 +597,7 @@ cdef class BlockIndex(SparseIndex):
 
         result = np.empty(other.npoints, dtype=np.float64)
 
-        for 0 <= i < other.nblocks:
+        for i in range(other.nblocks):
             ocur = olocs[i]
             ocurlen = olens[i]
 
@@ -746,9 +746,6 @@ cdef class BlockUnion(BlockMerge):
 
         nend = xend[xi]
 
-        # print 'here xi=%d, yi=%d, mode=%d, nend=%d' % (self.xi, self.yi,
-        #                                                mode, nend)
-
         # done with y?
         if yi == ynblocks:
             self._set_current_indices(xi + 1, yi, mode)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1124,10 +1124,6 @@ def _decide_output_index(self, output, labels):
             output_keys = labels
         else:
             output_keys = sorted(output)
-            try:
-                output_keys.sort()
-            except TypeError:
-                pass
 
             if isinstance(labels, MultiIndex):
                 output_keys = MultiIndex.from_tuples(output_keys, names=labels.names)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1092,9 +1092,8 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:
 
         return self._get_cythonized_result(
             "group_any_all",
-            self.grouper,
             aggregate=True,
-            cython_dtype=np.uint8,
+            cython_dtype=np.dtype(np.uint8),
             needs_values=True,
             needs_mask=True,
             pre_processing=objs_to_bool,
@@ -1305,7 +1304,7 @@ def size(self):
         result = self.grouper.size()
 
         if isinstance(self.obj, Series):
-            result.name = getattr(self.obj, "name", None)
+            result.name = self.obj.name
         return result
 
     @classmethod
@@ -1586,9 +1585,8 @@ def _fill(self, direction, limit=None):
 
         return self._get_cythonized_result(
             "group_fillna_indexer",
-            self.grouper,
             needs_mask=True,
-            cython_dtype=np.int64,
+            cython_dtype=np.dtype(np.int64),
             result_is_index=True,
             direction=direction,
             limit=limit,
@@ -1882,11 +1880,10 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
         if is_scalar(q):
             return self._get_cythonized_result(
                 "group_quantile",
-                self.grouper,
                 aggregate=True,
                 needs_values=True,
                 needs_mask=True,
-                cython_dtype=np.float64,
+                cython_dtype=np.dtype(np.float64),
                 pre_processing=pre_processor,
                 post_processing=post_processor,
                 q=q,
@@ -1896,11 +1893,10 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
             results = [
                 self._get_cythonized_result(
                     "group_quantile",
-                    self.grouper,
                     aggregate=True,
                     needs_values=True,
                     needs_mask=True,
-                    cython_dtype=np.float64,
+                    cython_dtype=np.dtype(np.float64),
                     pre_processing=pre_processor,
                     post_processing=post_processor,
                     q=qi,
@@ -2167,14 +2163,13 @@ def cummax(self, axis=0, **kwargs):
 
     def _get_cythonized_result(
         self,
-        how,
-        grouper,
-        aggregate=False,
-        cython_dtype=None,
-        needs_values=False,
-        needs_mask=False,
-        needs_ngroups=False,
-        result_is_index=False,
+        how: str,
+        cython_dtype: np.dtype,
+        aggregate: bool = False,
+        needs_values: bool = False,
+        needs_mask: bool = False,
+        needs_ngroups: bool = False,
+        result_is_index: bool = False,
         pre_processing=None,
         post_processing=None,
         **kwargs
@@ -2185,13 +2180,11 @@ def _get_cythonized_result(
         Parameters
         ----------
         how : str, Cythonized function name to be called
-        grouper : Grouper object containing pertinent group info
+        cython_dtype : np.dtype
+            Type of the array that will be modified by the Cython call.
         aggregate : bool, default False
             Whether the result should be aggregated to match the number of
             groups
-        cython_dtype : default None
-            Type of the array that will be modified by the Cython call. If
-            `None`, the type will be inferred from the values of each slice
         needs_values : bool, default False
             Whether the values should be a part of the Cython call
             signature
@@ -2234,8 +2227,10 @@ def _get_cythonized_result(
                     "Cannot use 'pre_processing' without specifying 'needs_values'!"
                 )
 
+        grouper = self.grouper
+
         labels, _, ngroups = grouper.group_info
-        output = collections.OrderedDict()
+        output = collections.OrderedDict()  # type: dict
         base_func = getattr(libgroupby, how)
 
         for name, obj in self._iterate_slices():
@@ -2246,9 +2241,6 @@ def _get_cythonized_result(
             else:
                 result_sz = len(values)
 
-            if not cython_dtype:
-                cython_dtype = values.dtype
-
             result = np.zeros(result_sz, dtype=cython_dtype)
             func = partial(base_func, result, labels)
             inferences = None
@@ -2308,8 +2300,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
 
         return self._get_cythonized_result(
             "group_shift_indexer",
-            self.grouper,
-            cython_dtype=np.int64,
+            cython_dtype=np.dtype(np.int64),
             needs_ngroups=True,
             result_is_index=True,
             periods=periods,
@@ -2478,11 +2469,13 @@ def _reindex_output(self, output):
 
 
 @Appender(GroupBy.__doc__)
-def groupby(obj, by, **kwds):
+def groupby(obj: NDFrame, by, **kwds):
     if isinstance(obj, Series):
         from pandas.core.groupby.generic import SeriesGroupBy
 
-        klass = SeriesGroupBy
+        klass = (
+            SeriesGroupBy
+        )  # type: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]]
     elif isinstance(obj, DataFrame):
         from pandas.core.groupby.generic import DataFrameGroupBy
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -592,13 +592,10 @@ def agg_series(self, obj, func):
         return self._aggregate_series_pure_python(obj, func)
 
     def _aggregate_series_fast(self, obj, func):
+        # At this point we have already checked that obj.index is not a MultiIndex
+        #  and that obj is backed by an ndarray, not ExtensionArray
         func = self._is_builtin_func(func)
 
-        # TODO: pre-empt this, also pre-empt get_result raising TypError if we pass a EA
-        #   for EAs backed by ndarray we may have a performant workaround
-        if obj.index._has_complex_internals:
-            raise TypeError("Incompatible index for Cython grouper")
-
         group_index, _, ngroups = self.group_info
 
         # avoids object / Series creation overhead
@@ -842,15 +839,12 @@ def __iter__(self):
     def _get_sorted_data(self):
         return self.data.take(self.sort_idx, axis=self.axis)
 
-    def _chop(self, sdata, slice_obj):
-        raise AbstractMethodError(self)
-
-    def apply(self, f):
+    def _chop(self, sdata, slice_obj: slice):
         raise AbstractMethodError(self)
 
 
 class SeriesSplitter(DataSplitter):
-    def _chop(self, sdata, slice_obj):
+    def _chop(self, sdata, slice_obj: slice):
         return sdata._get_values(slice_obj)
 
 
@@ -862,7 +856,7 @@ def fast_apply(self, f, names):
         sdata = self._get_sorted_data()
         return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
 
-    def _chop(self, sdata, slice_obj):
+    def _chop(self, sdata, slice_obj: slice):
         if self.axis == 0:
             return sdata.iloc[slice_obj]
         else:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4747,10 +4747,9 @@ def get_indexer_for(self, target, **kwargs):
 
     def _maybe_promote(self, other):
         # A hack, but it works
-        from pandas import DatetimeIndex
 
-        if self.inferred_type == "date" and isinstance(other, DatetimeIndex):
-            return DatetimeIndex(self), other
+        if self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex):
+            return type(other)(self), other
         elif self.inferred_type == "boolean":
             if not is_object_dtype(self.dtype):
                 return self.astype("object"), other.astype("object")
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2179,7 +2179,9 @@ def drop(self, codes, level=None, errors="raise"):
             mask = indexer == -1
             if mask.any():
                 if errors != "ignore":
-                    raise ValueError("codes %s not contained in axis" % codes[mask])
+                    raise ValueError(
+                        "codes {codes} not contained in axis".format(codes=codes[mask])
+                    )
         except Exception:
             pass
 
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -167,6 +167,7 @@ def init_ndarray(values, index, columns, dtype=None, copy=False):
             try:
                 values = values.astype(dtype)
             except Exception as orig:
+                # e.g. ValueError when trying to cast object dtype to float64
                 raise ValueError(
                     "failed to cast to '{dtype}' (Exception "
                     "was: {orig})".format(dtype=dtype, orig=orig)
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -478,7 +478,7 @@ def get_result(self):
                 self, method="concat"
             )
 
-    def _get_result_dim(self):
+    def _get_result_dim(self) -> int:
         if self._is_series and self.axis == 1:
             return 2
         else:
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -1948,13 +1948,13 @@ def _get_join_keys(llab, rlab, shape, sort):
     return _get_join_keys(llab, rlab, shape, sort)
 
 
-def _should_fill(lname, rname):
+def _should_fill(lname, rname) -> bool:
     if not isinstance(lname, str) or not isinstance(rname, str):
         return True
     return lname == rname
 
 
-def _any(x):
+def _any(x) -> bool:
     return x is not None and com.any_not_none(*x)
 
 
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -620,7 +620,9 @@ def _normalize(table, normalize, margins, margins_name="All"):
         if (margins_name not in table.iloc[-1, :].name) | (
             margins_name != table.iloc[:, -1].name
         ):
-            raise ValueError("{} not in pivoted DataFrame".format(margins_name))
+            raise ValueError(
+                "{mname} not in pivoted DataFrame".format(mname=margins_name)
+            )
         column_margin = table.iloc[:-1, -1]
         index_margin = table.iloc[-1, :-1]
 
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -88,7 +88,7 @@ class _Unstacker:
 
     def __init__(
         self,
-        values,
+        values: np.ndarray,
         index,
         level=-1,
         value_columns=None,
@@ -985,7 +985,7 @@ def get_empty_frame(data):
     else:
 
         # PY2 embedded unicode, gh-22084
-        def _make_col_name(prefix, prefix_sep, level):
+        def _make_col_name(prefix, prefix_sep, level) -> str:
             fstr = "{prefix}{prefix_sep}{level}"
             return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level)
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -478,7 +478,7 @@ def get_result(self):` |
| `478` | `478` | `self, method="concat"` |
| `479` | `479` | `)` |
| `480` | `480` |  |
| `481` |  | `- def _get_result_dim(self):` |
|  | `481` | `+ def _get_result_dim(self) -> int:` |
| `482` | `482` | `if self._is_series and self.axis == 1:` |
| `483` | `483` | `return 2` |
| `484` | `484` | `else:` |