From 75a15d469c1c484a0250cf8a0349625be184e737 Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 21 Apr 2021 14:33:47 -0700
Subject: [PATCH] CLN: annotations, docstrings

---
 pandas/_libs/hashtable_class_helper.pxi.in |  5 ++-
 pandas/core/arrays/period.py               |  4 +--
 pandas/core/dtypes/dtypes.py               | 21 ++++++++-----
 pandas/core/groupby/ops.py                 |  2 +-
 pandas/core/indexes/base.py                |  9 ++----
 pandas/core/indexing.py                    | 34 ++++++++++----------
 pandas/core/resample.py                    | 36 +++++++++++++++-------
 pandas/core/sorting.py                     | 31 ++++++++++++++++---
 8 files changed, 92 insertions(+), 50 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 4dc5e7516db7e..a25867c4a3b0c 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -687,7 +687,10 @@ cdef class {{name}}HashTable(HashTable):

     {{if dtype == 'int64'}}
     @cython.boundscheck(False)
-    def get_labels_groupby(self, const {{dtype}}_t[:] values):
+    def get_labels_groupby(
+        self, const {{dtype}}_t[:] values
+    ) -> tuple[ndarray, ndarray]:
+        # tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]]
         cdef:
             Py_ssize_t i, n = len(values)
             intp_t[:] labels
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 5a9dd0e89bd65..a9c94b615f49c 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -178,8 +178,8 @@ class PeriodArray(dtl.DatelikeOps):
         "days_in_month",
         "daysinmonth",
     ]
-    _datetimelike_ops = _field_ops + _object_ops + _bool_ops
-    _datetimelike_methods = ["strftime", "to_timestamp", "asfreq"]
+    _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops
+    _datetimelike_methods: list[str] = ["strftime", "to_timestamp", "asfreq"]

     # --------------------------------------------------------------------
     # Constructors
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 84eede019251b..3d68688c21241 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -48,9 +48,14 @@
 )

 if TYPE_CHECKING:
+    from datetime import tzinfo
+
     import pyarrow

-    from pandas import Categorical
+    from pandas import (
+        Categorical,
+        Index,
+    )
     from pandas.core.arrays import (
         DatetimeArray,
         IntervalArray,
@@ -445,8 +450,8 @@ def _hash_categories(self) -> int:
             # assumes if any individual category is a tuple, then all are. ATM
             # I don't really want to support just some of the categories being
             # tuples.
-            categories = list(categories)  # breaks if a np.array of categories
-            cat_array = hash_tuples(categories)
+            cat_list = list(categories)  # breaks if a np.array of categories
+            cat_array = hash_tuples(cat_list)
         else:
             if categories.dtype == "O" and len({type(x) for x in categories}) != 1:
                 # TODO: hash_array doesn't handle mixed types. It casts
@@ -509,7 +514,7 @@ def validate_ordered(ordered: Ordered) -> None:
             raise TypeError("'ordered' must either be 'True' or 'False'")

     @staticmethod
-    def validate_categories(categories, fastpath: bool = False):
+    def validate_categories(categories, fastpath: bool = False) -> Index:
         """
         Validates that we have good categories

@@ -579,7 +584,7 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype:
         return CategoricalDtype(new_categories, new_ordered)

     @property
-    def categories(self):
+    def categories(self) -> Index:
         """
         An ``Index`` containing the unique categories allowed.
         """
@@ -717,7 +722,7 @@ def unit(self) -> str_type:
         return self._unit

     @property
-    def tz(self):
+    def tz(self) -> tzinfo:
         """
         The timezone.
         """
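The `Index` and `tzinfo` annotations added above describe behavior that is already observable from the public API. A minimal sanity-check sketch (illustrative only, using public constructors):

```python
from datetime import tzinfo

import pandas as pd

# CategoricalDtype.categories is annotated as returning an Index.
dtype = pd.CategoricalDtype(["a", "b"])
assert isinstance(dtype.categories, pd.Index)

# DatetimeTZDtype.tz is annotated as returning a datetime.tzinfo.
tz_dtype = pd.DatetimeTZDtype(tz="UTC")
assert isinstance(tz_dtype.tz, tzinfo)
```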
""" @@ -882,7 +887,7 @@ def freq(self): return self._freq @classmethod - def _parse_dtype_strict(cls, freq): + def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset: if isinstance(freq, str): if freq.startswith("period[") or freq.startswith("Period["): m = cls._match.search(freq) @@ -1136,7 +1141,7 @@ def construct_array_type(cls) -> type[IntervalArray]: return IntervalArray @classmethod - def construct_from_string(cls, string): + def construct_from_string(cls, string: str_type) -> IntervalDtype: """ attempt to construct this type from a string, raise a TypeError if its not possible diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0a9c46f6ed069..4f9a71e5af59a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -281,7 +281,7 @@ class BaseGrouper: whether this grouper will give sorted result or not group_keys : bool, default True mutated : bool, default False - indexer : intp array, optional + indexer : np.ndarray[np.intp], optional the indexer created by Grouper some groupers (TimeGrouper) will sort its axis and its group_info is also sorted, so need the indexer to reorder diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 58f5ca3de5dce..9b3f2d191831d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3029,9 +3029,6 @@ def _union(self, other: Index, sort): @final def _wrap_setop_result(self, other: Index, result) -> Index: - if is_categorical_dtype(self.dtype) and isinstance(result, np.ndarray): - result = Categorical(result, dtype=self.dtype) - name = get_op_result_name(self, other) if isinstance(result, Index): if result.name != name: @@ -4028,7 +4025,7 @@ def join( return join_index, lindexer, rindexer @final - def _join_multi(self, other, how): + def _join_multi(self, other: Index, how: str_t): from pandas.core.indexes.multi import MultiIndex from pandas.core.reshape.merge import restore_dropped_levels_multijoin @@ -4273,7 +4270,7 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> np.ndarray: return join_index, left_indexer, right_indexer @final - def _join_monotonic(self, other: Index, how="left"): + def _join_monotonic(self, other: Index, how: str_t = "left"): # We only get here with matching dtypes assert other.dtype == self.dtype @@ -5527,7 +5524,7 @@ def isin(self, values, level=None) -> np.ndarray: Returns ------- - is_contained : ndarray[bool] + np.ndarray[bool] NumPy array of boolean values. See Also diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1b68ac9780ee1..04543da167fdd 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -36,7 +36,6 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCMultiIndex, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -53,7 +52,10 @@ is_list_like_indexer, length_of_indexer, ) -from pandas.core.indexes.api import Index +from pandas.core.indexes.api import ( + Index, + MultiIndex, +) if TYPE_CHECKING: from pandas import ( @@ -642,7 +644,7 @@ def _get_setitem_indexer(self, key): ax = self.obj._get_axis(0) - if isinstance(ax, ABCMultiIndex) and self.name != "iloc": + if isinstance(ax, MultiIndex) and self.name != "iloc": with suppress(TypeError, KeyError, InvalidIndexError): # TypeError e.g. 
@@ -690,7 +692,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None):

         if (
             axis == column_axis
-            and not isinstance(self.obj.columns, ABCMultiIndex)
+            and not isinstance(self.obj.columns, MultiIndex)
             and is_list_like_indexer(key)
             and not com.is_bool_indexer(key)
             and all(is_hashable(k) for k in key)
@@ -756,7 +758,7 @@ def _is_nested_tuple_indexer(self, tup: tuple) -> bool:
         -------
         bool
         """
-        if any(isinstance(ax, ABCMultiIndex) for ax in self.obj.axes):
+        if any(isinstance(ax, MultiIndex) for ax in self.obj.axes):
             return any(is_nested_tuple(tup, ax) for ax in self.obj.axes)
         return False

@@ -817,7 +819,7 @@ def _getitem_lowerdim(self, tup: tuple):
         ax0 = self.obj._get_axis(0)
         # ...but iloc should handle the tuple as simple integer-location
         # instead of checking it as multiindex representation (GH 13797)
-        if isinstance(ax0, ABCMultiIndex) and self.name != "iloc":
+        if isinstance(ax0, MultiIndex) and self.name != "iloc":
             with suppress(IndexingError):
                 return self._handle_lowerdim_multi_index_axis0(tup)

@@ -996,7 +998,7 @@ def _is_scalar_access(self, key: tuple) -> bool:
                 return False

             ax = self.obj.axes[i]
-            if isinstance(ax, ABCMultiIndex):
+            if isinstance(ax, MultiIndex):
                 return False

             if isinstance(k, str) and ax._supports_partial_string_indexing:
@@ -1142,7 +1144,7 @@ def _getitem_axis(self, key, axis: int):
         elif is_list_like_indexer(key):

             # an iterable multi-selection
-            if not (isinstance(key, tuple) and isinstance(labels, ABCMultiIndex)):
+            if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)):

                 if hasattr(key, "ndim") and key.ndim > 1:
                     raise ValueError("Cannot index with multidimensional key")
@@ -1205,20 +1207,20 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
         is_int_index = labels.is_integer()
         is_int_positional = is_integer(key) and not is_int_index

-        if is_scalar(key) or isinstance(labels, ABCMultiIndex):
+        if is_scalar(key) or isinstance(labels, MultiIndex):
             # Otherwise get_loc will raise InvalidIndexError

             # if we are a label return me
             try:
                 return labels.get_loc(key)
             except LookupError:
-                if isinstance(key, tuple) and isinstance(labels, ABCMultiIndex):
+                if isinstance(key, tuple) and isinstance(labels, MultiIndex):
                     if len(key) == labels.nlevels:
                         return {"key": key}
                     raise
             except InvalidIndexError:
                 # GH35015, using datetime as column indices raises exception
-                if not isinstance(labels, ABCMultiIndex):
+                if not isinstance(labels, MultiIndex):
                     raise
             except TypeError:
                 pass
@@ -1620,7 +1622,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
             # GH 10360, GH 27841
             if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes):
                 for i, ax in zip(indexer, self.obj.axes):
-                    if isinstance(ax, ABCMultiIndex) and not (
+                    if isinstance(ax, MultiIndex) and not (
                         is_integer(i) or com.is_null_slice(i)
                     ):
                         take_split_path = True
@@ -1819,7 +1821,7 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str
         sub_indexer = list(indexer)
         pi = indexer[0]

-        multiindex_indexer = isinstance(self.obj.columns, ABCMultiIndex)
+        multiindex_indexer = isinstance(self.obj.columns, MultiIndex)

         unique_cols = value.columns.is_unique

@@ -2163,8 +2165,8 @@ def _align_frame(self, indexer, df: DataFrame):
                 # we have a multi-index and are trying to align
                 # with a particular, level GH3738
                 if (
-                    isinstance(ax, ABCMultiIndex)
-                    and isinstance(df.index, ABCMultiIndex)
+                    isinstance(ax, MultiIndex)
+                    and isinstance(df.index, MultiIndex)
                     and ax.nlevels != df.index.nlevels
                 ):
                     raise TypeError(
@@ -2428,7 +2430,7 @@ def is_nested_tuple(tup, labels) -> bool:
     for k in tup:
         if is_list_like(k) or isinstance(k, slice):
-            return isinstance(labels, ABCMultiIndex)
+            return isinstance(labels, MultiIndex)

     return False
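For context on `is_nested_tuple` just above: a tuple key counts as "nested" when one of its elements is itself list-like or a slice, and such keys are only meaningful against a MultiIndex. A sketch of the indexing pattern this guards (illustrative example, not from the patch):

```python
import pandas as pd

mi = pd.MultiIndex.from_product([["a", "b"], [1, 2]])
s = pd.Series(range(4), index=mi)

# A nested tuple key: the first element is list-like, so .loc can only
# resolve it because the axis is a MultiIndex.
print(s.loc[(["a"], [1, 2])])
```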
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 58003c10db9e0..91c77e987654b 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -20,6 +20,7 @@
     to_offset,
 )
 from pandas._typing import (
+    FrameOrSeries,
     T,
     TimedeltaConvertibleTypes,
     TimestampConvertibleTypes,
@@ -1345,9 +1346,15 @@ def _upsample(self, method, limit=None, fill_value=None):

         # Get the fill indexer
         indexer = memb.get_indexer(new_index, method=method, limit=limit)
-        return self._wrap_result(
-            _take_new_index(obj, indexer, new_index, axis=self.axis)
+        new_obj = _take_new_index(
+            obj,
+            indexer,
+            # error: Argument 3 to "_take_new_index" has incompatible type
+            # "Optional[Any]"; expected "Index"
+            new_index,  # type: ignore[arg-type]
+            axis=self.axis,
         )
+        return self._wrap_result(new_obj)


 class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler):
@@ -1666,7 +1673,7 @@ def _adjust_bin_edges(self, binner, ax_values):
             bin_edges = binner.asi8
         return binner, bin_edges

-    def _get_time_delta_bins(self, ax):
+    def _get_time_delta_bins(self, ax: TimedeltaIndex):
         if not isinstance(ax, TimedeltaIndex):
             raise TypeError(
                 "axis must be a TimedeltaIndex, but got "
@@ -1789,17 +1796,24 @@ def _get_period_bins(self, ax: PeriodIndex):
         return binner, bins, labels


-def _take_new_index(obj, indexer, new_index, axis=0):
+def _take_new_index(
+    obj: FrameOrSeries, indexer: np.ndarray, new_index: Index, axis: int = 0
+) -> FrameOrSeries:
+    # indexer: np.ndarray[np.intp]

     if isinstance(obj, ABCSeries):
         new_values = algos.take_nd(obj._values, indexer)
-        return obj._constructor(new_values, index=new_index, name=obj.name)
+        # error: Incompatible return value type (got "Series", expected "FrameOrSeries")
+        return obj._constructor(  # type: ignore[return-value]
+            new_values, index=new_index, name=obj.name
+        )
     elif isinstance(obj, ABCDataFrame):
         if axis == 1:
             raise NotImplementedError("axis 1 is not supported")
-        return obj._constructor(
-            obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1)
-        )
+        new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1)
+        # error: Incompatible return value type
+        # (got "DataFrame", expected "FrameOrSeries")
+        return obj._constructor(new_mgr)  # type: ignore[return-value]
     else:
         raise ValueError("'obj' should be either a Series or a DataFrame")

@@ -1822,7 +1836,7 @@ def _get_timestamp_range_edges(
         The ending Timestamp of the range to be adjusted.
     freq : pd.DateOffset
         The dateoffset to which the Timestamps will be adjusted.
-    closed : {'right', 'left'}, default None
+    closed : {'right', 'left'}, default "left"
         Which side of bin interval is closed.
     origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day'
         The timestamp on which to adjust the grouping. The timezone of origin must
@@ -1892,7 +1906,7 @@ def _get_period_range_edges(
         The ending Period of the range to be adjusted.
     freq : pd.DateOffset
         The freq to which the Periods will be adjusted.
-    closed : {'right', 'left'}, default None
+    closed : {'right', 'left'}, default "left"
         Which side of bin interval is closed.
     origin : {'epoch', 'start', 'start_day'}, Timestamp, default 'start_day'
         The timestamp on which to adjust the grouping. The timezone of origin must
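`_take_new_index` is internal, but the `np.ndarray[np.intp]` indexer it consumes comes from `Index.get_indexer`, and the public path to it is upsampling. A minimal sketch of that relationship using only public API (assuming a daily-upsample example, not taken from the patch):

```python
import pandas as pd

s = pd.Series([1.0, 2.0], index=pd.to_datetime(["2021-01-01", "2021-01-03"]))
new_index = pd.date_range("2021-01-01", periods=3, freq="D")

# The fill indexer: positions into the old index, dtype np.intp.
indexer = s.index.get_indexer(new_index, method="ffill")
print(indexer)  # [0 0 1]

# Public entry point that routes through _upsample/_take_new_index.
print(s.resample("D").ffill())
```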
@@ -2042,7 +2056,7 @@ def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None):
     return new_obj


-def _asfreq_compat(index, freq):
+def _asfreq_compat(index: DatetimeIndex | PeriodIndex | TimedeltaIndex, freq):
     """
     Helper to mimic asfreq on (empty) DatetimeIndex and TimedeltaIndex.

diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 71963ec4a2123..dd7ae904c866c 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -182,7 +182,7 @@ def maybe_lift(lab, size):
     return out


-def get_compressed_ids(labels, sizes):
+def get_compressed_ids(labels, sizes) -> tuple[np.ndarray, np.ndarray]:
     """
     Group_index is offsets into cartesian product of all possible labels. This
     space can be huge, so this function compresses it, by computing offsets
@@ -195,7 +195,10 @@ def get_compressed_ids(labels, sizes):

     Returns
     -------
-    tuple of (comp_ids, obs_group_ids)
+    np.ndarray[np.intp]
+        comp_ids
+    np.ndarray[np.int64]
+        obs_group_ids
     """
     ids = get_group_index(labels, sizes, sort=True, xnull=False)
     return compress_group_index(ids, sort=True)
@@ -254,7 +257,8 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool):
     return [i8copy(lab[i]) for lab in labels]


-def indexer_from_factorized(labels, shape, compress: bool = True):
+def indexer_from_factorized(labels, shape, compress: bool = True) -> np.ndarray:
+    # returned ndarray is np.intp
     ids = get_group_index(labels, shape, sort=True, xnull=False)

     if not compress:
@@ -268,7 +272,7 @@ def indexer_from_factorized(labels, shape, compress: bool = True):

 def lexsort_indexer(
     keys, orders=None, na_position: str = "last", key: Callable | None = None
-):
+) -> np.ndarray:
     """
     Performs lexical sorting on a set of keys

@@ -288,6 +292,10 @@ def lexsort_indexer(
         Callable key function applied to every element in keys before sorting

         .. versionadded:: 1.0.0
+
+    Returns
+    -------
+    np.ndarray[np.intp]
     """
     from pandas.core.arrays import Categorical

@@ -656,7 +664,20 @@ def compress_group_index(group_index, sort: bool = True):
     return ensure_int64(comp_ids), ensure_int64(obs_group_ids)


-def _reorder_by_uniques(uniques, labels):
+def _reorder_by_uniques(
+    uniques: np.ndarray, labels: np.ndarray
+) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Parameters
+    ----------
+    uniques : np.ndarray[np.int64]
+    labels : np.ndarray[np.intp]
+
+    Returns
+    -------
+    np.ndarray[np.int64]
+    np.ndarray[np.intp]
+    """
     # sorter is index where elements ought to go
     sorter = uniques.argsort()
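To make the new `get_compressed_ids` return docs concrete, here is a small sketch against the internal helpers themselves (internal API, so signatures and dtypes are as documented in this patch and may change):

```python
import numpy as np
from pandas.core.sorting import compress_group_index, get_group_index

# Two label arrays over a cartesian space of 3 * 2 = 6 possible keys.
labels = [
    np.array([0, 1, 2, 1], dtype=np.intp),
    np.array([0, 0, 1, 0], dtype=np.intp),
]
ids = get_group_index(labels, (3, 2), sort=True, xnull=False)
comp_ids, obs_group_ids = compress_group_index(ids, sort=True)

print(ids)            # [0 2 5 2] -- offsets into the cartesian product
print(comp_ids)       # [0 1 2 1] -- dense group ids (documented as np.intp)
print(obs_group_ids)  # [0 2 5]   -- observed offsets (documented as np.int64)
```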