diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index 61568717ace68..4d0bd4744be5d 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -109,9 +109,8 @@ cdef class SeriesBinGrouper(_BaseGrouper):
         ndarray arr, index, dummy_arr, dummy_index
         object values, f, bins, typ, ityp, name
 
-    def __init__(self, object series, object f, object bins, object dummy):
+    def __init__(self, object series, object f, object bins):
 
-        assert dummy is not None  # always obj[:0]
         assert len(bins) > 0  # otherwise we get IndexError in get_result
 
         self.bins = bins
@@ -127,6 +126,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
         self.index = series.index.values
         self.name = series.name
 
+        dummy = series.iloc[:0]
         self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
 
         # kludge for #1688
@@ -203,10 +203,7 @@ cdef class SeriesGrouper(_BaseGrouper):
         object f, labels, values, typ, ityp, name
 
     def __init__(self, object series, object f, object labels,
-                 Py_ssize_t ngroups, object dummy):
-
-        # in practice we always pass obj.iloc[:0] or equivalent
-        assert dummy is not None
+                 Py_ssize_t ngroups):
 
         if len(series) == 0:
             # get_result would never assign `result`
@@ -225,6 +222,7 @@ cdef class SeriesGrouper(_BaseGrouper):
         self.index = series.index.values
         self.name = series.name
 
+        dummy = series.iloc[:0]
         self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
         self.ngroups = ngroups
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 40533cdd554b3..4dac4dd557af2 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -263,7 +263,7 @@ def _reconstruct_data(
     return values
 
 
-def _ensure_arraylike(values):
+def _ensure_arraylike(values) -> ArrayLike:
     """
     ensure that we are arraylike if not already
     """
@@ -323,7 +323,7 @@ def get_data_algo(values: ArrayLike):
    return htable, values
 
 
-def _check_object_for_strings(values) -> str:
+def _check_object_for_strings(values: np.ndarray) -> str:
     """
     Check if we can use string hashtable instead of object hashtable.
 
@@ -527,7 +527,11 @@ def f(c, v):
 
 
 def factorize_array(
-    values: np.ndarray, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None
+    values: np.ndarray,
+    na_sentinel: int = -1,
+    size_hint: Optional[int] = None,
+    na_value=None,
+    mask: Optional[np.ndarray] = None,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """
     Factorize an array-like to codes and uniques.
@@ -982,13 +986,13 @@ def mode(values, dropna: bool = True) -> Series:
 
 
 def rank(
-    values,
+    values: ArrayLike,
     axis: int = 0,
     method: str = "average",
     na_option: str = "keep",
     ascending: bool = True,
     pct: bool = False,
-):
+) -> np.ndarray:
     """
     Rank the values along a given axis.
 
@@ -1038,7 +1042,12 @@ def rank(
     return ranks
 
 
-def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
+def checked_add_with_arr(
+    arr: np.ndarray,
+    b,
+    arr_mask: Optional[np.ndarray] = None,
+    b_mask: Optional[np.ndarray] = None,
+) -> np.ndarray:
     """
     Perform array addition that checks for underflow and overflow.
 
@@ -1051,9 +1060,9 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
     ----------
     arr : array addend.
     b : array or scalar addend.
-    arr_mask : boolean array or None
+    arr_mask : np.ndarray[bool] or None, default None
         array indicating which elements to exclude from checking
-    b_mask : boolean array or boolean or None
+    b_mask : np.ndarray[bool] or None, default None
         array or scalar indicating which element(s) to exclude from checking
 
     Returns
@@ -1406,7 +1415,9 @@ def get_indexer(current_indexer, other_indexer):
 
 
 def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
-    def wrapper(arr, indexer, out, fill_value=np.nan):
+    def wrapper(
+        arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
+    ):
         if arr_dtype is not None:
             arr = arr.view(arr_dtype)
         if out_dtype is not None:
@@ -1419,7 +1430,9 @@ def wrapper(arr, indexer, out, fill_value=np.nan):
 
 
 def _convert_wrapper(f, conv_dtype):
-    def wrapper(arr, indexer, out, fill_value=np.nan):
+    def wrapper(
+        arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
+    ):
         if conv_dtype == object:
             # GH#39755 avoid casting dt64/td64 to integers
             arr = ensure_wrapped_if_datetimelike(arr)
@@ -1429,7 +1442,9 @@ def wrapper(arr, indexer, out, fill_value=np.nan):
     return wrapper
 
 
-def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info):
+def _take_2d_multi_object(
+    arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value, mask_info
+) -> None:
     # this is not ideal, performance-wise, but it's better than raising
     # an exception (best to optimize in Cython to avoid getting here)
     row_idx, col_idx = indexer
@@ -1452,7 +1467,14 @@ def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info):
             out[i, j] = arr[u_, v]
 
 
-def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info):
+def _take_nd_object(
+    arr: np.ndarray,
+    indexer: np.ndarray,
+    out: np.ndarray,
+    axis: int,
+    fill_value,
+    mask_info,
+):
     if mask_info is not None:
         mask, needs_masking = mask_info
     else:
@@ -1570,7 +1592,7 @@ def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info):
 
 
 def _get_take_nd_function(
-    ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None
+    ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: int = 0, mask_info=None
 ):
     if ndim <= 2:
         tup = (arr_dtype.name, out_dtype.name)
@@ -1605,7 +1627,9 @@ def func2(arr, indexer, out, fill_value=np.nan):
     return func2
 
 
-def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None):
+def take(
+    arr, indices: np.ndarray, axis: int = 0, allow_fill: bool = False, fill_value=None
+):
     """
     Take elements from an array.
 
@@ -1739,7 +1763,7 @@ def take_nd(
     arr,
     indexer,
     axis: int = 0,
-    out=None,
+    out: Optional[np.ndarray] = None,
     fill_value=lib.no_default,
     allow_fill: bool = True,
 ):
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index 6cfc0e1853b74..4d165dac40397 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -198,7 +198,7 @@ class IntervalArray(IntervalMixin, ExtensionArray):
     # Constructors
 
     def __new__(
-        cls,
+        cls: Type[IntervalArrayT],
         data,
         closed=None,
         dtype: Optional[Dtype] = None,
@@ -226,7 +226,7 @@ def __new__(
             raise TypeError(msg)
 
         # might need to convert empty or purely na data
-        data = maybe_convert_platform_interval(data)
+        data = _maybe_convert_platform_interval(data)
         left, right, infer_closed = intervals_to_interval_bounds(
             data, validate_closed=closed is None
         )
@@ -243,14 +243,14 @@ def __new__(
 
     @classmethod
     def _simple_new(
-        cls,
+        cls: Type[IntervalArrayT],
         left,
         right,
         closed=None,
-        copy=False,
+        copy: bool = False,
         dtype: Optional[Dtype] = None,
-        verify_integrity=True,
-    ):
+        verify_integrity: bool = True,
+    ) -> IntervalArrayT:
         result = IntervalMixin.__new__(cls)
 
         if closed is None and isinstance(dtype, IntervalDtype):
@@ -330,12 +330,18 @@ def _simple_new(
 
     @classmethod
     def _from_sequence(
-        cls, scalars, *, dtype: Optional[Dtype] = None, copy: bool = False
-    ):
+        cls: Type[IntervalArrayT],
+        scalars,
+        *,
+        dtype: Optional[Dtype] = None,
+        copy: bool = False,
+    ) -> IntervalArrayT:
         return cls(scalars, dtype=dtype, copy=copy)
 
     @classmethod
-    def _from_factorized(cls, values, original):
+    def _from_factorized(
+        cls: Type[IntervalArrayT], values: np.ndarray, original: IntervalArrayT
+    ) -> IntervalArrayT:
         if len(values) == 0:
             # An empty array returns object-dtype here. We can't create
             # a new IA from an (empty) object-dtype array, so turn it into the
@@ -391,9 +397,13 @@ def _from_factorized(cls, values, original):
         }
     )
     def from_breaks(
-        cls, breaks, closed="right", copy: bool = False, dtype: Optional[Dtype] = None
-    ):
-        breaks = maybe_convert_platform_interval(breaks)
+        cls: Type[IntervalArrayT],
+        breaks,
+        closed="right",
+        copy: bool = False,
+        dtype: Optional[Dtype] = None,
+    ) -> IntervalArrayT:
+        breaks = _maybe_convert_platform_interval(breaks)
 
         return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype)
 
@@ -462,10 +472,15 @@ def from_breaks(
         }
     )
     def from_arrays(
-        cls, left, right, closed="right", copy=False, dtype: Optional[Dtype] = None
-    ):
-        left = maybe_convert_platform_interval(left)
-        right = maybe_convert_platform_interval(right)
+        cls: Type[IntervalArrayT],
+        left,
+        right,
+        closed="right",
+        copy: bool = False,
+        dtype: Optional[Dtype] = None,
+    ) -> IntervalArrayT:
+        left = _maybe_convert_platform_interval(left)
+        right = _maybe_convert_platform_interval(right)
 
         return cls._simple_new(
             left, right, closed, copy=copy, dtype=dtype, verify_integrity=True
@@ -521,8 +536,12 @@ def from_arrays(
         }
     )
     def from_tuples(
-        cls, data, closed="right", copy=False, dtype: Optional[Dtype] = None
-    ):
+        cls: Type[IntervalArrayT],
+        data,
+        closed="right",
+        copy: bool = False,
+        dtype: Optional[Dtype] = None,
+    ) -> IntervalArrayT:
         if len(data):
             left, right = [], []
         else:
@@ -577,7 +596,7 @@ def _validate(self):
             msg = "left side of interval must be <= right side"
             raise ValueError(msg)
 
-    def _shallow_copy(self, left, right):
+    def _shallow_copy(self: IntervalArrayT, left, right) -> IntervalArrayT:
         """
         Return a new IntervalArray with the replacement attributes
 
@@ -594,7 +613,7 @@ def _shallow_copy(self, left, right):
     # Descriptive
 
     @property
-    def dtype(self):
+    def dtype(self) -> IntervalDtype:
         return self._dtype
 
     @property
@@ -750,7 +769,9 @@ def argsort(
             ascending=ascending, kind=kind, na_position=na_position, **kwargs
         )
 
-    def fillna(self, value=None, method=None, limit=None):
+    def fillna(
+        self: IntervalArrayT, value=None, method=None, limit=None
+    ) -> IntervalArrayT:
         """
         Fill NA/NaN values using the specified method.
 
@@ -788,7 +809,7 @@ def fillna(self, value=None, method=None, limit=None):
             right = self.right.fillna(value=value_right)
         return self._shallow_copy(left, right)
 
-    def astype(self, dtype, copy=True):
+    def astype(self, dtype, copy: bool = True):
         """
         Cast to an ExtensionArray or NumPy array with dtype 'dtype'.
 
@@ -892,7 +913,9 @@ def copy(self: IntervalArrayT) -> IntervalArrayT:
     def isna(self) -> np.ndarray:
         return isna(self._left)
 
-    def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray:
+    def shift(
+        self: IntervalArrayT, periods: int = 1, fill_value: object = None
+    ) -> IntervalArray:
         if not len(self) or periods == 0:
             return self.copy()
 
@@ -921,7 +944,15 @@ def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray:
             b = empty
         return self._concat_same_type([a, b])
 
-    def take(self, indices, *, allow_fill=False, fill_value=None, axis=None, **kwargs):
+    def take(
+        self: IntervalArrayT,
+        indices,
+        *,
+        allow_fill: bool = False,
+        fill_value=None,
+        axis=None,
+        **kwargs,
+    ) -> IntervalArrayT:
         """
         Take elements from the IntervalArray.
 
@@ -1076,7 +1107,7 @@ def value_counts(self, dropna: bool = True):
     # ---------------------------------------------------------------------
     # Rendering Methods
 
-    def _format_data(self):
+    def _format_data(self) -> str:
         # TODO: integrate with categorical and make generic
 
         # name argument is unused here; just for compat with base / categorical
@@ -1120,7 +1151,7 @@ def __repr__(self) -> str:
         template = f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}"
         return template
 
-    def _format_space(self):
+    def _format_space(self) -> str:
         space = " " * (len(type(self).__name__) + 1)
         return f"\n{space}"
 
@@ -1300,7 +1331,7 @@ def closed(self):
             ),
         }
     )
-    def set_closed(self, closed):
+    def set_closed(self: IntervalArrayT, closed) -> IntervalArrayT:
         if closed not in VALID_CLOSED:
             msg = f"invalid option for 'closed': {closed}"
             raise ValueError(msg)
@@ -1323,7 +1354,7 @@ def set_closed(self, closed):
     @Appender(
         _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs
     )
-    def is_non_overlapping_monotonic(self):
+    def is_non_overlapping_monotonic(self) -> bool:
         # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... )
         # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...)
         # we already require left <= right
@@ -1436,7 +1467,7 @@ def __arrow_array__(self, type=None):
     @Appender(
         _interval_shared_docs["to_tuples"] % {"return_type": "ndarray", "examples": ""}
     )
-    def to_tuples(self, na_tuple=True):
+    def to_tuples(self, na_tuple=True) -> np.ndarray:
         tuples = com.asarray_tuplesafe(zip(self._left, self._right))
         if not na_tuple:
             # GH 18756
@@ -1465,7 +1496,7 @@ def delete(self: IntervalArrayT, loc) -> IntervalArrayT:
         return self._shallow_copy(left=new_left, right=new_right)
 
     @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs)
-    def repeat(self, repeats, axis=None):
+    def repeat(self: IntervalArrayT, repeats: int, axis=None) -> IntervalArrayT:
         nv.validate_repeat((), {"axis": axis})
         left_repeat = self.left.repeat(repeats)
         right_repeat = self.right.repeat(repeats)
@@ -1564,7 +1595,7 @@ def _combined(self) -> ArrayLike:
         return comb
 
 
-def maybe_convert_platform_interval(values):
+def _maybe_convert_platform_interval(values) -> ArrayLike:
     """
     Try to do platform conversion, with special casing for IntervalArray.
     Wrapper around maybe_convert_platform that alters the default return
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 50aed70bf275d..8625c5063382f 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -49,11 +49,7 @@
     ABCSeries,
 )
 from pandas.core.dtypes.inference import iterable_not_string
-from pandas.core.dtypes.missing import (  # noqa
-    isna,
-    isnull,
-    notnull,
-)
+from pandas.core.dtypes.missing import isna
 
 
 class SettingWithCopyError(ValueError):
@@ -153,7 +149,7 @@ def is_bool_indexer(key: Any) -> bool:
     return False
 
 
-def cast_scalar_indexer(val, warn_float=False):
+def cast_scalar_indexer(val, warn_float: bool = False):
     """
     To avoid numpy DeprecationWarnings, cast float to integer where valid.
 
@@ -300,7 +296,7 @@ def is_null_slice(obj) -> bool:
     )
 
 
-def is_true_slices(line):
+def is_true_slices(line) -> List[bool]:
     """
     Find non-trivial slices in "line": return a list of booleans with same length.
     """
@@ -308,7 +304,7 @@ def is_true_slices(line):
 
 
 # TODO: used only once in indexing; belongs elsewhere?
-def is_full_slice(obj, line) -> bool:
+def is_full_slice(obj, line: int) -> bool:
     """
     We have a full length slice.
     """
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 00cb65fff3803..5004d1fe08a5b 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -742,11 +742,10 @@ def _aggregate_series_fast(self, obj: Series, func: F):
         group_index, _, ngroups = self.group_info
 
         # avoids object / Series creation overhead
-        dummy = obj.iloc[:0]
         indexer = get_group_index_sorter(group_index, ngroups)
         obj = obj.take(indexer)
         group_index = algorithms.take_nd(group_index, indexer, allow_fill=False)
-        grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy)
+        grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups)
         result, counts = grouper.get_result()
         return result, counts
 
@@ -945,8 +944,7 @@ def agg_series(self, obj: Series, func: F):
             # preempt SeriesBinGrouper from raising TypeError
             return self._aggregate_series_pure_python(obj, func)
 
-        dummy = obj[:0]
-        grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy)
+        grouper = libreduction.SeriesBinGrouper(obj, func, self.bins)
         return grouper.get_result()
 
 
diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py
index 0649cc3efc153..86d6b772fe2e4 100644
--- a/pandas/core/indexers.py
+++ b/pandas/core/indexers.py
@@ -344,7 +344,7 @@ def length_of_indexer(indexer, target=None) -> int:
     raise AssertionError("cannot find the length of the indexer")
 
 
-def deprecate_ndim_indexing(result, stacklevel=3):
+def deprecate_ndim_indexing(result, stacklevel: int = 3):
     """
     Helper function to raise the deprecation warning for multi-dimensional
     indexing on 1D Series/Index.
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index b2c67ae2f0a00..70705d6988d86 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -732,13 +732,9 @@ def _intersection(self, other: Index, sort=False) -> Index:
             result = self[:0]
         else:
             lslice = slice(*left.slice_locs(start, end))
-            left_chunk = left._values[lslice]
-            # error: Argument 1 to "_simple_new" of "DatetimeIndexOpsMixin" has
-            # incompatible type "Union[ExtensionArray, Any]"; expected
-            # "Union[DatetimeArray, TimedeltaArray, PeriodArray]"
-            result = type(self)._simple_new(left_chunk)  # type: ignore[arg-type]
+            result = left._values[lslice]
 
-        return self._wrap_setop_result(other, result)
+        return result
 
     def _can_fast_intersect(self: _T, other: _T) -> bool:
         # Note: we only get here with len(self) > 0 and len(other) > 0
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 7ef81b0947a22..615cce8767de3 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1122,7 +1122,7 @@ def _view(self) -> MultiIndex:
         result = type(self)(
             levels=self.levels,
             codes=self.codes,
-            sortorder=None,
+            sortorder=self.sortorder,
             names=self.names,
             verify_integrity=False,
         )
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 7b4921080e2e1..cfe16627d5c64 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -2392,7 +2392,7 @@ def is_label_like(key) -> bool:
     return not isinstance(key, slice) and not is_list_like_indexer(key)
 
 
-def need_slice(obj) -> bool:
+def need_slice(obj: slice) -> bool:
     """
     Returns
     -------
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 2cfe613b7072b..eb1a7a355f313 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -36,6 +36,7 @@
     maybe_convert_platform,
     maybe_infer_to_datetimelike,
     maybe_upcast,
+    sanitize_to_nanoseconds,
 )
 from pandas.core.dtypes.common import (
     is_datetime64tz_dtype,
@@ -377,7 +378,7 @@ def convert(v):
     # this is equiv of np.asarray, but does object conversion
     # and platform dtype preservation
     try:
-        if is_list_like(values[0]) or hasattr(values[0], "len"):
+        if is_list_like(values[0]):
             values = np.array([convert(v) for v in values])
         elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
             # GH#21861
@@ -827,8 +828,7 @@ def sanitize_index(data, index: Index):
 
     if isinstance(data, np.ndarray):
 
-        # coerce datetimelike types
-        if data.dtype.kind in ["M", "m"]:
-            data = sanitize_array(data, index, copy=False)
+        # coerce datetimelike types to ns
+        data = sanitize_to_nanoseconds(data)
 
     return data
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
index ce46afc0ccd65..f873c93d90683 100644
--- a/pandas/tests/groupby/test_bin_groupby.py
+++ b/pandas/tests/groupby/test_bin_groupby.py
@@ -13,11 +13,10 @@
 
 def test_series_grouper():
     obj = Series(np.random.randn(10))
-    dummy = obj.iloc[:0]
 
     labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)
 
-    grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2, dummy)
+    grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2)
     result, counts = grouper.get_result()
 
     expected = np.array([obj[3:6].mean(), obj[6:].mean()])
@@ -34,16 +33,15 @@ def test_series_grouper_requires_nonempty_raises():
     labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)
 
     with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"):
-        libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy)
+        libreduction.SeriesGrouper(dummy, np.mean, labels, 2)
 
 
 def test_series_bin_grouper():
     obj = Series(np.random.randn(10))
-    dummy = obj[:0]
 
     bins = np.array([3, 6])
 
-    grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins, dummy)
+    grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins)
     result, counts = grouper.get_result()
 
     expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()])
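
Note for reviewers: a minimal sketch of the call-site change, mirroring test_bin_groupby.py above. `pandas._libs.reduction` is an internal module, so this assumes a development build with this patch applied; the grouping function and data below are illustrative only.

    import numpy as np
    from pandas import Series
    from pandas._libs import reduction as libreduction

    obj = Series(np.random.randn(10))

    # SeriesGrouper: the trailing `dummy` argument is gone; the grouper now
    # builds its own empty slice internally via series.iloc[:0].
    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)
    result, counts = libreduction.SeriesGrouper(obj, np.mean, labels, 2).get_result()

    # SeriesBinGrouper: same simplification, with bin edges instead of labels.
    bins = np.array([3, 6])
    result, counts = libreduction.SeriesBinGrouper(obj, np.mean, bins).get_result()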