pandas-dev · mroeschke · May 5, 2023 · May 4, 2023 · phofl · May 5, 2023
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -438,7 +438,7 @@ Metadata
 
 Other
 ^^^^^
-- Bug in :class:`FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are presnet (:issue:`52840`)
+- Bug in :meth:`FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are present (:issue:`52840`)
 - Bug in :func:`assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`)
 - Bug in :func:`assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`)
 - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -346,7 +346,7 @@ def kth_smallest(numeric_t[::1] arr, Py_ssize_t k) -> numeric_t:
 def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
     cdef:
         Py_ssize_t i, xi, yi, N, K
-        bint minpv
+        int64_t minpv
         float64_t[:, ::1] result
         ndarray[uint8_t, ndim=2] mask
         int64_t nobs = 0
@@ -357,7 +357,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
     if minp is None:
         minpv = 1
     else:
-        minpv = <int>minp
+        minpv = <int64_t>minp
 
     result = np.empty((K, K), dtype=np.float64)
     mask = np.isfinite(mat).view(np.uint8)

diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
@@ -5,6 +5,8 @@ from pandas._typing import npt
 from pandas import MultiIndex
 from pandas.core.arrays import ExtensionArray
 
+multiindex_nulls_shift: int
+
 class IndexEngine:
     over_size_threshold: bool
     def __init__(self, values: np.ndarray) -> None: ...

diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi
@@ -102,5 +102,5 @@ class BlockValuesRefs:
     referenced_blocks: list[weakref.ref]
     def __init__(self, blk: SharedBlock | None = ...) -> None: ...
     def add_reference(self, blk: SharedBlock) -> None: ...
-    def add_index_reference(self, index: object) -> None: ...
+    def add_index_reference(self, index: Index) -> None: ...
     def has_reference(self) -> bool: ...
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
@@ -966,7 +966,7 @@ cdef class BlockValuesRefs:
 
         Parameters
         ----------
-        index: object
+        index : Index
             The index that the new reference should point to.
         """
         self.referenced_blocks.append(weakref.ref(index))

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2612,7 +2612,7 @@ def maybe_convert_objects(ndarray[object] objects,
                 return tdi._data._ndarray
         seen.object_ = True
 
-    if seen.period_:
+    elif seen.period_:
         if is_period_array(objects):
             from pandas import PeriodIndex
             pi = PeriodIndex(objects)
@@ -2621,7 +2621,7 @@ def maybe_convert_objects(ndarray[object] objects,
             return pi._data
         seen.object_ = True
 
-    if seen.interval_:
+    elif seen.interval_:
         if is_interval_array(objects):
             from pandas import IntervalIndex
             ii = IntervalIndex(objects)
@@ -2631,7 +2631,7 @@ def maybe_convert_objects(ndarray[object] objects,
 
         seen.object_ = True
 
-    if seen.nat_:
+    elif seen.nat_:
         if not seen.object_ and not seen.numeric_ and not seen.bool_:
             # all NaT, None, or nan (at least one NaT)
             # see GH#49340 for discussion of desired behavior

diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -852,9 +852,7 @@ def _constructor_sliced(self):
 
 
 class SubclassedCategorical(Categorical):
-    @property
-    def _constructor(self):
-        return SubclassedCategorical
+    pass
 
 
 def _make_skipna_wrapper(alternative, skipna_alternative=None):

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1240,7 +1240,7 @@ def take(
     if not is_array_like(arr):
         arr = np.asarray(arr)
 
-    indices = np.asarray(indices, dtype=np.intp)
+    indices = ensure_platform_int(indices)
 
     if allow_fill:
         # Pandas style, -1 means NA

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -1086,15 +1086,10 @@ def agg(self):
         result = super().agg()
         if result is None:
             f = self.f
-            kwargs = self.kwargs
 
             # string, list-like, and dict-like are entirely handled in super
             assert callable(f)
 
-            # we can be called from an inner function which
-            # passes this meta-data
-            kwargs.pop("_level", None)
-
             # try a regular apply, this evaluates lambdas
             # row-by-row; however if the lambda is expected a Series
             # expression, e.g.: lambda x: x-x.quantile(0.25)

diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py
@@ -138,7 +138,7 @@ def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other):
     if values.dtype == object:
         dtype, _ = infer_dtype_from(other)
 
-        if isinstance(dtype, np.dtype) and dtype.kind in "mM":
+        if lib.is_np_dtype(dtype, "mM"):
             # https://github.com/numpy/numpy/issues/12550
             #  timedelta64 will incorrectly cast to int
             if not is_list_like(other):

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2022,7 +2022,7 @@ def _validate_listlike(self, value):
                     "Cannot set a Categorical with another, "
                     "without identical categories"
                 )
-            # is_dtype_equal implies categories_match_up_to_permutation
+            # dtype equality implies categories_match_up_to_permutation
             value = self._encode_with_my_categories(value)
             return value._codes
 

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -63,7 +63,6 @@
 
 from pandas.core.arrays import datetimelike as dtl
 from pandas.core.arrays._ranges import generate_regular_range
-from pandas.core.arrays.sparse.dtype import SparseDtype
 import pandas.core.common as com
 
 from pandas.tseries.frequencies import get_period_alias
@@ -2035,11 +2034,7 @@ def _sequence_to_dt64ns(
     if out_unit is not None:
         out_dtype = np.dtype(f"M8[{out_unit}]")
 
-    if (
-        data_dtype == object
-        or is_string_dtype(data_dtype)
-        or isinstance(data_dtype, SparseDtype)
-    ):
+    if data_dtype == object or is_string_dtype(data_dtype):
         # TODO: We do not have tests specific to string-dtypes,
         #  also complex or categorical or other extension
         copy = False

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -48,7 +48,6 @@
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import BaseMaskedDtype
-from pandas.core.dtypes.inference import is_array_like
 from pandas.core.dtypes.missing import (
     array_equivalent,
     is_valid_na_for_dtype,
@@ -172,20 +171,13 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any:
 
         return type(self)(self._data[item], newmask)
 
-    @doc(ExtensionArray.fillna)
     @doc(ExtensionArray.fillna)
     def fillna(self, value=None, method=None, limit: int | None = None) -> Self:
         value, method = validate_fillna_kwargs(value, method)
 
         mask = self._mask
 
-        if is_array_like(value):
-            if len(value) != len(self):
-                raise ValueError(
-                    f"Length of 'value' does not match. Got ({len(value)}) "
-                    f" expected {len(self)}"
-                )
-            value = value[mask]
+        value = missing.check_value_size(value, mask, len(self))
 
         if mask.any():
             if method is not None:

diff --git a/pandas/core/flags.py b/pandas/core/flags.py
@@ -1,7 +1,11 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
 import weakref
 
+if TYPE_CHECKING:
+    from pandas.core.generic import NDFrame
+
 
 class Flags:
     """
@@ -44,9 +48,9 @@ class Flags:
     <Flags(allows_duplicate_labels=True)>
     """
 
-    _keys = {"allows_duplicate_labels"}
+    _keys: set[str] = {"allows_duplicate_labels"}
 
-    def __init__(self, obj, *, allows_duplicate_labels) -> None:
+    def __init__(self, obj: NDFrame, *, allows_duplicate_labels: bool) -> None:
         self._allows_duplicate_labels = allows_duplicate_labels
         self._obj = weakref.ref(obj)
 
@@ -95,21 +99,21 @@ def allows_duplicate_labels(self, value: bool) -> None:
 
         self._allows_duplicate_labels = value
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str):
         if key not in self._keys:
             raise KeyError(key)
 
         return getattr(self, key)
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value) -> None:
         if key not in self._keys:
             raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}")
         setattr(self, key, value)
 
     def __repr__(self) -> str:
         return f"<Flags(allows_duplicate_labels={self.allows_duplicate_labels})>"
 
-    def __eq__(self, other):
+    def __eq__(self, other) -> bool:
         if isinstance(other, type(self)):
             return self.allows_duplicate_labels == other.allows_duplicate_labels
         return False
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6530,7 +6530,7 @@ def sort_values(
         axis: Axis = ...,
         ascending=...,
         inplace: Literal[True],
-        kind: str = ...,
+        kind: SortKind = ...,
         na_position: str = ...,
         ignore_index: bool = ...,
         key: ValueKeyFunc = ...,
@@ -6544,7 +6544,7 @@ def sort_values(
         axis: Axis = 0,
         ascending: bool | list[bool] | tuple[bool, ...] = True,
         inplace: bool = False,
-        kind: str = "quicksort",
+        kind: SortKind = "quicksort",
         na_position: str = "last",
         ignore_index: bool = False,
         key: ValueKeyFunc = None,

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -6206,7 +6206,7 @@ def _check_inplace_setting(self, value) -> bool_t:
         """check whether we allow in-place setting with this type of value"""
         if self._is_mixed_type and not self._mgr.is_numeric_mixed_type:
             # allow an actual np.nan through
-            if is_float(value) and np.isnan(value) or value is lib.no_default:
+            if (is_float(value) and np.isnan(value)) or value is lib.no_default:
                 return True
 
             raise TypeError(

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -2868,8 +2868,9 @@ def fillna(self, value=None, downcast=None):
         DataFrame.fillna : Fill NaN values of a DataFrame.
         Series.fillna : Fill NaN Values of a Series.
         """
+        if not is_scalar(value):
+            raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}")
 
-        value = self._require_scalar(value)
         if self.hasnans:
             result = self.putmask(self._isnan, value)
             if downcast is None:
@@ -3211,7 +3212,7 @@ def union(self, other, sort=None):
 
         elif not len(other) or self.equals(other):
             # NB: whether this (and the `if not len(self)` check below) come before
-            #  or after the is_dtype_equal check above affects the returned dtype
+            #  or after the dtype equality check above affects the returned dtype
             result = self._get_reconciled_name_object(other)
             if sort is True:
                 return result.sort_values()
@@ -5119,16 +5120,6 @@ def _validate_fill_value(self, value):
             raise TypeError
         return value
 
-    @final
-    def _require_scalar(self, value):
-        """
-        Check that this is a scalar value that we can use for setitem-like
-        operations without changing dtype.
-        """
-        if not is_scalar(value):
-            raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}")
-        return value
-
     def _is_memory_usage_qualified(self) -> bool:
         """
         Return a boolean if we need a qualified .info display.

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -1116,11 +1116,7 @@ def _engine(self):
         # calculating the indexer are shifted to 0
         sizes = np.ceil(
             np.log2(
-                [
-                    len(level)
-                    + libindex.multiindex_nulls_shift  # type: ignore[attr-defined]
-                    for level in self.levels
-                ]
+                [len(level) + libindex.multiindex_nulls_shift for level in self.levels]
             )
         )
 

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -309,11 +309,7 @@ def should_store(self, value: ArrayLike) -> bool:
         -------
         bool
         """
-        # faster equivalent to is_dtype_equal(value.dtype, self.dtype)
-        try:
-            return value.dtype == self.dtype
-        except TypeError:
-            return False
+        return value.dtype == self.dtype
 
     # ---------------------------------------------------------------------
     # Apply/Reduce and Helpers

diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py
@@ -4,11 +4,15 @@
 from __future__ import annotations
 
 import operator
+from typing import TYPE_CHECKING
 
 import numpy as np
 
+if TYPE_CHECKING:
+    from pandas._typing import npt
 
-def invalid_comparison(left, right, op) -> np.ndarray:
+
+def invalid_comparison(left, right, op) -> npt.NDArray[np.bool_]:
     """
     If a comparison has mismatched types and is not necessarily meaningful,
     follow python3 conventions by:

diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py
@@ -27,16 +27,10 @@
 
 import numpy as np
 
-from pandas.core.dtypes.common import (
-    is_float_dtype,
-    is_integer_dtype,
-    is_scalar,
-)
-
 from pandas.core import roperator
 
 
-def _fill_zeros(result, x, y):
+def _fill_zeros(result: np.ndarray, x, y):
     """
     If this is a reversed op, then flip x,y
 
@@ -46,11 +40,11 @@ def _fill_zeros(result, x, y):
 
     Mask the nan's from x.
     """
-    if is_float_dtype(result.dtype):
+    if result.dtype.kind == "f":
         return result
 
     is_variable_type = hasattr(y, "dtype")
-    is_scalar_type = is_scalar(y)
+    is_scalar_type = not isinstance(y, np.ndarray)
 
     if not is_variable_type and not is_scalar_type:
         # e.g. test_series_ops_name_retention with mod we get here with list/tuple
@@ -59,7 +53,7 @@ def _fill_zeros(result, x, y):
     if is_scalar_type:
         y = np.array(y)
 
-    if is_integer_dtype(y.dtype):
+    if y.dtype.kind in "iu":
         ymask = y == 0
         if ymask.any():
             # GH#7325, mask and nans must be broadcastable
@@ -143,7 +137,9 @@ def dispatch_fill_zeros(op, left, right, result):
     ----------
     op : function (operator.add, operator.div, ...)
     left : object (np.ndarray for non-reversed ops)
+        ExtensionArrays have already been excluded by this point.
     right : object (np.ndarray for reversed ops)
+        ExtensionArrays have already been excluded by this point.
     result : ndarray
 
     Returns