diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index dc9576bc94d4c..0cbc300ee2fc4 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -1,5 +1,5 @@ """ -Benchmarks in this fiel depend exclusively on code in _libs/ +Benchmarks in this file depend exclusively on code in _libs/ If a PR does not edit anything in _libs, it is very unlikely that benchmarks in this file will be affected. diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 59b952cd46c56..0450a3483d346 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -6,9 +6,9 @@ from pandas._typing import npt def group_median_float64( out: np.ndarray, # ndarray[float64_t, ndim=2] - counts: np.ndarray, # ndarray[int64_t] + counts: npt.NDArray[np.int64], values: np.ndarray, # ndarray[float64_t, ndim=2] - labels: np.ndarray, # ndarray[int64_t] + labels: npt.NDArray[np.int64], min_count: int = ..., # Py_ssize_t ) -> None: ... def group_cumprod_float64( @@ -37,7 +37,7 @@ def group_fillna_indexer( out: np.ndarray, # ndarray[intp_t] labels: np.ndarray, # ndarray[int64_t] sorted_labels: npt.NDArray[np.intp], - mask: np.ndarray, # ndarray[uint8_t] + mask: npt.NDArray[np.uint8], direction: Literal["ffill", "bfill"], limit: int, # int64_t dropna: bool, diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index c6319a1f24435..bf7df5776896b 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -5,6 +5,8 @@ from typing import ( import numpy as np +from pandas._typing import npt + def unique_label_indices( labels: np.ndarray, # const int64_t[:] ) -> np.ndarray: ... @@ -19,11 +21,11 @@ class ObjectFactorizer(Factorizer): uniques: ObjectVector def factorize( self, - values: np.ndarray, # ndarray[object] + values: npt.NDArray[np.object_], sort: bool = ..., na_sentinel=..., na_value=..., - ) -> np.ndarray: ... # np.ndarray[intp] + ) -> npt.NDArray[np.intp]: ... class Int64Factorizer(Factorizer): table: Int64HashTable @@ -34,77 +36,77 @@ class Int64Factorizer(Factorizer): sort: bool = ..., na_sentinel=..., na_value=..., - ) -> np.ndarray: ... # np.ndarray[intp] + ) -> npt.NDArray[np.intp]: ... class Int64Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.int64] + def to_array(self) -> npt.NDArray[np.int64]: ... class Int32Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.int32] + def to_array(self) -> npt.NDArray[np.int32]: ... class Int16Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.int16] + def to_array(self) -> npt.NDArray[np.int16]: ... class Int8Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.int8] + def to_array(self) -> npt.NDArray[np.int8]: ... class UInt64Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint64] + def to_array(self) -> npt.NDArray[np.uint64]: ... class UInt32Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint32] + def to_array(self) -> npt.NDArray[np.uint32]: ... class UInt16Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint16] + def to_array(self) -> npt.NDArray[np.uint16]: ... class UInt8Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint8] + def to_array(self) -> npt.NDArray[np.uint8]: ... class Float64Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.float64] + def to_array(self) -> npt.NDArray[np.float64]: ... class Float32Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.float32] + def to_array(self) -> npt.NDArray[np.float32]: ... class Complex128Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex128] + def to_array(self) -> npt.NDArray[np.complex128]: ... class Complex64Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex64] + def to_array(self) -> npt.NDArray[np.complex64]: ... class StringVector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[object] + def to_array(self) -> npt.NDArray[np.object_]: ... class ObjectVector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[object] + def to_array(self) -> npt.NDArray[np.object_]: ... class HashTable: # NB: The base HashTable class does _not_ actually have these methods; @@ -131,7 +133,7 @@ class HashTable: def lookup( self, values: np.ndarray, # np.ndarray[subclass-specific] - ) -> np.ndarray: ... # np.ndarray[np.intp] + ) -> npt.NDArray[np.intp]: ... def get_labels( self, values: np.ndarray, # np.ndarray[subclass-specific] @@ -139,14 +141,14 @@ class HashTable: count_prior: int = ..., na_sentinel: int = ..., na_value: object = ..., - ) -> np.ndarray: ... # np.ndarray[intp_t] + ) -> npt.NDArray[np.intp]: ... def unique( self, values: np.ndarray, # np.ndarray[subclass-specific] return_inverse: bool = ..., ) -> tuple[ np.ndarray, # np.ndarray[subclass-specific] - np.ndarray, # np.ndarray[np.intp], + npt.NDArray[np.intp], ] | np.ndarray: ... # np.ndarray[subclass-specific] def _unique( self, @@ -159,7 +161,7 @@ class HashTable: return_inverse: bool = ..., ) -> tuple[ np.ndarray, # np.ndarray[subclass-specific] - np.ndarray, # np.ndarray[np.intp], + npt.NDArray[np.intp], ] | np.ndarray: ... # np.ndarray[subclass-specific] def factorize( self, @@ -167,10 +169,7 @@ class HashTable: na_sentinel: int = ..., na_value: object = ..., mask=..., - ) -> tuple[ - np.ndarray, # np.ndarray[subclass-specific] - np.ndarray, # np.ndarray[np.intp], - ]: ... + ) -> tuple[np.ndarray, npt.NDArray[np.intp],]: ... # np.ndarray[subclass-specific] class Complex128HashTable(HashTable): ... class Complex64HashTable(HashTable): ... @@ -182,10 +181,7 @@ class Int64HashTable(HashTable): def get_labels_groupby( self, values: np.ndarray, # const int64_t[:] - ) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.int64] - ]: ... + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64],]: ... class Int32HashTable(HashTable): ... class Int16HashTable(HashTable): ... @@ -200,32 +196,32 @@ class PyObjectHashTable(HashTable): ... def duplicated_int64( values: np.ndarray, # const int64_t[:] values keep: Literal["last", "first", False] = ..., -) -> np.ndarray: ... # np.ndarray[bool] +) -> npt.NDArray[np.bool_]: ... # TODO: Is it actually bool or is it uint8? def mode_int64( values: np.ndarray, # const int64_t[:] values dropna: bool, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... def value_count_int64( values: np.ndarray, # const int64_t[:] dropna: bool, -) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.int64] # np.ndarray[np.int64] +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... def duplicated( values: np.ndarray, keep: Literal["last", "first", False] = ..., -) -> np.ndarray: ... # np.ndarray[bool] +) -> npt.NDArray[np.bool_]: ... def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ... def value_count( values: np.ndarray, dropna: bool, -) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.int64] +) -> tuple[np.ndarray, npt.NDArray[np.int64],]: ... # np.ndarray[same-as-values] # arr and values should have same dtype def ismember( arr: np.ndarray, values: np.ndarray, -) -> np.ndarray: ... # np.ndarray[bool] +) -> npt.NDArray[np.bool_]: ... def object_hash(obj) -> int: ... def objects_are_equal(a, b) -> bool: ... diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 185cbf6fd91b1..bf1ab88a87f34 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -298,7 +298,6 @@ cdef class IndexEngine: Py_ssize_t i, j, n, n_t, n_alloc bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True - self._ensure_mapping_populated() values = self.values stargets = set(targets) @@ -740,7 +739,6 @@ cdef class BaseMultiIndexCodesEngine: return self._base.get_loc(self, lab_int) def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray: - # target: MultiIndex indexer = self._base.get_indexer_non_unique(self, target) return indexer diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 002473a1a5fb2..105eb1820df8b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -28,6 +28,7 @@ DtypeObj, F, Shape, + npt, ) from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg @@ -1278,7 +1279,13 @@ def where(self, other, cond, errors="raise") -> list[Block]: return result_blocks - def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool): + def _unstack( + self, + unstacker, + fill_value, + new_placement: npt.NDArray[np.intp], + allow_fill: bool, + ): """ Return a list of unstacked blocks of self @@ -1668,7 +1675,13 @@ def where(self, other, cond, errors="raise") -> list[Block]: return [self.make_block_same_class(result)] - def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool): + def _unstack( + self, + unstacker, + fill_value, + new_placement: npt.NDArray[np.intp], + allow_fill: bool, + ): # ExtensionArray-safe unstack. # We override ObjectBlock._unstack, which unstacks directly on the # values of the array. For EA-backed blocks, this would require diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d810bbecc412f..fa09f003bc7b8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2172,10 +2172,16 @@ def _factorize_keys( rizer = klass(max(len(lk), len(rk))) - llab = rizer.factorize(lk) - rlab = rizer.factorize(rk) - assert llab.dtype == np.intp, llab.dtype - assert rlab.dtype == np.intp, rlab.dtype + # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type + # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], + # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" + llab = rizer.factorize(lk) # type: ignore[arg-type] + # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type + # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], + # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" + rlab = rizer.factorize(rk) # type: ignore[arg-type] + assert llab.dtype == np.dtype(np.intp), llab.dtype + assert rlab.dtype == np.dtype(np.intp), rlab.dtype count = rizer.get_count() diff --git a/pandas/core/series.py b/pandas/core/series.py index 7f612df095c4b..a891a7bdd8fbd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -449,7 +449,9 @@ def __init__( self.name = name self._set_axis(0, index, fastpath=True) - def _init_dict(self, data, index=None, dtype: Dtype | None = None): + def _init_dict( + self, data, index: Index | None = None, dtype: DtypeObj | None = None + ): """ Derive the "_mgr" and "index" attributes of a new Series from a dictionary input. @@ -458,9 +460,9 @@ def _init_dict(self, data, index=None, dtype: Dtype | None = None): ---------- data : dict or dict-like Data used to populate the new Series. - index : Index or index-like, default None + index : Index or None, default None Index for the new Series: if None, use dict keys. - dtype : dtype, default None + dtype : np.dtype, ExtensionDtype, or None, default None The dtype for the new Series: if None, infer from data. Returns @@ -468,6 +470,8 @@ def _init_dict(self, data, index=None, dtype: Dtype | None = None): _data : BlockManager for the new Series index : index for the new Series """ + keys: Index | tuple + # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] # raises KeyError), so we iterate the entire dict, and align if data: diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 80fbe809099a9..a8b05e3178197 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -32,6 +32,7 @@ def df(): # TODO(ArrayManager) dask is still accessing the blocks # https://github.com/dask/dask/pull/7318 @td.skip_array_manager_not_yet_implemented +@pytest.mark.filterwarnings("ignore:.*64Index is deprecated:FutureWarning") def test_dask(df): toolz = import_module("toolz") # noqa @@ -92,6 +93,7 @@ def test_oo_optimized_datetime_index_unpickle(): # Cython import warning @pytest.mark.filterwarnings("ignore:pandas.util.testing is deprecated") @pytest.mark.filterwarnings("ignore:can't:ImportWarning") +@pytest.mark.filterwarnings("ignore:.*64Index is deprecated:FutureWarning") @pytest.mark.filterwarnings( # patsy needs to update their imports "ignore:Using or importing the ABCs from 'collections:DeprecationWarning"