From 25c6cfd607888baaa930889af87297e934061ba2 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Sep 2021 14:06:34 -0700 Subject: [PATCH 1/7] TYP: groupby --- pandas/core/groupby/ops.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 07353863cdc0b..14d7179751ad7 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -807,7 +807,7 @@ def is_monotonic(self) -> bool: return Index(self.group_info[0]).is_monotonic @cache_readonly - def group_info(self): + def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64], int]: comp_ids, obs_group_ids = self._get_compressed_codes() ngroups = len(obs_group_ids) @@ -817,22 +817,26 @@ def group_info(self): @final @cache_readonly - def codes_info(self) -> np.ndarray: + def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis ids, _, _ = self.group_info if self.indexer is not None: sorter = np.lexsort((ids, self.indexer)) ids = ids[sorter] + ids = ensure_platform_int(ids) + # TODO: if numpy annotates np.lexsort, this ensure_platform_int + # may become unnecessary return ids @final - def _get_compressed_codes(self) -> tuple[np.ndarray, np.ndarray]: + def _get_compressed_codes(self) -> tuple[np.ndarray, npt.NDArray[np.int64]]: + # The first returned ndarray may have any signed integer dtype if len(self.groupings) > 1: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self._sort) ping = self.groupings[0] - return ping.codes, np.arange(len(ping.group_index)) + return ping.codes, np.arange(len(ping.group_index), dtype=np.int64) @final @cache_readonly @@ -1017,7 +1021,7 @@ class BinGrouper(BaseGrouper): """ - bins: np.ndarray # np.ndarray[np.int64] + bins: npt.NDArray[np.int64] binlabels: Index mutated: bool @@ -1101,7 +1105,7 @@ def indices(self): return indices @cache_readonly - def group_info(self): + def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64], int]: ngroups = self.ngroups obs_group_ids = np.arange(ngroups, dtype=np.int64) rep = np.diff(np.r_[0, self.bins]) From 882c1436912ba6e2ee643f7990bcbc3478f0fc74 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 19 Sep 2021 12:30:47 -0700 Subject: [PATCH 2/7] TYP: groupby --- pandas/_libs/lib.pyi | 2 +- pandas/core/groupby/grouper.py | 3 ++- pandas/core/groupby/ops.py | 2 +- pandas/core/sorting.py | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index e60250169edd1..87fe20214713e 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -197,7 +197,7 @@ def indices_fast( labels: np.ndarray, # const int64_t[:] keys: list, sorted_labels: list[npt.NDArray[np.int64]], -) -> dict: ... +) -> dict[Hashable, npt.NDArray[np.intp]]: ... def generate_slices( labels: np.ndarray, ngroups: int # const intp_t[:] ) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 02f2024091877..7577b1e671d60 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -17,6 +17,7 @@ from pandas._typing import ( ArrayLike, NDFrameT, + npt, ) from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly @@ -604,7 +605,7 @@ def ngroups(self) -> int: return len(self.group_index) @cache_readonly - def indices(self): + def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: # we have a list of groupers if isinstance(self.grouping_vector, ops.BaseGrouper): return self.grouping_vector.indices diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 14d7179751ad7..1cde1da731837 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -756,7 +756,7 @@ def apply( return result_values, mutated @cache_readonly - def indices(self): + def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): # This shows unused categories in indices GH#38642 diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index befa67350e182..ccb51a0ea2132 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Callable, DefaultDict, + Hashable, Iterable, Sequence, ) @@ -576,7 +577,7 @@ def get_flattened_list( def get_indexer_dict( label_list: list[np.ndarray], keys: list[Index] -) -> dict[str | tuple, np.ndarray]: +) -> dict[Hashable, npt.NDArray[np.intp]]: """ Returns ------- From 82d5ff59a4e848579c82f7c7465719a12f8acb6a Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 19 Sep 2021 12:51:52 -0700 Subject: [PATCH 3/7] missing import --- pandas/_libs/lib.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 87fe20214713e..b88a2e4c28cfb 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -5,6 +5,7 @@ from typing import ( Any, Callable, Generator, + Hashable, Literal, overload, ) From 9da17fb76d58a2595fc5e1f110c13e8ea0e01605 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 23 Sep 2021 11:21:14 -0700 Subject: [PATCH 4/7] TYP blocks --- pandas/core/internals/blocks.py | 39 ++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 002473a1a5fb2..2279dbd283905 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -28,6 +28,7 @@ DtypeObj, F, Shape, + npt, ) from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg @@ -294,7 +295,7 @@ def __repr__(self) -> str: def __len__(self) -> int: return len(self.values) - def _slice(self, slicer): + def _slice(self, slicer) -> ArrayLike: """return a slice of my values""" return self.values[slicer] @@ -343,7 +344,7 @@ def dtype(self) -> DtypeObj: def iget(self, i): return self.values[i] - def set_inplace(self, locs, values): + def set_inplace(self, locs, values) -> None: """ Modify block values in-place with new item value. @@ -562,13 +563,13 @@ def _downcast_2d(self) -> list[Block]: return [self.make_block(new_values)] @final - def astype(self, dtype, copy: bool = False, errors: str = "raise"): + def astype(self, dtype: DtypeObj, copy: bool = False, errors: str = "raise"): """ Coerce to the new dtype. Parameters ---------- - dtype : str, dtype convertible + dtype : np.dtype or ExtensionDtype copy : bool, default False copy if indicated errors : str, {'raise', 'ignore'}, default 'raise' @@ -1278,7 +1279,13 @@ def where(self, other, cond, errors="raise") -> list[Block]: return result_blocks - def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool): + def _unstack( + self, + unstacker, + fill_value, + new_placement: npt.NDArray[np.intp], + allow_fill: bool, + ): """ Return a list of unstacked blocks of self @@ -1434,7 +1441,7 @@ def iget(self, col): raise IndexError(f"{self} only contains one item") return self.values - def set_inplace(self, locs, values): + def set_inplace(self, locs, values) -> None: # NB: This is a misnomer, is supposed to be inplace but is not, # see GH#33457 assert locs.tolist() == [0] @@ -1502,7 +1509,7 @@ def setitem(self, indexer, value): # https://github.com/pandas-dev/pandas/issues/24020 # Need a dedicated setitem until GH#24020 (type promotion in setitem # for extension arrays) is designed and implemented. - return self.astype(object).setitem(indexer, value) + return self.astype(_dtype_obj).setitem(indexer, value) if isinstance(indexer, tuple): # TODO(EA2D): not needed with 2D EAs @@ -1540,7 +1547,7 @@ def take_nd( return self.make_block_same_class(new_values, new_mgr_locs) - def _slice(self, slicer): + def _slice(self, slicer) -> ExtensionArray: """ Return a slice of my values. @@ -1551,7 +1558,7 @@ def _slice(self, slicer): Returns ------- - np.ndarray or ExtensionArray + ExtensionArray """ # return same dims as we currently have if not isinstance(slicer, tuple) and self.ndim == 2: @@ -1668,7 +1675,13 @@ def where(self, other, cond, errors="raise") -> list[Block]: return [self.make_block_same_class(result)] - def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool): + def _unstack( + self, + unstacker, + fill_value, + new_placement: npt.NDArray[np.intp], + allow_fill: bool, + ): # ExtensionArray-safe unstack. # We override ObjectBlock._unstack, which unstacks directly on the # values of the array. For EA-backed blocks, this would require @@ -1723,7 +1736,7 @@ def is_view(self) -> bool: def setitem(self, indexer, value): if not self._can_hold_element(value): # TODO: general case needs casting logic. - return self.astype(object).setitem(indexer, value) + return self.astype(_dtype_obj).setitem(indexer, value) values = self.values if self.ndim > 1: @@ -1737,7 +1750,7 @@ def putmask(self, mask, new) -> list[Block]: mask = extract_bool_array(mask) if not self._can_hold_element(new): - return self.astype(object).putmask(mask, new) + return self.astype(_dtype_obj).putmask(mask, new) arr = self.values arr.T.putmask(mask, new) @@ -1795,7 +1808,7 @@ def fillna( # We support filling a DatetimeTZ with a `value` whose timezone # is different by coercing to object. # TODO: don't special-case td64 - return self.astype(object).fillna(value, limit, inplace, downcast) + return self.astype(_dtype_obj).fillna(value, limit, inplace, downcast) values = self.values values = values if inplace else values.copy() From 86f29b3b1652d6df04ca20c3a672b288b68e4b38 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Sep 2021 20:31:13 -0700 Subject: [PATCH 5/7] 32bit fixup --- pandas/core/groupby/ops.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1cde1da731837..ece704d98046e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -807,7 +807,7 @@ def is_monotonic(self) -> bool: return Index(self.group_info[0]).is_monotonic @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64], int]: + def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: comp_ids, obs_group_ids = self._get_compressed_codes() ngroups = len(obs_group_ids) @@ -829,14 +829,14 @@ def codes_info(self) -> npt.NDArray[np.intp]: return ids @final - def _get_compressed_codes(self) -> tuple[np.ndarray, npt.NDArray[np.int64]]: + def _get_compressed_codes(self) -> tuple[np.ndarray, npt.NDArray[np.intp]]: # The first returned ndarray may have any signed integer dtype if len(self.groupings) > 1: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self._sort) ping = self.groupings[0] - return ping.codes, np.arange(len(ping.group_index), dtype=np.int64) + return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) @final @cache_readonly @@ -1105,9 +1105,9 @@ def indices(self): return indices @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64], int]: + def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: ngroups = self.ngroups - obs_group_ids = np.arange(ngroups, dtype=np.int64) + obs_group_ids = np.arange(ngroups, dtype=np.intp) rep = np.diff(np.r_[0, self.bins]) rep = ensure_platform_int(rep) From d3e4e7874003fad5817db5fdb7047ef2d3674902 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 30 Sep 2021 09:02:07 -0700 Subject: [PATCH 6/7] win32 compat --- pandas/tests/groupby/test_grouping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 527b93a28359c..394352036e50a 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -688,7 +688,7 @@ def test_groupby_empty(self): ) tm.assert_numpy_array_equal( - gr.grouper.group_info[1], np.array([], dtype=np.dtype("int")) + gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.int64)) ) assert gr.grouper.group_info[2] == 0 From 3f76c43bbaf5c45d81849a7a5b1cd61c71fc2563 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 30 Sep 2021 11:09:10 -0700 Subject: [PATCH 7/7] 32bit fixup --- pandas/tests/groupby/test_grouping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 394352036e50a..efb0b82f58e97 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -688,7 +688,7 @@ def test_groupby_empty(self): ) tm.assert_numpy_array_equal( - gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.int64)) + gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) ) assert gr.grouper.group_info[2] == 0