From f3c76f034a2ecb57d1cd05a30f969ffb90c41ab5 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 24 Jan 2022 12:34:19 -0800 Subject: [PATCH 1/8] CLN: typo fixups, avoid warnings in test_eval --- pandas/core/arrays/_mixins.py | 4 ++-- pandas/core/arrays/datetimes.py | 1 - pandas/core/internals/blocks.py | 17 ++++++++++------- pandas/tests/computation/test_eval.py | 2 +- pandas/tests/io/parser/test_unsupported.py | 2 +- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index a40be5a988f26..3446d5fc43a65 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -180,7 +180,7 @@ def _values_for_argsort(self) -> np.ndarray: def argmin(self, axis: int = 0, skipna: bool = True): # type:ignore[override] # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") - if not skipna and self.isna().any(): + if not skipna and self._hasna: raise NotImplementedError return nargminmax(self, "argmin", axis=axis) @@ -188,7 +188,7 @@ def argmin(self, axis: int = 0, skipna: bool = True): # type:ignore[override] def argmax(self, axis: int = 0, skipna: bool = True): # type:ignore[override] # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") - if not skipna and self.isna().any(): + if not skipna and self._hasna: raise NotImplementedError return nargminmax(self, "argmax", axis=axis) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 87acadc01faad..9c262fa37d760 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2289,7 +2289,6 @@ def maybe_convert_dtype(data, copy: bool): copy = False elif is_extension_array_dtype(data.dtype) and not is_datetime64tz_dtype(data.dtype): - # Includes categorical # TODO: We have no tests for these data = np.array(data, dtype=np.object_) copy = False diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 
3d4f53530b89c..93696b0da97d8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -367,14 +367,16 @@ def iget(self, i: int | tuple[int, int] | tuple[slice, int]): # "Union[int, integer[Any]]" return self.values[i] # type: ignore[index] - def set_inplace(self, locs, values) -> None: + def set_inplace(self, locs, values: ArrayLike) -> None: """ Modify block values in-place with new item value. Notes ----- - `set` never creates a new array or new Block, whereas `setitem` _may_ - create a new array and always creates a new Block. + `set_inplace` never creates a new array or new Block, whereas `setitem` + _may_ create a new array and always creates a new Block. + + Caller is responsible for checking values.dtype == self.dtype. """ self.values[locs] = values @@ -1183,7 +1185,7 @@ def where(self, other, cond) -> list[Block]: icond, noop = validate_putmask(values, ~cond) if noop: # GH-39595: Always return a copy; short-circuit up/downcasting - return self.copy() + return [self.copy()] if other is lib.no_default: other = self.fill_value @@ -1375,7 +1377,8 @@ def setitem(self, indexer, value): values = self.values if values.ndim == 2: - # TODO: string[pyarrow] tests break if we transpose unconditionally + # TODO(GH#45419): string[pyarrow] tests break if we transpose + # unconditionally values = values.T check_setitem_lengths(indexer, value, values) values[indexer] = value @@ -1396,7 +1399,7 @@ def where(self, other, cond) -> list[Block]: if noop: # GH#44181, GH#45135 # Avoid a) raising for Interval/PeriodDtype and b) unnecessary object upcast - return self.copy() + return [self.copy()] try: res_values = arr._where(cond, other).T @@ -1597,7 +1600,7 @@ def iget(self, i: int | tuple[int, int] | tuple[slice, int]): raise IndexError(f"{self} only contains one item") return self.values - def set_inplace(self, locs, values) -> None: + def set_inplace(self, locs, values: ArrayLike) -> None: # NB: This is a misnomer, is supposed to be inplace but is 
not, # see GH#33457 # When an ndarray, we should have locs.tolist() == [0] diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 94b1f5d2717a4..bf74e11ae247a 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -797,7 +797,7 @@ def _is_datetime(x): def should_warn(*args): - not_mono = not any(map(operator.attrgetter("is_monotonic"), args)) + not_mono = not any(map(operator.attrgetter("is_monotonic_increasing"), args)) only_one_dt = reduce(operator.xor, map(_is_datetime, args)) return not_mono and only_one_dt diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index e223378b600e0..67f155817582c 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -166,7 +166,7 @@ def test_on_bad_lines_callable_python_only(self, all_parsers): parser.read_csv(sio, on_bad_lines=bad_lines_func) -def test_close_file_handle_on_invalide_usecols(all_parsers): +def test_close_file_handle_on_invalid_usecols(all_parsers): # GH 45384 parser = all_parsers From 75700a93a4e78223cc0f1faa7cda74117d68ca57 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 25 Jan 2022 11:52:16 -0800 Subject: [PATCH 2/8] check only needed in older numpy --- pandas/core/array_algos/putmask.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index daf7d0bd3f213..1082f8d71af01 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -12,6 +12,7 @@ ArrayLike, npt, ) +from pandas.compat import np_version_under1p20 from pandas.core.dtypes.cast import ( can_hold_element, @@ -126,7 +127,8 @@ def putmask_without_repeat( mask : np.ndarray[bool] new : Any """ - new = setitem_datetimelike_compat(values, mask.sum(), new) + if np_version_under1p20: + new = setitem_datetimelike_compat(values, mask.sum(), new) if 
getattr(new, "ndim", 0) >= 1: new = new.astype(values.dtype, copy=False) From 57839655daa83fb38b4f63fe2bb9851fb3728207 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 27 Jan 2022 10:46:24 -0800 Subject: [PATCH 3/8] suppress warnings --- pandas/util/_test_decorators.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 33bde4e69b042..78ef335adf948 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -80,6 +80,12 @@ def safe_import(mod_name: str, min_version: str | None = None): message=".*Int64Index.*", ) + warnings.filterwarnings( + "ignore", + category=DeprecationWarning, + message="distutils Version classes are deprecated.*", + ) + try: mod = __import__(mod_name) except ImportError: From 9dd3b705dac5ae4d7d0951eda5364aeee89d47a9 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 27 Jan 2022 11:16:06 -0800 Subject: [PATCH 4/8] mypy fixup --- pandas/core/internals/blocks.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 93696b0da97d8..accacbf464434 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1605,7 +1605,11 @@ def set_inplace(self, locs, values: ArrayLike) -> None: # see GH#33457 # When an ndarray, we should have locs.tolist() == [0] # When a BlockPlacement we should have list(locs) == [0] - self.values = values + + # error: Incompatible types in assignment (expression has type + # "Union[ExtensionArray, ndarray[Any, Any]]", variable has type + # "ExtensionArray") + self.values = values # type: ignore[assignment] try: # TODO(GH33457) this can be removed self._cache.clear() From 6a4d26776bedad91e2a6ca6515ed73e51b0cc4a1 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 18 Feb 2022 19:06:56 -0800 Subject: [PATCH 5/8] TYP/CLN: assorted --- pandas/core/internals/array_manager.py | 5 +++-- pandas/core/internals/blocks.py | 15 
+++++---------- pandas/core/internals/managers.py | 4 ++-- pandas/core/util/hashing.py | 19 +++++++++++++------ pandas/tests/frame/indexing/test_where.py | 14 ++------------ pandas/tests/indexing/test_loc.py | 6 +++--- 6 files changed, 28 insertions(+), 35 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index c7ec4f35e0ff1..3e499c99ac144 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -21,6 +21,7 @@ from pandas._typing import ( ArrayLike, DtypeObj, + npt, ) from pandas.util._validators import validate_bool_kwarg @@ -568,7 +569,7 @@ def reindex_indexer( def _reindex_indexer( self: T, new_axis, - indexer, + indexer: npt.NDArray[np.intp] | None, axis: int, fill_value=None, allow_dups: bool = False, @@ -579,7 +580,7 @@ def _reindex_indexer( Parameters ---------- new_axis : Index - indexer : ndarray of int64 or None + indexer : ndarray[intp] or None axis : int fill_value : object, default None allow_dups : bool, default False diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5acba7960111e..57cc88613cb7d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -87,7 +87,6 @@ replace_regex, should_use_regex, ) -from pandas.core.array_algos.take import take_nd from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, @@ -827,19 +826,14 @@ def set_inplace(self, locs, values: ArrayLike) -> None: def take_nd( self, - indexer, + indexer: npt.NDArray[np.intp], axis: int, new_mgr_locs: BlockPlacement | None = None, fill_value=lib.no_default, ) -> Block: """ - Take values according to indexer and return them as a block.bb - + Take values according to indexer and return them as a block. 
""" - # algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock - # so need to preserve types - # sparse is treated like an ndarray, but needs .get_values() shaping - values = self.values if fill_value is lib.no_default: @@ -848,6 +842,7 @@ def take_nd( else: allow_fill = True + # Note: algos.take_nd has upcast logic similar to coerce_to_target_dtype new_values = algos.take_nd( values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value ) @@ -1727,7 +1722,7 @@ def is_numeric(self): def take_nd( self, - indexer, + indexer: npt.NDArray[np.intp], axis: int = 0, new_mgr_locs: BlockPlacement | None = None, fill_value=lib.no_default, @@ -2259,7 +2254,7 @@ def to_native_types( """convert to our native types format""" if isinstance(values, Categorical): # GH#40754 Convert categorical datetimes to datetime array - values = take_nd( + values = algos.take_nd( values.categories._values, ensure_platform_int(values._codes), fill_value=na_rep, diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6297a7578ccd4..4b8f1aae75b1b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -597,7 +597,7 @@ def consolidate(self: T) -> T: def reindex_indexer( self: T, new_axis: Index, - indexer, + indexer: npt.NDArray[np.intp] | None, axis: int, fill_value=None, allow_dups: bool = False, @@ -610,7 +610,7 @@ def reindex_indexer( Parameters ---------- new_axis : Index - indexer : ndarray of int64 or None + indexer : ndarray[intp] or None axis : int fill_value : object, default None allow_dups : bool, default False diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 79db60ab5a7ce..5a5e46e0227aa 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -16,7 +16,10 @@ from pandas._libs import lib from pandas._libs.hashing import hash_object_array -from pandas._typing import ArrayLike +from pandas._typing import ( + ArrayLike, + npt, +) from 
pandas.core.dtypes.common import ( is_categorical_dtype, @@ -44,7 +47,9 @@ _default_hash_key = "0123456789123456" -def combine_hash_arrays(arrays: Iterator[np.ndarray], num_items: int) -> np.ndarray: +def combine_hash_arrays( + arrays: Iterator[np.ndarray], num_items: int +) -> npt.NDArray[np.uint64]: """ Parameters ---------- @@ -172,7 +177,7 @@ def hash_tuples( vals: MultiIndex | Iterable[tuple[Hashable, ...]], encoding: str = "utf8", hash_key: str = _default_hash_key, -) -> np.ndarray: +) -> npt.NDArray[np.uint64]: """ Hash an MultiIndex / listlike-of-tuples efficiently. @@ -214,7 +219,9 @@ def hash_tuples( return h -def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndarray: +def _hash_categorical( + cat: Categorical, encoding: str, hash_key: str +) -> npt.NDArray[np.uint64]: """ Hash a Categorical by hashing its categories, and then mapping the codes to the hashes @@ -257,7 +264,7 @@ def hash_array( encoding: str = "utf8", hash_key: str = _default_hash_key, categorize: bool = True, -) -> np.ndarray: +) -> npt.NDArray[np.uint64]: """ Given a 1d array, return an array of deterministic integers. @@ -306,7 +313,7 @@ def _hash_ndarray( encoding: str = "utf8", hash_key: str = _default_hash_key, categorize: bool = True, -) -> np.ndarray: +) -> npt.NDArray[np.uint64]: """ See hash_array.__doc__. 
""" diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index ca050a7d7db4a..46d3b00097337 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -769,25 +769,15 @@ def test_where_try_cast_deprecated(frame_or_series): obj.where(mask, -1, try_cast=False) -def test_where_int_downcasting_deprecated(using_array_manager, request): +def test_where_int_downcasting_deprecated(): # GH#44597 - if not using_array_manager: - mark = pytest.mark.xfail( - reason="After fixing a bug in can_hold_element, we don't go through " - "the deprecated path, and also up-cast both columns to int32 " - "instead of just 1." - ) - request.node.add_marker(mark) arr = np.arange(6).astype(np.int16).reshape(3, 2) df = DataFrame(arr) mask = np.zeros(arr.shape, dtype=bool) mask[:, 0] = True - msg = "Downcasting integer-dtype" - warn = FutureWarning if not using_array_manager else None - with tm.assert_produces_warning(warn, match=msg): - res = df.where(mask, 2**17) + res = df.where(mask, 2**17) expected = DataFrame({0: arr[:, 0], 1: np.array([2**17] * 3, dtype=np.int32)}) tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d16d086a37d3d..4702b5e5c4504 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -44,16 +44,16 @@ @pytest.mark.parametrize( - "series, new_serie, expected_ser", + "series, new_series, expected_ser", [ [[np.nan, np.nan, "b"], ["a", np.nan, np.nan], [False, True, True]], [[np.nan, "b"], ["a", np.nan], [False, True]], ], ) -def test_not_change_nan_loc(series, new_serie, expected_ser): +def test_not_change_nan_loc(series, new_series, expected_ser): # GH 28403 df = DataFrame({"A": series}) - df["A"].loc[:] = new_serie + df["A"].loc[:] = new_series expected = DataFrame({"A": expected_ser}) tm.assert_frame_equal(df.isna(), expected) tm.assert_frame_equal(df.notna(), ~expected) From 
7ae0bad416d287c5176d5f2db293bf21430ada1f Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 22 Feb 2022 11:28:00 -0800 Subject: [PATCH 6/8] TYP: assorted --- pandas/core/groupby/ops.py | 8 ++++---- pandas/core/reshape/merge.py | 3 +-- pandas/core/sorting.py | 31 +++++++++++++++-------------- pandas/core/strings/object_array.py | 2 +- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index cf046d92dd6f3..89de9d284ecef 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -437,8 +437,8 @@ def _cython_op_ndim_compat( min_count: int, ngroups: int, comp_ids: np.ndarray, - mask: np.ndarray | None = None, - result_mask: np.ndarray | None = None, + mask: npt.NDArray[np.bool_] | None = None, + result_mask: npt.NDArray[np.bool_] | None = None, **kwargs, ) -> np.ndarray: if values.ndim == 1: @@ -481,8 +481,8 @@ def _call_cython_op( min_count: int, ngroups: int, comp_ids: np.ndarray, - mask: np.ndarray | None, - result_mask: np.ndarray | None, + mask: npt.NDArray[np.bool_] | None, + result_mask: npt.NDArray[np.bool_] | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 44ead165fec25..46f74c5fc67e7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2276,14 +2276,13 @@ def _factorize_keys( def _sort_labels( - uniques: np.ndarray, left: np.ndarray, right: np.ndarray + uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: llength = len(left) labels = np.concatenate([left, right]) _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) - assert new_labels.dtype == np.intp new_left, new_right = new_labels[:llength], new_labels[llength:] return new_left, new_right diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 7ab53ccf7cb8d..21d9107a61cb7 100644 --- 
a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -41,6 +41,7 @@ if TYPE_CHECKING: from pandas import MultiIndex + from pandas.core.arrays import ExtensionArray from pandas.core.indexes.base import Index @@ -52,7 +53,7 @@ def get_indexer_indexer( na_position: str, sort_remaining: bool, key: IndexKeyFunc, -) -> np.ndarray | None: +) -> npt.NDArray[np.intp] | None: """ Helper method that return the indexer according to input parameters for the sort_index method of DataFrame and Series. @@ -69,7 +70,7 @@ def get_indexer_indexer( Returns ------- - Optional[ndarray] + Optional[ndarray[intp]] The indexer for the new index. """ @@ -215,7 +216,7 @@ def get_compressed_ids( return compress_group_index(ids, sort=True) -def is_int64_overflow_possible(shape) -> bool: +def is_int64_overflow_possible(shape: Shape) -> bool: the_prod = 1 for x in shape: the_prod *= int(x) @@ -223,7 +224,7 @@ def is_int64_overflow_possible(shape) -> bool: return the_prod >= lib.i8max -def decons_group_index(comp_labels, shape): +def decons_group_index(comp_labels, shape: Shape): # reconstruct labels if is_int64_overflow_possible(shape): # at some point group indices are factorized, @@ -244,7 +245,7 @@ def decons_group_index(comp_labels, shape): def decons_obs_group_ids( - comp_ids: npt.NDArray[np.intp], obs_ids, shape, labels, xnull: bool + comp_ids: npt.NDArray[np.intp], obs_ids, shape: Shape, labels, xnull: bool ): """ Reconstruct labels from observed group ids. @@ -428,7 +429,7 @@ def nargsort( return ensure_platform_int(indexer) -def nargminmax(values, method: str, axis: int = 0): +def nargminmax(values: ExtensionArray, method: str, axis: int = 0): """ Implementation of np.argmin/argmax but for ExtensionArray and which handles missing values. 
@@ -447,21 +448,21 @@ def nargminmax(values, method: str, axis: int = 0): func = np.argmax if method == "argmax" else np.argmin mask = np.asarray(isna(values)) - values = values._values_for_argsort() + arr_values = values._values_for_argsort() - if values.ndim > 1: + if arr_values.ndim > 1: if mask.any(): if axis == 1: - zipped = zip(values, mask) + zipped = zip(arr_values, mask) else: - zipped = zip(values.T, mask.T) + zipped = zip(arr_values.T, mask.T) return np.array([_nanargminmax(v, m, func) for v, m in zipped]) - return func(values, axis=axis) + return func(arr_values, axis=axis) - return _nanargminmax(values, mask, func) + return _nanargminmax(arr_values, mask, func) -def _nanargminmax(values, mask, func) -> int: +def _nanargminmax(values: np.ndarray, mask: npt.NDArray[np.bool_], func) -> int: """ See nanargminmax.__doc__. """ @@ -591,9 +592,9 @@ def get_indexer_dict( dict: Labels mapped to indexers. """ - shape = [len(x) for x in keys] + shape = tuple(len(x) for x in keys) - group_index = get_group_index(label_list, tuple(shape), sort=True, xnull=True) + group_index = get_group_index(label_list, shape, sort=True, xnull=True) if np.all(group_index == -1): # Short-circuit, lib.indices_fast will return the same return {} diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 6b0380a292f07..2f65ce17f93b2 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -62,7 +62,7 @@ def _str_map( na_value = self._str_na_value if not len(self): - return np.ndarray(0, dtype=dtype) + return np.array([], dtype=dtype) arr = np.asarray(self, dtype=object) mask = isna(arr) From e3f56631f4119b8b121647524c75bfd3c05999c7 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 22 Feb 2022 12:09:50 -0800 Subject: [PATCH 7/8] avoid warnings --- pandas/tests/plotting/test_datetimelike.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/plotting/test_datetimelike.py 
b/pandas/tests/plotting/test_datetimelike.py index 1b818f57d0572..197083a9ad940 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -523,7 +523,7 @@ def test_finder_hourly(self): def test_gaps(self): ts = tm.makeTimeSeries() - ts[5:25] = np.nan + ts.iloc[5:25] = np.nan _, ax = self.plt.subplots() ts.plot(ax=ax) lines = ax.get_lines() @@ -541,7 +541,7 @@ def test_gaps(self): # irregular ts = tm.makeTimeSeries() ts = ts[[0, 1, 2, 5, 7, 9, 12, 15, 20]] - ts[2:5] = np.nan + ts.iloc[2:5] = np.nan _, ax = self.plt.subplots() ax = ts.plot(ax=ax) lines = ax.get_lines() From 3565ba9e729f2082871439001f9315671ee77372 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 22 Feb 2022 13:41:47 -0800 Subject: [PATCH 8/8] TYP: masks --- pandas/core/algorithms.py | 2 +- pandas/core/array_algos/masked_reductions.py | 37 ++++++++++---------- pandas/core/array_algos/replace.py | 7 ++-- pandas/core/arrays/_mixins.py | 2 +- pandas/core/arrays/masked.py | 6 ++-- pandas/core/arrays/numeric.py | 5 ++- pandas/core/groupby/groupby.py | 4 +-- pandas/core/indexes/base.py | 4 ++- pandas/core/indexes/period.py | 3 +- pandas/core/internals/blocks.py | 2 +- pandas/core/missing.py | 2 +- pandas/core/tools/numeric.py | 3 +- pandas/io/pytables.py | 3 +- 13 files changed, 46 insertions(+), 34 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8c10b62d83f9e..f6bda51298622 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -534,7 +534,7 @@ def factorize_array( na_sentinel: int = -1, size_hint: int | None = None, na_value=None, - mask: np.ndarray | None = None, + mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[npt.NDArray[np.intp], np.ndarray]: """ Factorize a numpy array to codes and uniques. 
diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 66a3152de1499..3e59a267f7191 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -2,15 +2,14 @@ masked_reductions.py is for reduction algorithms using a mask-based approach for missing values. """ +from __future__ import annotations -from typing import ( - Callable, - Optional, -) +from typing import Callable import numpy as np from pandas._libs import missing as libmissing +from pandas._typing import npt from pandas.core.nanops import check_below_min_count @@ -18,11 +17,11 @@ def _sumprod( func: Callable, values: np.ndarray, - mask: np.ndarray, + mask: npt.NDArray[np.bool_], *, skipna: bool = True, min_count: int = 0, - axis: Optional[int] = None, + axis: int | None = None, ): """ Sum or product for 1D masked array. @@ -33,7 +32,7 @@ def _sumprod( values : np.ndarray Numpy array with the values (can be of any dtype that support the operation). - mask : np.ndarray + mask : np.ndarray[bool] Boolean numpy array (True values indicate missing values). skipna : bool, default True Whether to skip NA. 
@@ -58,11 +57,11 @@ def _sumprod( def sum( values: np.ndarray, - mask: np.ndarray, + mask: npt.NDArray[np.bool_], *, skipna: bool = True, min_count: int = 0, - axis: Optional[int] = None, + axis: int | None = None, ): return _sumprod( np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis @@ -71,11 +70,11 @@ def sum( def prod( values: np.ndarray, - mask: np.ndarray, + mask: npt.NDArray[np.bool_], *, skipna: bool = True, min_count: int = 0, - axis: Optional[int] = None, + axis: int | None = None, ): return _sumprod( np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis @@ -85,10 +84,10 @@ def prod( def _minmax( func: Callable, values: np.ndarray, - mask: np.ndarray, + mask: npt.NDArray[np.bool_], *, skipna: bool = True, - axis: Optional[int] = None, + axis: int | None = None, ): """ Reduction for 1D masked array. @@ -99,7 +98,7 @@ def _minmax( values : np.ndarray Numpy array with the values (can be of any dtype that support the operation). - mask : np.ndarray + mask : np.ndarray[bool] Boolean numpy array (True values indicate missing values). skipna : bool, default True Whether to skip NA. 
@@ -122,26 +121,26 @@ def _minmax( def min( values: np.ndarray, - mask: np.ndarray, + mask: npt.NDArray[np.bool_], *, skipna: bool = True, - axis: Optional[int] = None, + axis: int | None = None, ): return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis) def max( values: np.ndarray, - mask: np.ndarray, + mask: npt.NDArray[np.bool_], *, skipna: bool = True, - axis: Optional[int] = None, + axis: int | None = None, ): return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis) # TODO: axis kwarg -def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True): +def mean(values: np.ndarray, mask: npt.NDArray[np.bool_], skipna: bool = True): if not values.size or mask.all(): return libmissing.NA _sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index e26bb9fb6ebad..19a44dbfe6f6d 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -15,6 +15,7 @@ from pandas._typing import ( ArrayLike, Scalar, + npt, ) from pandas.core.dtypes.common import ( @@ -42,7 +43,7 @@ def should_use_regex(regex: bool, to_replace: Any) -> bool: def compare_or_regex_search( - a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: np.ndarray + a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: npt.NDArray[np.bool_] ) -> ArrayLike | bool: """ Compare two array-like inputs of the same shape or two scalar values @@ -116,7 +117,9 @@ def _check_comparison_types( return result -def replace_regex(values: ArrayLike, rx: re.Pattern, value, mask: np.ndarray | None): +def replace_regex( + values: ArrayLike, rx: re.Pattern, value, mask: npt.NDArray[np.bool_] | None +): """ Parameters ---------- diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 3446d5fc43a65..a493cf67fff9d 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -360,7 +360,7 @@ def _putmask(self, mask: 
npt.NDArray[np.bool_], value) -> None: np.putmask(self._ndarray, mask, value) def _where( - self: NDArrayBackedExtensionArrayT, mask: np.ndarray, value + self: NDArrayBackedExtensionArrayT, mask: npt.NDArray[np.bool_], value ) -> NDArrayBackedExtensionArrayT: """ Analogue to np.where(mask, self, value) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 21f44dbc6a1cd..ee6d2a6ab5684 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -99,13 +99,15 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): _internal_fill_value: Scalar # our underlying data and mask are each ndarrays _data: np.ndarray - _mask: np.ndarray + _mask: npt.NDArray[np.bool_] # Fill values used for any/all _truthy_value = Scalar # bool(_truthy_value) = True _falsey_value = Scalar # bool(_falsey_value) = False - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + def __init__( + self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False + ): # values is supposed to already be validated in the subclass if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): raise TypeError( diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 958c9f7b0b3f1..724cd1c0d88ca 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -17,6 +17,7 @@ from pandas._typing import ( Dtype, DtypeObj, + npt, ) from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -219,7 +220,9 @@ class NumericArray(BaseMaskedArray): _dtype_cls: type[NumericDtype] - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + def __init__( + self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False + ): checker = self._dtype_cls._checker if not (isinstance(values, np.ndarray) and checker(values.dtype)): descr = ( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 
dbff541e9568b..ecc85f12a9880 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3607,13 +3607,13 @@ def tail(self, n=5): return self._mask_selected_obj(mask) @final - def _mask_selected_obj(self, mask: np.ndarray) -> NDFrameT: + def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: """ Return _selected_obj with mask applied to the correct axis. Parameters ---------- - mask : np.ndarray + mask : np.ndarray[bool] Boolean mask to apply. Returns diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d7594f2483569..108646f8766a6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5548,7 +5548,9 @@ def asof(self, label): return self[loc] - def asof_locs(self, where: Index, mask: np.ndarray) -> npt.NDArray[np.intp]: + def asof_locs( + self, where: Index, mask: npt.NDArray[np.bool_] + ) -> npt.NDArray[np.intp]: """ Return the locations (indices) of labels in the index. diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index aba834a47ffef..006f53ba06c71 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -23,6 +23,7 @@ from pandas._typing import ( Dtype, DtypeObj, + npt, ) from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -327,7 +328,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: # ------------------------------------------------------------------------ # Index Methods - def asof_locs(self, where: Index, mask: np.ndarray) -> np.ndarray: + def asof_locs(self, where: Index, mask: npt.NDArray[np.bool_]) -> np.ndarray: """ where : array of timestamps mask : np.ndarray[bool] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 57cc88613cb7d..42c6f03a3af93 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -735,7 +735,7 @@ def _replace_coerce( self, to_replace, value, - mask: np.ndarray, + mask: 
npt.NDArray[np.bool_], inplace: bool = True, regex: bool = False, ) -> list[Block]: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d922cfd06f7ac..46ea23e431d15 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -43,7 +43,7 @@ from pandas import Index -def check_value_size(value, mask: np.ndarray, length: int): +def check_value_size(value, mask: npt.NDArray[np.bool_], length: int): """ Validate the size of the values passed to ExtensionArray.fillna. """ diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 26197e1ac4847..ef7f4bc92e25b 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -3,6 +3,7 @@ import numpy as np from pandas._libs import lib +from pandas._typing import npt from pandas.core.dtypes.cast import maybe_downcast_numeric from pandas.core.dtypes.common import ( @@ -167,7 +168,7 @@ def to_numeric(arg, errors="raise", downcast=None): # GH33013: for IntegerArray & FloatingArray extract non-null values for casting # save mask to reconstruct the full array after casting - mask: np.ndarray | None = None + mask: npt.NDArray[np.bool_] | None = None if isinstance(values, NumericArray): mask = values._mask values = values._data[~mask] diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 53b5079ed6be9..7c784c1489617 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -41,6 +41,7 @@ ArrayLike, DtypeArg, Shape, + npt, ) from pandas.compat._optional import import_optional_dependency from pandas.compat.pickle_compat import patch_pickle @@ -4409,7 +4410,7 @@ def write_data_chunk( self, rows: np.ndarray, indexes: list[np.ndarray], - mask: np.ndarray | None, + mask: npt.NDArray[np.bool_] | None, values: list[np.ndarray], ): """