From 9e4f54ffaa266aebc56aed7e2efcdff1d6fb968f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Apr 2021 07:56:17 -0700 Subject: [PATCH 1/5] CLN/TYP: _libs (#40765) --- pandas/_libs/groupby.pyx | 51 +++++++++++----------- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable.pyx | 6 +-- pandas/_libs/hashtable_class_helper.pxi.in | 27 +++++++----- pandas/_libs/index.pyx | 3 +- pandas/_libs/index_class_helper.pxi.in | 3 +- 6 files changed, 48 insertions(+), 44 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e7cd7cd898d5b..48ee01c809efd 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -106,7 +106,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, ndarray[intp_t] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: """ Only aggregates on axis=0 """ @@ -148,7 +148,7 @@ def group_cumprod_float64(float64_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, - bint skipna=True): + bint skipna=True) -> None: """ Cumulative product of columns of `values`, in row groups `labels`. @@ -205,7 +205,7 @@ def group_cumsum(numeric[:, ::1] out, const intp_t[:] labels, int ngroups, is_datetimelike, - bint skipna=True): + bint skipna=True) -> None: """ Cumulative sum of columns of `values`, in row groups `labels`. @@ -270,7 +270,7 @@ def group_cumsum(numeric[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) def group_shift_indexer(int64_t[::1] out, const intp_t[:] labels, - int ngroups, int periods): + int ngroups, int periods) -> None: cdef: Py_ssize_t N, i, j, ii, lab int offset = 0, sign @@ -322,14 +322,14 @@ def group_shift_indexer(int64_t[::1] out, const intp_t[:] labels, @cython.wraparound(False) @cython.boundscheck(False) def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels, - ndarray[uint8_t] mask, object direction, - int64_t limit, bint dropna): + ndarray[uint8_t] mask, str direction, + int64_t limit, bint dropna) -> None: """ Indexes how to fill values forwards or backwards within a group. Parameters ---------- - out : np.ndarray[np.uint8] + out : np.ndarray[np.int64] Values into which this method will write its results. labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering @@ -392,8 +392,8 @@ def group_any_all(uint8_t[::1] out, const uint8_t[::1] values, const intp_t[:] labels, const uint8_t[::1] mask, - object val_test, - bint skipna): + str val_test, + bint skipna) -> None: """ Aggregated boolean values to show truthfulness of group elements. 
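The hunks above all follow one pattern: these Cython kernels write their results into preallocated buffers, so each `def` gains an explicit `-> None` annotation, string-only keywords (`direction`, `val_test`) narrow from `object` to `str`, and the `group_fillna_indexer` docstring now gives the correct `np.int64` dtype for `out`. A minimal sketch of the in-place calling convention from the Python side (buffer shapes here are illustrative assumptions; the call itself is left commented since `values`/`labels` are not constructed here):

    import numpy as np

    n_groups, n_cols = 4, 2
    out = np.empty((n_groups, n_cols), dtype=np.float64)  # result buffer, filled in place
    counts = np.zeros(n_groups, dtype=np.int64)           # per-group row counts, also filled in place
    # group_median_float64(out, counts, values, labels, min_count=-1)  # returns None
    # callers then read the per-group medians back out of `out` and `counts`.
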
@@ -465,7 +465,7 @@ def group_add(complexfloating_t[:, ::1] out, int64_t[::1] counts, ndarray[complexfloating_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=0): + Py_ssize_t min_count=0) -> None: """ Only aggregates on axis=0 using Kahan summation """ @@ -518,7 +518,7 @@ def group_prod(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=0): + Py_ssize_t min_count=0) -> None: """ Only aggregates on axis=0 """ @@ -568,7 +568,7 @@ def group_var(floating[:, ::1] out, ndarray[floating, ndim=2] values, const intp_t[:] labels, Py_ssize_t min_count=-1, - int64_t ddof=1): + int64_t ddof=1) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, ct, oldmean @@ -621,7 +621,7 @@ def group_mean(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[::1] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, count, y, t @@ -673,7 +673,7 @@ def group_ohlc(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: """ Only aggregates on axis=0 """ @@ -721,7 +721,7 @@ def group_quantile(ndarray[float64_t] out, ndarray[intp_t] labels, ndarray[uint8_t] mask, float64_t q, - object interpolation): + str interpolation) -> None: """ Calculate the quantile per group. @@ -733,8 +733,6 @@ def group_quantile(ndarray[float64_t] out, Array containing the values to apply the function against. labels : ndarray[np.intp] Array containing the unique group labels. - values : ndarray - Array containing the values to apply the function against. q : float The quantile value to search for. interpolation : {'linear', 'lower', 'highest', 'nearest', 'midpoint'} @@ -865,7 +863,7 @@ def group_last(rank_t[:, ::1] out, int64_t[::1] counts, ndarray[rank_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: """ Only aggregates on axis=0 """ @@ -957,8 +955,9 @@ def group_nth(rank_t[:, ::1] out, int64_t[::1] counts, ndarray[rank_t, ndim=2] values, const intp_t[:] labels, - int64_t min_count=-1, int64_t rank=1 - ): + int64_t min_count=-1, + int64_t rank=1, + ) -> None: """ Only aggregates on axis=0 """ @@ -1050,8 +1049,8 @@ def group_rank(float64_t[:, ::1] out, ndarray[rank_t, ndim=2] values, const intp_t[:] labels, int ngroups, - bint is_datetimelike, object ties_method="average", - bint ascending=True, bint pct=False, object na_option="keep"): + bint is_datetimelike, str ties_method="average", + bint ascending=True, bint pct=False, str na_option="keep") -> None: """ Provides the rank of values within each group. 
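`group_rank` gets the same treatment: `ties_method` and `na_option` narrow from `object` to `str`, matching what the public API actually passes down. For reference, the corresponding public usage (standard pandas API, shown only to connect these keywords to the kernel):

    import pandas as pd

    df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1.0, 2.0, 2.0]})
    # `method` and `na_option` are plain strings end to end.
    df.groupby("g")["v"].rank(method="average", ascending=True, na_option="keep")
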
@@ -1221,7 +1220,7 @@ def group_max(groupby_t[:, ::1] out, int64_t[::1] counts, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: """See group_min_max.__doc__""" group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True) @@ -1232,7 +1231,7 @@ def group_min(groupby_t[:, ::1] out, int64_t[::1] counts, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: """See group_min_max.__doc__""" group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False) @@ -1311,7 +1310,7 @@ def group_cummin(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, int ngroups, - bint is_datetimelike): + bint is_datetimelike) -> None: """See group_cummin_max.__doc__""" group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=False) @@ -1322,6 +1321,6 @@ def group_cummax(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, int ngroups, - bint is_datetimelike): + bint is_datetimelike) -> None: """See group_cummin_max.__doc__""" group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=True) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 735d8c07f4774..a5679af44ac06 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -134,6 +134,6 @@ cdef class Int64Vector: cdef bint external_view_exists cdef resize(self) - cpdef to_array(self) + cpdef ndarray to_array(self) cdef inline void append(self, int64_t x) cdef extend(self, int64_t[:] x) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index e402a4b7c0ccc..1e2a336f12444 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -61,7 +61,7 @@ cdef class Factorizer: ObjectVector uniques Py_ssize_t count - def __init__(self, size_hint): + def __init__(self, size_hint: int): self.table = PyObjectHashTable(size_hint) self.uniques = ObjectVector() self.count = 0 @@ -116,12 +116,12 @@ cdef class Int64Factorizer: Int64Vector uniques Py_ssize_t count - def __init__(self, size_hint): + def __init__(self, size_hint: int): self.table = Int64HashTable(size_hint) self.uniques = Int64Vector() self.count = 0 - def get_count(self): + def get_count(self) -> int: return self.count def factorize(self, const int64_t[:] values, sort=False, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 301644274111b..b80a127be970d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -220,7 +220,7 @@ cdef class {{name}}Vector: def __len__(self) -> int: return self.data.n - cpdef to_array(self): + cpdef ndarray to_array(self): if self.data.m != self.data.n: if self.external_view_exists: # should never happen @@ -288,7 +288,7 @@ cdef class StringVector: def __len__(self) -> int: return self.data.n - def to_array(self): + cpdef ndarray[object, ndim=1] to_array(self): cdef: ndarray ao Py_ssize_t n @@ -345,7 +345,7 @@ cdef class ObjectVector: self.data[self.n] = obj self.n += 1 - def to_array(self): + cpdef ndarray[object, ndim=1] to_array(self): if self.m != self.n: if self.external_view_exists: raise ValueError("should have raised on append()") @@ -403,7 +403,7 @@ cdef class {{name}}HashTable(HashTable): kh_destroy_{{dtype}}(self.table) self.table = NULL - def __contains__(self, object key): + def __contains__(self, object key) -> bool: cdef: 
khiter_t k {{c_type}} ckey @@ -452,7 +452,7 @@ cdef class {{name}}HashTable(HashTable): raise KeyError(key) @cython.boundscheck(False) - def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values): + def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -466,7 +466,7 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = values[i] @cython.boundscheck(False) - def map_locations(self, const {{dtype}}_t[:] values): + def map_locations(self, const {{dtype}}_t[:] values) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -480,7 +480,8 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = i @cython.boundscheck(False) - def lookup(self, const {{dtype}}_t[:] values): + def lookup(self, const {{dtype}}_t[:] values) -> ndarray: + # -> np.ndarray[np.intp] cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -818,7 +819,8 @@ cdef class StringHashTable(HashTable): return labels @cython.boundscheck(False) - def lookup(self, ndarray[object] values): + def lookup(self, ndarray[object] values) -> ndarray: + # -> np.ndarray[np.intp] cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -853,7 +855,7 @@ cdef class StringHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def map_locations(self, ndarray[object] values): + def map_locations(self, ndarray[object] values) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -1071,7 +1073,7 @@ cdef class PyObjectHashTable(HashTable): def __len__(self) -> int: return self.table.size - def __contains__(self, object key): + def __contains__(self, object key) -> bool: cdef: khiter_t k hash(key) @@ -1123,7 +1125,7 @@ cdef class PyObjectHashTable(HashTable): else: raise KeyError(key) - def map_locations(self, ndarray[object] values): + def map_locations(self, ndarray[object] values) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -1137,7 +1139,8 @@ cdef class PyObjectHashTable(HashTable): k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = i - def lookup(self, ndarray[object] values): + def lookup(self, ndarray[object] values) -> ndarray: + # -> np.ndarray[np.intp] cdef: Py_ssize_t i, n = len(values) int ret = 0 diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 47e6d417bb925..f1f56c6c0c855 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -132,6 +132,7 @@ cdef class IndexEngine: return self._maybe_get_bool_indexer(val) cdef _maybe_get_bool_indexer(self, object val): + # Returns ndarray[bool] or int cdef: ndarray[uint8_t, ndim=1, cast=True] indexer @@ -247,7 +248,7 @@ cdef class IndexEngine: self.need_unique_check = 0 - cdef void _call_map_locations(self, values): + cdef void _call_map_locations(self, ndarray values): self.mapping.map_locations(values) def clear_mapping(self): diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index e5026ce2fa292..8638c2c689c3f 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -44,10 +44,11 @@ cdef class {{name}}Engine(IndexEngine): raise KeyError(val) {{endif}} - cdef void _call_map_locations(self, values): + cdef void _call_map_locations(self, ndarray values): self.mapping.map_locations(algos.ensure_{{name.lower()}}(values)) cdef _maybe_get_bool_indexer(self, object val): + # Returns ndarray[bool] or int cdef: ndarray[uint8_t, ndim=1, cast=True] indexer ndarray[intp_t, ndim=1] found From 8c9621dbca5e66517455154250664a59a3b93ace Mon Sep 17 00:00:00 2001 From: 
jbrockmendel Date: Mon, 5 Apr 2021 07:56:58 -0700 Subject: [PATCH 2/5] TYP: fix type:ignores (#40758) --- pandas/core/arrays/datetimelike.py | 4 ++-- pandas/core/arrays/integer.py | 6 +++--- pandas/core/construction.py | 22 ++++++++------------ pandas/core/dtypes/cast.py | 7 ++++--- pandas/core/frame.py | 28 ++++++++++---------------- pandas/core/generic.py | 3 ++- pandas/core/internals/array_manager.py | 11 +++++----- pandas/core/internals/construction.py | 4 ++-- pandas/core/internals/managers.py | 5 +++-- pandas/core/missing.py | 2 +- pandas/core/strings/object_array.py | 8 ++------ pandas/tests/extension/test_numpy.py | 11 ---------- 12 files changed, 44 insertions(+), 67 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 7be06fe92c418..25939bcdc7c6a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1732,8 +1732,8 @@ def _round(self, freq, mode, ambiguous, nonexistent): values = self.view("i8") values = cast(np.ndarray, values) nanos = to_offset(freq).nanos - result = round_nsint64(values, mode, nanos) - result = self._maybe_mask_results(result, fill_value=iNaT) + result_i8 = round_nsint64(values, mode, nanos) + result = self._maybe_mask_results(result_i8, fill_value=iNaT) result = result.view(self._ndarray.dtype) return self._simple_new(result, dtype=self.dtype) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index ae44acf06591f..3f5c550545aad 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -371,14 +371,14 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) + na_value: float | np.datetime64 | lib.NoDefault + # coerce if is_float_dtype(dtype): # In astype, we consider dtype=float to also mean na_value=np.nan na_value = np.nan elif is_datetime64_dtype(dtype): - # error: Incompatible types in assignment (expression has type - # "datetime64", variable has type "float") - na_value = np.datetime64("NaT") # type: ignore[assignment] + na_value = np.datetime64("NaT") else: na_value = lib.no_default diff --git a/pandas/core/construction.py b/pandas/core/construction.py index b5a17e1ef882e..98dfad72142f6 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -52,10 +52,10 @@ is_integer_dtype, is_list_like, is_object_dtype, - is_sparse, is_string_dtype, is_timedelta64_ns_dtype, ) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.generic import ( ABCExtensionArray, ABCIndex, @@ -549,12 +549,10 @@ def sanitize_array( subarr = _sanitize_ndim(subarr, data, dtype, index) - if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)): - # error: Argument 1 to "_sanitize_str_dtypes" has incompatible type - # "ExtensionArray"; expected "ndarray" - subarr = _sanitize_str_dtypes( - subarr, data, dtype, copy # type: ignore[arg-type] - ) + if not ( + isinstance(subarr.dtype, ExtensionDtype) or isinstance(dtype, ExtensionDtype) + ): + subarr = _sanitize_str_dtypes(subarr, data, dtype, copy) is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(dtype) if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype: @@ -599,7 +597,7 @@ def _sanitize_ndim( def _sanitize_str_dtypes( - result: np.ndarray, data, dtype: Optional[DtypeObj], copy: bool + result: np.ndarray, data, dtype: Optional[np.dtype], copy: bool ) -> np.ndarray: """ Ensure we have a dtype that is supported by pandas. 
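The `IntegerArray.astype` hunk above replaces a `type: ignore[assignment]` with a properly declared union for `na_value`. From the public side, the two branches it covers look like this (standard pandas API; a quick illustration, not part of the patch):

    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    arr.astype("float64")         # NA -> np.nan (the is_float_dtype branch)
    arr.astype("datetime64[ns]")  # NA -> np.datetime64("NaT") (the is_datetime64_dtype branch)
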
@@ -613,11 +611,7 @@ def _sanitize_str_dtypes( # GH#19853: If data is a scalar, result has already the result if not lib.is_scalar(data): if not np.all(isna(data)): - # error: Argument "dtype" to "array" has incompatible type - # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any], - # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, - # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" - data = np.array(data, dtype=dtype, copy=False) # type: ignore[arg-type] + data = np.array(data, dtype=dtype, copy=False) result = np.array(data, dtype=object, copy=copy) return result @@ -666,7 +660,7 @@ def _try_cast( ): return arr - if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)): + if isinstance(dtype, ExtensionDtype) and not isinstance(dtype, DatetimeTZDtype): # create an extension array from its dtype # DatetimeTZ case needs to go through maybe_cast_to_datetime but # SparseDtype does not diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 3f3d9f9f2833b..a3744ffa7f9bc 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1400,11 +1400,13 @@ def soft_convert_objects( # GH 20380, when datetime is beyond year 2262, hence outside # bound of nanosecond-resolution 64-bit integers. try: - values = lib.maybe_convert_objects( + converted = lib.maybe_convert_objects( values, convert_datetime=datetime, convert_timedelta=timedelta ) except (OutOfBoundsDatetime, ValueError): return values + if converted is not values: + return converted if numeric and is_object_dtype(values.dtype): converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True) @@ -1446,10 +1448,9 @@ def convert_dtypes( dtype new dtype """ - is_extension = is_extension_array_dtype(input_array.dtype) if ( convert_string or convert_integer or convert_boolean or convert_floating - ) and not is_extension: + ) and isinstance(input_array, np.ndarray): inferred_dtype = lib.infer_dtype(input_array) if not convert_string and is_string_dtype(inferred_dtype): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 484b01f2c04f0..d1d1993931062 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3300,14 +3300,10 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: ) else: - # error: Incompatible types in assignment (expression has type - # "ndarray", variable has type "List[Any]") - new_values = self.values.T # type: ignore[assignment] + new_arr = self.values.T if copy: - new_values = new_values.copy() - result = self._constructor( - new_values, index=self.columns, columns=self.index - ) + new_arr = new_arr.copy() + result = self._constructor(new_arr, index=self.columns, columns=self.index) return result.__finalize__(self, method="transpose") @@ -3682,17 +3678,15 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: value = value.reindex(cols, axis=1) # now align rows + arraylike = _reindex_for_setitem(value, self.index) + self._set_item_mgr(key, arraylike) - # error: Incompatible types in assignment (expression has type "ExtensionArray", - # variable has type "DataFrame") - value = _reindex_for_setitem(value, self.index) # type: ignore[assignment] - self._set_item_mgr(key, value) - - def _iset_item_mgr(self, loc: int, value) -> None: + def _iset_item_mgr(self, loc: int | slice | np.ndarray, value) -> None: + # when called from _set_item_mgr loc can be anything returned from get_loc self._mgr.iset(loc, value) self._clear_item_cache() - def _set_item_mgr(self, key, value): + 
def _set_item_mgr(self, key, value: ArrayLike) -> None: try: loc = self._info_axis.get_loc(key) except KeyError: @@ -3707,9 +3701,9 @@ def _set_item_mgr(self, key, value): if len(self): self._check_setitem_copy() - def _iset_item(self, loc: int, value): - value = self._sanitize_column(value) - self._iset_item_mgr(loc, value) + def _iset_item(self, loc: int, value) -> None: + arraylike = self._sanitize_column(value) + self._iset_item_mgr(loc, arraylike) # check if we are modifying a copy # try to set first as we want an invalid diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6b4e3c7caef50..25c10c215e8cc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3541,7 +3541,8 @@ def _maybe_cache_changed(self, item, value) -> None: The object has called back to us saying maybe it has changed. """ loc = self._info_axis.get_loc(item) - self._mgr.iset(loc, value) + arraylike = value._values + self._mgr.iset(loc, arraylike) @final @property diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 294d1fd078b08..d432b7ef443cc 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -850,7 +850,7 @@ def idelete(self, indexer): self._axes = [self._axes[0], self._axes[1][to_keep]] return self - def iset(self, loc: Union[int, slice, np.ndarray], value): + def iset(self, loc: Union[int, slice, np.ndarray], value: ArrayLike): """ Set new column(s). @@ -861,12 +861,10 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): ---------- loc : integer, slice or boolean mask Positional location (already bounds checked) - value : array-like + value : np.ndarray or ExtensionArray """ # single column -> single integer index if lib.is_integer(loc): - # TODO the extract array should in theory not be needed? - value = extract_array(value, extract_numpy=True) # TODO can we avoid needing to unpack this here? 
That means converting # DataFrame into 1D array when loc is an integer @@ -904,7 +902,10 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): assert value.shape[0] == len(self._axes[0]) for value_idx, mgr_idx in enumerate(indices): - value_arr = value[:, value_idx] + # error: Invalid index type "Tuple[slice, int]" for + # "Union[ExtensionArray, ndarray]"; expected type + # "Union[int, slice, ndarray]" + value_arr = value[:, value_idx] # type: ignore[index] self.arrays[mgr_idx] = value_arr return diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 5b4b710838ef8..6364816b9ab2d 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -843,8 +843,8 @@ def _list_of_dict_to_arrays( if columns is None: gen = (list(x.keys()) for x in data) sort = not any(isinstance(d, dict) for d in data) - columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) - columns = ensure_index(columns) + pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort) + columns = ensure_index(pre_cols) # assure that they are of the base dict class and not of derived # classes diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b688f1b4fea5f..549d4337dcf54 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1062,7 +1062,7 @@ def idelete(self, indexer) -> BlockManager: axes = [new_columns, self.axes[1]] return type(self)._simple_new(tuple(nbs), axes) - def iset(self, loc: Union[int, slice, np.ndarray], value): + def iset(self, loc: Union[int, slice, np.ndarray], value: ArrayLike): """ Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items @@ -1073,6 +1073,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): if self._blklocs is None and self.ndim > 1: self._rebuild_blknos_and_blklocs() + # Note: we exclude DTA/TDA here value_is_extension_type = is_extension_array_dtype(value) # categorical/sparse/datetimetz @@ -1429,7 +1430,7 @@ def _slice_take_blocks_ax0( return blocks - def _make_na_block(self, placement, fill_value=None): + def _make_na_block(self, placement: BlockPlacement, fill_value=None) -> Block: if fill_value is None: fill_value = np.nan diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 21c79588317df..2fd39588a3da6 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -888,7 +888,7 @@ def clean_reindex_fill_method(method): return clean_fill_method(method, allow_nearest=True) -def _interp_limit(invalid, fw_limit, bw_limit): +def _interp_limit(invalid: np.ndarray, fw_limit, bw_limit): """ Get indexers of values that won't be filled because they exceed the limits. diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index edf32bade0657..45f1faa637b85 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -56,21 +56,17 @@ def _str_map(self, f, na_value=None, dtype: Optional[Dtype] = None): dtype : Dtype, optional The dtype of the result array. 
""" - arr = self if dtype is None: dtype = np.dtype("object") if na_value is None: na_value = self._str_na_value - if not len(arr): + if not len(self): # error: Argument 1 to "ndarray" has incompatible type "int"; # expected "Sequence[int]" return np.ndarray(0, dtype=dtype) # type: ignore[arg-type] - if not isinstance(arr, np.ndarray): - # error: Incompatible types in assignment (expression has type "ndarray", - # variable has type "ObjectStringArrayMixin") - arr = np.asarray(arr, dtype=object) # type: ignore[assignment] + arr = np.asarray(self, dtype=object) mask = isna(arr) convert = not np.all(mask) try: diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index e11e74f16030c..35e5abe9ce4e7 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -330,17 +330,6 @@ def test_fillna_frame(self, data_missing): # Non-scalar "scalar" values. super().test_fillna_frame(data_missing) - def test_fillna_fill_other(self, data_missing): - # Same as the parent class test, but with PandasDtype for expected["B"] - # instead of equivalent numpy dtype - data = data_missing - result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0}) - - expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)}) - expected["B"] = expected["B"].astype(PandasDtype(expected["B"].dtype)) - - self.assert_frame_equal(result, expected) - class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): @pytest.mark.skip(reason="Incorrect expected.") From 986c5ec0f08c9d45f808c0d53f40ca470db8bf27 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 5 Apr 2021 16:24:47 +0100 Subject: [PATCH 3/5] TYP/CLN: factorize_from_iterable(s) (#40775) --- pandas/core/arrays/categorical.py | 26 ++++++++++++++------------ pandas/core/reshape/reshape.py | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 32c3095c3e6ee..fe08ea418493e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -11,6 +11,7 @@ List, Optional, Sequence, + Tuple, Type, TypeVar, Union, @@ -2642,13 +2643,11 @@ def recode_for_categories( return new_codes -def factorize_from_iterable(values): +def factorize_from_iterable(values) -> Tuple[np.ndarray, Index]: """ Factorize an input `values` into `categories` and `codes`. Preserves categorical dtype in `categories`. - *This is an internal function* - Parameters ---------- values : list-like @@ -2660,6 +2659,8 @@ def factorize_from_iterable(values): If `values` has a categorical dtype, then `categories` is a CategoricalIndex keeping the categories and order of `values`. 
""" + from pandas import CategoricalIndex + if not is_list_like(values): raise TypeError("Input must be list-like") @@ -2668,7 +2669,8 @@ def factorize_from_iterable(values): # The Categorical we want to build has the same categories # as values but its codes are by def [0, ..., len(n_categories) - 1] cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype) - categories = Categorical.from_codes(cat_codes, dtype=values.dtype) + cat = Categorical.from_codes(cat_codes, dtype=values.dtype) + categories = CategoricalIndex(cat) codes = values.codes else: # The value of ordered is irrelevant since we don't use cat as such, @@ -2680,26 +2682,26 @@ def factorize_from_iterable(values): return codes, categories -def factorize_from_iterables(iterables): +def factorize_from_iterables(iterables) -> Tuple[List[np.ndarray], List[Index]]: """ A higher-level wrapper over `factorize_from_iterable`. - *This is an internal function* - Parameters ---------- iterables : list-like of list-likes Returns ------- - codes_list : list of ndarrays - categories_list : list of Indexes + codes : list of ndarrays + categories : list of Indexes Notes ----- See `factorize_from_iterable` for more info. """ if len(iterables) == 0: - # For consistency, it should return a list of 2 lists. - return [[], []] - return map(list, zip(*(factorize_from_iterable(it) for it in iterables))) + # For consistency, it should return two empty lists. + return [], [] + + codes, categories = zip(*(factorize_from_iterable(it) for it in iterables)) + return list(codes), list(categories) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 613669b8cc1d8..346dc3732b212 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -991,7 +991,7 @@ def get_empty_frame(data) -> DataFrame: if prefix is None: dummy_cols = levels else: - dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels] + dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels]) index: Optional[Index] if isinstance(data, Series): From 84b3f91c83a0ec0225f856f33e6e1d5b7afb3614 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Apr 2021 09:19:43 -0700 Subject: [PATCH 4/5] BUG: Series.__delitem__ converting EAs to ndarrays (#40763) --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/internals/blocks.py | 38 ++++++++++++-------- pandas/tests/series/indexing/test_delitem.py | 21 +++++++++++ 3 files changed, 45 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 75b2dee4a5822..49ddeb23e3193 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -642,6 +642,7 @@ Indexing - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) +- Bug in :meth:`Series.__delitem__` with ``ExtensionDtype`` incorrectly casting to ``ndarray`` (:issue:`40386`) Missing ^^^^^^^ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7c2a31e63eeb3..9a2b3be4b66e2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1338,7 +1338,28 @@ def quantile( return new_block(result, 
placement=self._mgr_locs, ndim=2) -class ExtensionBlock(Block): +class EABackedBlock(Block): + """ + Mixin for Block subclasses backed by ExtensionArray. + """ + + values: ExtensionArray + + def delete(self, loc) -> None: + """ + Delete given loc(-s) from block in-place. + """ + # This will be unnecessary if/when __array_function__ is implemented + self.values = self.values.delete(loc) + self.mgr_locs = self._mgr_locs.delete(loc) + try: + self._cache.clear() + except AttributeError: + # _cache not yet initialized + pass + + +class ExtensionBlock(EABackedBlock): """ Block for holding extension types. @@ -1647,7 +1668,7 @@ class NumericBlock(Block): is_numeric = True -class NDArrayBackedExtensionBlock(Block): +class NDArrayBackedExtensionBlock(EABackedBlock): """ Block backed by an NDArrayBackedExtensionArray """ @@ -1754,19 +1775,6 @@ def fillna( new_values = values.fillna(value=value, limit=limit) return [self.make_block_same_class(values=new_values)] - def delete(self, loc) -> None: - """ - Delete given loc(-s) from block in-place. - """ - # This will be unnecessary if/when __array_function__ is implemented - self.values = self.values.delete(loc, axis=0) - self.mgr_locs = self._mgr_locs.delete(loc) - try: - self._cache.clear() - except AttributeError: - # _cache not yet initialized - pass - class DatetimeLikeBlock(NDArrayBackedExtensionBlock): """Mixin class for DatetimeLikeBlock, DatetimeTZBlock.""" diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py index 019cb92d780ef..af6b3910baec0 100644 --- a/pandas/tests/series/indexing/test_delitem.py +++ b/pandas/tests/series/indexing/test_delitem.py @@ -3,6 +3,7 @@ from pandas import ( Index, Series, + date_range, ) import pandas._testing as tm @@ -50,3 +51,23 @@ def test_delitem_missing_key(self): with pytest.raises(KeyError, match=r"^0$"): del s[0] + + def test_delitem_extension_dtype(self): + # GH#40386 + # DatetimeTZDtype + dti = date_range("2016-01-01", periods=3, tz="US/Pacific") + ser = Series(dti) + + expected = ser[[0, 2]] + del ser[1] + assert ser.dtype == dti.dtype + tm.assert_series_equal(ser, expected) + + # PeriodDtype + pi = dti.tz_localize(None).to_period("D") + ser = Series(pi) + + expected = ser[:2] + del ser[2] + assert ser.dtype == pi.dtype + tm.assert_series_equal(ser, expected) From ce34c1cee70021792b0c1b0db2c899069bbf0c1d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 5 Apr 2021 09:22:13 -0700 Subject: [PATCH 5/5] BUG: RollingGroupby MultiIndex levels dropped (#40701) --- doc/source/whatsnew/v1.3.0.rst | 32 ++++++++++++++++++++ pandas/core/window/rolling.py | 23 ++++++-------- pandas/tests/window/test_groupby.py | 47 ++++++++++++++++++++++++++--- 3 files changed, 84 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 49ddeb23e3193..5e95cd6e5ee10 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -400,6 +400,38 @@ However, floating point artifacts may now exist in the results when rolling over s = pd.Series([7, 5, 5, 5]) s.rolling(3).var() +.. _whatsnew_130.notable_bug_fixes.rolling_groupby_multiindex: + +GroupBy.rolling with MultiIndex no longer drops levels in the result +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`core.window.rolling.RollingGroupby` will no longer drop levels of a :class:`DataFrame` +with a :class:`MultiIndex` in the result. 
This can lead to a perceived duplication of levels in the resulting +:class:`MultiIndex`, but this change restores the behavior that was present in version 1.1.3 (:issue:`38787`, :issue:`38523`). + + +.. ipython:: python + + index = pd.MultiIndex.from_tuples([('idx1', 'idx2')], names=['label1', 'label2']) + df = pd.DataFrame({'a': [1], 'b': [2]}, index=index) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: df.groupby('label1').rolling(1).sum() + Out[1]: + a b + label1 + idx1 1.0 2.0 + +*New behavior*: + +.. ipython:: python + + df.groupby('label1').rolling(1).sum() + .. _whatsnew_130.api_breaking.deps: diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b90722857938e..c7fa6f99bfb1c 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -577,26 +577,23 @@ def _apply( numba_cache_key, **kwargs, ) - # Reconstruct the resulting MultiIndex from tuples + # Reconstruct the resulting MultiIndex # 1st set of levels = group by labels - # 2nd set of levels = original index - # Ignore 2nd set of levels if a group by label include an index level - result_index_names = copy.copy(self._grouper.names) - grouped_object_index = None + # 2nd set of levels = original DataFrame/Series index + grouped_object_index = self.obj.index + grouped_index_name = [*grouped_object_index.names] + groupby_keys = copy.copy(self._grouper.names) + result_index_names = groupby_keys + grouped_index_name - column_keys = [ + drop_columns = [ key - for key in result_index_names + for key in self._grouper.names if key not in self.obj.index.names or key is None ] - if len(column_keys) == len(result_index_names): - grouped_object_index = self.obj.index - grouped_index_name = [*grouped_object_index.names] - result_index_names += grouped_index_name - else: + if len(drop_columns) != len(groupby_keys): # Our result will have still kept the column in the result - result = result.drop(columns=column_keys, errors="ignore") + result = result.drop(columns=drop_columns, errors="ignore") codes = self._grouper.codes levels = copy.copy(self._grouper.levels) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 5c2f69a9247e9..dd988a4abd9e1 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -588,23 +588,31 @@ def test_groupby_rolling_nans_in_index(self, rollings, key): with pytest.raises(ValueError, match=f"{key} must be monotonic"): df.groupby("c").rolling("60min", **rollings) - def test_groupby_rolling_group_keys(self): + @pytest.mark.parametrize("group_keys", [True, False]) + def test_groupby_rolling_group_keys(self, group_keys): # GH 37641 + # GH 38523: GH 37641 actually was not a bug. 
+ # group_keys only applies to groupby.apply directly arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]] index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) s = Series([1, 2, 3], index=index) - result = s.groupby(["idx1", "idx2"], group_keys=False).rolling(1).mean() + result = s.groupby(["idx1", "idx2"], group_keys=group_keys).rolling(1).mean() expected = Series( [1.0, 2.0, 3.0], index=MultiIndex.from_tuples( - [("val1", "val1"), ("val1", "val1"), ("val2", "val2")], - names=["idx1", "idx2"], + [ + ("val1", "val1", "val1", "val1"), + ("val1", "val1", "val1", "val1"), + ("val2", "val2", "val2", "val2"), + ], + names=["idx1", "idx2", "idx1", "idx2"], ), ) tm.assert_series_equal(result, expected) def test_groupby_rolling_index_level_and_column_label(self): + # The groupby keys should not appear as a resulting column arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]] index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) @@ -613,7 +621,12 @@ def test_groupby_rolling_index_level_and_column_label(self): expected = DataFrame( {"B": [0.0, 1.0, 2.0]}, index=MultiIndex.from_tuples( - [("val1", 1), ("val1", 1), ("val2", 2)], names=["idx1", "A"] + [ + ("val1", 1, "val1", "val1"), + ("val1", 1, "val1", "val1"), + ("val2", 2, "val2", "val2"), + ], + names=["idx1", "A", "idx1", "idx2"], ), ) tm.assert_frame_equal(result, expected) @@ -695,6 +708,30 @@ def test_by_column_not_in_values(self, columns): assert "A" not in result.columns tm.assert_frame_equal(g.obj, original_obj) + def test_groupby_level(self): + # GH 38523, 38787 + arrays = [ + ["Falcon", "Falcon", "Parrot", "Parrot"], + ["Captive", "Wild", "Captive", "Wild"], + ] + index = MultiIndex.from_arrays(arrays, names=("Animal", "Type")) + df = DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index) + result = df.groupby(level=0)["Max Speed"].rolling(2).sum() + expected = Series( + [np.nan, 740.0, np.nan, 50.0], + index=MultiIndex.from_tuples( + [ + ("Falcon", "Falcon", "Captive"), + ("Falcon", "Falcon", "Wild"), + ("Parrot", "Parrot", "Captive"), + ("Parrot", "Parrot", "Wild"), + ], + names=["Animal", "Animal", "Type"], + ), + name="Max Speed", + ) + tm.assert_series_equal(result, expected) + class TestExpanding: def setup_method(self):
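As a cross-check of the PATCH 4/5 fix from the Python side, mirroring the new `test_delitem_extension_dtype` test (a usage illustration, assuming a pandas build that includes this series):

    import pandas as pd

    dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
    ser = pd.Series(dti)
    del ser[1]
    # With `delete` now provided by EABackedBlock, the extension dtype
    # survives instead of being cast to ndarray (GH#40386):
    assert ser.dtype == dti.dtype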