From 6f3e1724c7721d46421726a3d358fc0010a41cb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 17 May 2022 18:25:24 -0400 Subject: [PATCH 01/11] TYP: NoDefault --- pandas/_libs/lib.pyi | 7 +++++-- pandas/core/common.py | 2 +- pandas/core/groupby/groupby.py | 4 +--- pandas/core/indexes/multi.py | 3 ++- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index ad77e9e533b0b..90340d42d2d3f 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -23,9 +23,12 @@ ndarray_obj_2d = np.ndarray from enum import Enum -class NoDefault(Enum): ... +class _NoDefault(Enum): + no_default = ... -no_default: NoDefault +no_default = _NoDefault.no_default +# note: the pyx file defines NoDefault as _NoDefault! +NoDefault = Literal[_NoDefault.no_default] i8max: int u8max: int diff --git a/pandas/core/common.py b/pandas/core/common.py index 2e8d6dbced4e3..eeb18759fc72c 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -673,7 +673,7 @@ def resolve_numeric_only(numeric_only: bool | None | lib.NoDefault) -> bool: # first default to None result = False else: - result = cast(bool, numeric_only) + result = numeric_only return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 70f8e0a752dcb..ddf460a85621b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1258,9 +1258,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: else: numeric_only = False - # error: Incompatible return value type (got "Union[bool, NoDefault]", - # expected "bool") - return numeric_only # type: ignore[return-value] + return numeric_only # ----------------------------------------------------------------- # numba diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e015a3e5a941a..fc89a03a2fbda 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -10,6 +10,7 @@ 
Hashable, Iterable, List, + Literal, Sequence, Tuple, cast, @@ -1399,7 +1400,7 @@ def format( sparsify = get_option("display.multi_sparse") if sparsify: - sentinel = "" + sentinel: Literal[""] | bool | lib.NoDefault = "" # GH3547 use value of sparsify as sentinel if it's "Falsey" assert isinstance(sparsify, bool) or sparsify is lib.no_default if sparsify in [False, lib.no_default]: From b08fcc3dd4b2a213de4b5073c8b331bcfca323f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 17 May 2022 21:35:40 -0400 Subject: [PATCH 02/11] fix mypy issues; re-write isinstance(..., NoDefault) --- pandas/_libs/lib.pyi | 1 - pandas/_libs/lib.pyx | 6 ++++-- pandas/_testing/asserters.py | 14 ++------------ pandas/core/arrays/interval.py | 14 ++++++++------ pandas/core/frame.py | 8 ++------ pandas/core/generic.py | 5 ++--- pandas/core/indexes/datetimes.py | 6 +++--- pandas/core/indexes/interval.py | 10 +++++----- pandas/core/reshape/tile.py | 2 +- pandas/core/series.py | 4 +--- 10 files changed, 28 insertions(+), 42 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 90340d42d2d3f..d4a766f7086af 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -27,7 +27,6 @@ class _NoDefault(Enum): no_default = ... no_default = _NoDefault.no_default -# note: the pyx file defines NoDefault as _NoDefault! 
NoDefault = Literal[_NoDefault.no_default] i8max: int diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 13bd95004445d..4e245d1bd8693 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,6 +1,7 @@ from collections import abc from decimal import Decimal from enum import Enum +from typing import Literal import warnings cimport cython @@ -2791,7 +2792,7 @@ cdef _infer_all_nats(dtype, ndarray datetimes, ndarray timedeltas): return result -class NoDefault(Enum): +class _NoDefault(Enum): # We make this an Enum # 1) because it round-trips through pickle correctly (see GH#40397) # 2) because mypy does not understand singletons @@ -2802,7 +2803,8 @@ class NoDefault(Enum): # Note: no_default is exported to the public API in pandas.api.extensions -no_default = NoDefault.no_default # Sentinel indicating the default value. +no_default = _NoDefault.no_default # Sentinel indicating the default value. +NoDefault = Literal[_NoDefault.no_default] @cython.boundscheck(False) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index ab42fcd92a3d9..efbe9995525d7 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -112,12 +112,7 @@ def assert_almost_equal( FutureWarning, stacklevel=find_stack_level(), ) - # https://github.com/python/mypy/issues/7642 - # error: Argument 1 to "_get_tol_from_less_precise" has incompatible - # type "Union[bool, int, NoDefault]"; expected "Union[bool, int]" - rtol = atol = _get_tol_from_less_precise( - check_less_precise # type: ignore[arg-type] - ) + rtol = atol = _get_tol_from_less_precise(check_less_precise) if isinstance(left, Index): assert_index_equal( @@ -345,12 +340,7 @@ def _get_ilevel_values(index, level): FutureWarning, stacklevel=find_stack_level(), ) - # https://github.com/python/mypy/issues/7642 - # error: Argument 1 to "_get_tol_from_less_precise" has incompatible - # type "Union[bool, int, NoDefault]"; expected "Union[bool, int]" - rtol = atol = 
_get_tol_from_less_precise( - check_less_precise # type: ignore[arg-type] - ) + rtol = atol = _get_tol_from_less_precise(check_less_precise) # instance validation _check_isinstance(left, right, Index) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 679feaca71024..0979e97b84a99 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -263,7 +263,7 @@ def _simple_new( cls: type[IntervalArrayT], left, right, - closed=None, + closed: IntervalClosedType | None = None, copy: bool = False, dtype: Dtype | None = None, verify_integrity: bool = True, @@ -416,7 +416,7 @@ def _from_factorized( def from_breaks( cls: type[IntervalArrayT], breaks, - closed="right", + closed: IntervalClosedType | None = "right", copy: bool = False, dtype: Dtype | None = None, ) -> IntervalArrayT: @@ -492,7 +492,7 @@ def from_arrays( cls: type[IntervalArrayT], left, right, - closed="right", + closed: IntervalClosedType | None = "right", copy: bool = False, dtype: Dtype | None = None, ) -> IntervalArrayT: @@ -956,10 +956,12 @@ def _concat_same_type( ------- IntervalArray """ - closed = {interval.closed for interval in to_concat} - if len(closed) != 1: + closed_set: set[IntervalClosedType] = { + interval.closed for interval in to_concat + } + if len(closed_set) != 1: raise ValueError("Intervals must all be closed on the same side.") - closed = closed.pop() + closed = closed_set.pop() left = np.concatenate([interval.left for interval in to_concat]) right = np.concatenate([interval.right for interval in to_concat]) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 51ca9dbd763b4..22456644e4d69 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6259,8 +6259,7 @@ def dropna( # faster equivalent to 'agg_obj.count(agg_axis) > 0' mask = notna(agg_obj).any(axis=agg_axis, bool_only=False) else: - if how is not no_default: - raise ValueError(f"invalid how option: {how}") + raise ValueError(f"invalid how option: {how}") if 
np.all(mask): result = self.copy() @@ -8030,9 +8029,6 @@ def groupby( raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - # https://github.com/python/mypy/issues/7642 - # error: Argument "squeeze" to "DataFrameGroupBy" has incompatible type - # "Union[bool, NoDefault]"; expected "bool" return DataFrameGroupBy( obj=self, keys=by, @@ -8041,7 +8037,7 @@ def groupby( as_index=as_index, sort=sort, group_keys=group_keys, - squeeze=squeeze, # type: ignore[arg-type] + squeeze=squeeze, observed=observed, dropna=dropna, ) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 24c3bcb7bf669..69dabf0ea7d6e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7901,8 +7901,8 @@ def between_time( FutureWarning, stacklevel=find_stack_level(), ) - left = True if isinstance(include_start, lib.NoDefault) else include_start - right = True if isinstance(include_end, lib.NoDefault) else include_end + left = True if include_start is lib.no_default else include_start + right = True if include_end is lib.no_default else include_end inc_dict: dict[tuple[bool_t, bool_t], IntervalClosedType] = { (True, True): "both", @@ -10692,7 +10692,6 @@ def _stat_function( if axis is None: axis = self._stat_axis_number - axis = cast(Axis, axis) if level is not None: warnings.warn( "Using the level keyword in DataFrame and Series aggregations is " diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 5274f68eb3171..2d31b9d0f0f9a 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1034,12 +1034,12 @@ def date_range( DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq='D') """ - if inclusive is not None and not isinstance(closed, lib.NoDefault): + if inclusive is not None and closed is not lib.no_default: raise ValueError( "Deprecated argument `closed` cannot be passed" "if argument `inclusive` is not None" ) - elif not 
isinstance(closed, lib.NoDefault): + elif closed is not lib.no_default: warnings.warn( "Argument `closed` is deprecated in favor of `inclusive`.", FutureWarning, @@ -1082,7 +1082,7 @@ def bdate_range( name: Hashable = None, weekmask=None, holidays=None, - closed: lib.NoDefault = lib.no_default, + closed: Literal["left", "right"] | lib.NoDefault | None = lib.no_default, inclusive: IntervalClosedType | None = None, **kwargs, ) -> DatetimeIndex: diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c3acfc5ff2f66..a89b52e0950f2 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -251,7 +251,7 @@ def __new__( def from_breaks( cls, breaks, - closed: str = "right", + closed: IntervalClosedType | None = "right", name: Hashable = None, copy: bool = False, dtype: Dtype | None = None, @@ -282,7 +282,7 @@ def from_arrays( cls, left, right, - closed: str = "right", + closed: IntervalClosedType = "right", name: Hashable = None, copy: bool = False, dtype: Dtype | None = None, @@ -957,7 +957,7 @@ def interval_range( periods=None, freq=None, name: Hashable = None, - closed: lib.NoDefault = lib.no_default, + closed: IntervalClosedType | lib.NoDefault = lib.no_default, inclusive: IntervalClosedType | None = None, ) -> IntervalIndex: """ @@ -1054,12 +1054,12 @@ def interval_range( IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], dtype='interval[int64, both]') """ - if inclusive is not None and not isinstance(closed, lib.NoDefault): + if inclusive is not None and closed is not lib.no_default: raise ValueError( "Deprecated argument `closed` cannot be passed " "if argument `inclusive` is not None" ) - elif not isinstance(closed, lib.NoDefault): + elif closed is not lib.no_default: warnings.warn( "Argument `closed` is deprecated in favor of `inclusive`.", FutureWarning, diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index d8c4f3f3da765..ae948eda93e7c 100644 --- a/pandas/core/reshape/tile.py +++ 
b/pandas/core/reshape/tile.py @@ -560,7 +560,7 @@ def _format_labels( bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None ): """based on the dtype, return our labels""" - closed = "right" if right else "left" + closed: Literal["right", "left"] = "right" if right else "left" formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta] diff --git a/pandas/core/series.py b/pandas/core/series.py index b740bac78b263..d1514a3872800 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1952,8 +1952,6 @@ def groupby( raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - # error: Argument "squeeze" to "SeriesGroupBy" has incompatible type - # "Union[bool, NoDefault]"; expected "bool" return SeriesGroupBy( obj=self, keys=by, @@ -1962,7 +1960,7 @@ def groupby( as_index=as_index, sort=sort, group_keys=group_keys, - squeeze=squeeze, # type: ignore[arg-type] + squeeze=squeeze, observed=observed, dropna=dropna, ) From 897eed99f25e4baccfcbea5b988f4e05f3c35d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 18 May 2022 12:31:29 -0400 Subject: [PATCH 03/11] remove two more casts --- pandas/core/groupby/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2acf5c826eb57..090554f2eafe5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1580,7 +1580,7 @@ def idxmax( # DataFrame.idxmax for backwards compatibility numeric_only_arg = None if axis == 0 else False else: - numeric_only_arg = cast(bool, numeric_only) + numeric_only_arg = numeric_only def func(df): res = df._reduce( @@ -1616,7 +1616,7 @@ def idxmin( # DataFrame.idxmin for backwards compatibility numeric_only_arg = None if axis == 0 else False else: - numeric_only_arg = cast(bool, numeric_only) + numeric_only_arg = numeric_only def func(df): res = df._reduce( From 
1aa1ac7884181d267c5d7d4b631f810c84889bf5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 17 May 2022 18:01:08 -0700 Subject: [PATCH 04/11] ENH: DatetimeArray fields support non-nano (#47044) --- pandas/_libs/tslibs/fields.pyi | 2 ++ pandas/_libs/tslibs/fields.pyx | 35 ++++++++++++++------------- pandas/core/arrays/datetimes.py | 6 ++--- pandas/tests/arrays/test_datetimes.py | 26 +++++++++++++++++++- 4 files changed, 48 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslibs/fields.pyi b/pandas/_libs/tslibs/fields.pyi index e404eadf13657..b1d9e0342f81e 100644 --- a/pandas/_libs/tslibs/fields.pyi +++ b/pandas/_libs/tslibs/fields.pyi @@ -22,6 +22,7 @@ def get_start_end_field( def get_date_field( dtindex: npt.NDArray[np.int64], # const int64_t[:] field: str, + reso: int = ..., # NPY_DATETIMEUNIT ) -> npt.NDArray[np.int32]: ... def get_timedelta_field( tdindex: npt.NDArray[np.int64], # const int64_t[:] @@ -32,6 +33,7 @@ def isleapyear_arr( ) -> npt.NDArray[np.bool_]: ... def build_isocalendar_sarray( dtindex: npt.NDArray[np.int64], # const int64_t[:] + reso: int = ..., # NPY_DATETIMEUNIT ) -> np.ndarray: ... def _get_locale_names(name_type: str, locale: str | None = ...): ... diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 57d4c27b3337d..5865b8c6877b0 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -329,7 +329,7 @@ def get_start_end_field( @cython.wraparound(False) @cython.boundscheck(False) -def get_date_field(const int64_t[:] dtindex, str field): +def get_date_field(const int64_t[:] dtindex, str field, NPY_DATETIMEUNIT reso=NPY_FR_ns): """ Given a int64-based datetime index, extract the year, month, etc., field and return an array of these values. 
@@ -348,7 +348,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = dts.year return out @@ -359,7 +359,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = dts.month return out @@ -370,7 +370,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = dts.day return out @@ -381,8 +381,9 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = dts.hour + # TODO: can we de-dup with period.pyx s? return out elif field == 'm': @@ -392,7 +393,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = dts.min return out @@ -403,7 +404,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = dts.sec return out @@ -414,7 +415,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = dts.us return out @@ -425,7 +426,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = dts.ps // 1000 return out elif field == 'doy': @@ -435,7 +436,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - 
dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = get_day_of_year(dts.year, dts.month, dts.day) return out @@ -446,7 +447,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = dayofweek(dts.year, dts.month, dts.day) return out @@ -457,7 +458,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = get_week_of_year(dts.year, dts.month, dts.day) return out @@ -468,7 +469,7 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = dts.month out[i] = ((out[i] - 1) // 3) + 1 return out @@ -480,11 +481,11 @@ def get_date_field(const int64_t[:] dtindex, str field): out[i] = -1 continue - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) out[i] = get_days_in_month(dts.year, dts.month) return out elif field == 'is_leap_year': - return isleapyear_arr(get_date_field(dtindex, 'Y')) + return isleapyear_arr(get_date_field(dtindex, 'Y', reso=reso)) raise ValueError(f"Field {field} not supported") @@ -564,7 +565,7 @@ cpdef isleapyear_arr(ndarray years): @cython.wraparound(False) @cython.boundscheck(False) -def build_isocalendar_sarray(const int64_t[:] dtindex): +def build_isocalendar_sarray(const int64_t[:] dtindex, NPY_DATETIMEUNIT reso=NPY_FR_ns): """ Given a int64-based datetime array, return the ISO 8601 year, week, and day as a structured array. 
@@ -592,7 +593,7 @@ def build_isocalendar_sarray(const int64_t[:] dtindex): if dtindex[i] == NPY_NAT: ret_val = 0, 0, 0 else: - dt64_to_dtstruct(dtindex[i], &dts) + pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts) ret_val = get_iso_calendar(dts.year, dts.month, dts.day) iso_years[i] = ret_val[0] diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6f984727f4f6d..dadfad394b903 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -136,7 +136,7 @@ def f(self): values, field, self.freqstr, month_kw, reso=self._reso ) else: - result = fields.get_date_field(values, field) + result = fields.get_date_field(values, field, reso=self._reso) # these return a boolean by-definition return result @@ -146,7 +146,7 @@ def f(self): result = self._maybe_mask_results(result, fill_value=None) else: - result = fields.get_date_field(values, field) + result = fields.get_date_field(values, field, reso=self._reso) result = self._maybe_mask_results( result, fill_value=None, convert="float64" ) @@ -1403,7 +1403,7 @@ def isocalendar(self) -> DataFrame: from pandas import DataFrame values = self._local_timestamps() - sarray = fields.build_isocalendar_sarray(values) + sarray = fields.build_isocalendar_sarray(values, reso=self._reso) iso_calendar_df = DataFrame( sarray, columns=["year", "week", "day"], dtype="UInt32" ) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8eb5cc2dd82f6..897528cf18122 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -12,7 +12,15 @@ class TestNonNano: - @pytest.mark.parametrize("unit,reso", [("s", 7), ("ms", 8), ("us", 9)]) + @pytest.fixture(params=["s", "ms", "us"]) + def unit(self, request): + return request.param + + @pytest.fixture + def reso(self, unit): + # TODO: avoid hard-coding + return {"s": 7, "ms": 8, "us": 9}[unit] + @pytest.mark.xfail(reason="_box_func is not yet patched to get reso 
right") def test_non_nano(self, unit, reso): arr = np.arange(5, dtype=np.int64).view(f"M8[{unit}]") @@ -21,6 +29,22 @@ def test_non_nano(self, unit, reso): assert dta.dtype == arr.dtype assert dta[0]._reso == reso + @pytest.mark.filterwarnings( + "ignore:weekofyear and week have been deprecated:FutureWarning" + ) + @pytest.mark.parametrize( + "field", DatetimeArray._field_ops + DatetimeArray._bool_ops + ) + def test_fields(self, unit, reso, field): + dti = pd.date_range("2016-01-01", periods=55, freq="D") + arr = np.asarray(dti).astype(f"M8[{unit}]") + + dta = DatetimeArray._simple_new(arr, dtype=arr.dtype) + + res = getattr(dta, field) + expected = getattr(dti._data, field) + tm.assert_numpy_array_equal(res, expected) + class TestDatetimeArrayComparisons: # TODO: merge this into tests/arithmetic/test_datetime64 once it is From b503e448f535bf1e56614ea933b5a580fffd17fb Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 18 May 2022 08:54:43 -0400 Subject: [PATCH 05/11] DEPR: groupby numeric_only default (#47025) --- doc/source/whatsnew/v1.5.0.rst | 5 +- pandas/core/groupby/generic.py | 63 ++++++-- pandas/core/groupby/groupby.py | 145 +++++++++++++----- pandas/tests/extension/base/groupby.py | 12 +- pandas/tests/frame/test_stack_unstack.py | 4 +- pandas/tests/generic/test_frame.py | 4 +- .../tests/groupby/aggregate/test_aggregate.py | 10 +- pandas/tests/groupby/test_allowlist.py | 6 +- pandas/tests/groupby/test_categorical.py | 13 +- pandas/tests/groupby/test_function.py | 126 ++++++++++++++- pandas/tests/groupby/test_groupby.py | 131 +++++++++++----- pandas/tests/groupby/test_groupby_subclass.py | 4 +- pandas/tests/groupby/test_grouping.py | 33 ++-- pandas/tests/groupby/test_index_as_string.py | 7 +- pandas/tests/groupby/test_pipe.py | 4 +- pandas/tests/groupby/test_quantile.py | 5 +- pandas/tests/groupby/test_timegrouper.py | 48 ++++-- .../tests/groupby/transform/test_transform.py | 30 +++- 
pandas/tests/resample/test_resample_api.py | 26 +++- .../tests/resample/test_resampler_grouper.py | 4 +- pandas/tests/reshape/merge/test_join.py | 4 +- pandas/tests/reshape/test_pivot.py | 28 +++- 22 files changed, 544 insertions(+), 168 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 128fd68674f96..af30add139222 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -493,7 +493,8 @@ retained by specifying ``group_keys=False``. ``numeric_only`` default value ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Across the DataFrame operations such as ``min``, ``sum``, and ``idxmax``, the default +Across the DataFrame and DataFrameGroupBy operations such as +``min``, ``sum``, and ``idxmax``, the default value of the ``numeric_only`` argument, if it exists at all, was inconsistent. Furthermore, operations with the default value ``None`` can lead to surprising results. (:issue:`46560`) @@ -523,6 +524,8 @@ gained the ``numeric_only`` argument. 
- :meth:`DataFrame.cov` - :meth:`DataFrame.idxmin` - :meth:`DataFrame.idxmax` +- :meth:`.DataFrameGroupBy.cummin` +- :meth:`.DataFrameGroupBy.cummax` - :meth:`.DataFrameGroupBy.idxmin` - :meth:`.DataFrameGroupBy.idxmax` - :meth:`.GroupBy.var` diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f725ae061cedb..2acf5c826eb57 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -28,6 +28,7 @@ from pandas._libs import ( Interval, + lib, reduction as libreduction, ) from pandas._typing import ( @@ -1128,10 +1129,15 @@ def _wrap_applied_output_series( return self._reindex_output(result) def _cython_transform( - self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs + self, + how: str, + numeric_only: bool | lib.NoDefault = lib.no_default, + axis: int = 0, + **kwargs, ) -> DataFrame: assert axis == 0 # handled by caller # TODO: no tests with self.ndim == 1 for DataFrameGroupBy + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis) # With self.axis == 0, we have multi-block tests # e.g. test_rank_min_int, test_cython_transform_frame @@ -1139,7 +1145,8 @@ def _cython_transform( # With self.axis == 1, _get_data_to_aggregate does a transpose # so we always have a single block. 
mgr: Manager2D = self._get_data_to_aggregate() - if numeric_only: + orig_mgr_len = len(mgr) + if numeric_only_bool: mgr = mgr.get_numeric_data(copy=False) def arr_func(bvalues: ArrayLike) -> ArrayLike: @@ -1152,8 +1159,8 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True) res_mgr.set_axis(1, mgr.axes[1]) - if len(res_mgr) < len(mgr): - warn_dropping_nuisance_columns_deprecated(type(self), how) + if len(res_mgr) < orig_mgr_len: + warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) res_df = self.obj._constructor(res_mgr) if self.axis == 1: @@ -1269,7 +1276,9 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: output[i] = sgb.transform(wrapper) except TypeError: # e.g. trying to call nanmean with string values - warn_dropping_nuisance_columns_deprecated(type(self), "transform") + warn_dropping_nuisance_columns_deprecated( + type(self), "transform", numeric_only=False + ) else: inds.append(i) @@ -1559,19 +1568,27 @@ def nunique(self, dropna: bool = True) -> DataFrame: _shared_docs["idxmax"], numeric_only_default="True for axis=0, False for axis=1", ) - def idxmax(self, axis=0, skipna: bool = True, numeric_only: bool | None = None): + def idxmax( + self, + axis=0, + skipna: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, + ): axis = DataFrame._get_axis_number(axis) - if numeric_only is None: - numeric_only = None if axis == 0 else False + if numeric_only is lib.no_default: + # Cannot use self._resolve_numeric_only; we must pass None to + # DataFrame.idxmax for backwards compatibility + numeric_only_arg = None if axis == 0 else False + else: + numeric_only_arg = cast(bool, numeric_only) def func(df): - # NB: here we use numeric_only=None, in DataFrame it is False GH#38217 res = df._reduce( nanops.nanargmax, "argmax", axis=axis, skipna=skipna, - numeric_only=numeric_only, + numeric_only=numeric_only_arg, ) indices = res._values index = 
df._get_axis(axis) @@ -1579,25 +1596,35 @@ def func(df): return df._constructor_sliced(result, index=res.index) func.__name__ = "idxmax" - return self._python_apply_general(func, self._obj_with_exclusions) + result = self._python_apply_general(func, self._obj_with_exclusions) + self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only) + return result @doc( _shared_docs["idxmin"], numeric_only_default="True for axis=0, False for axis=1", ) - def idxmin(self, axis=0, skipna: bool = True, numeric_only: bool | None = None): + def idxmin( + self, + axis=0, + skipna: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, + ): axis = DataFrame._get_axis_number(axis) - if numeric_only is None: - numeric_only = None if axis == 0 else False + if numeric_only is lib.no_default: + # Cannot use self._resolve_numeric_only; we must pass None to + # DataFrame.idxmin for backwards compatibility + numeric_only_arg = None if axis == 0 else False + else: + numeric_only_arg = cast(bool, numeric_only) def func(df): - # NB: here we use numeric_only=None, in DataFrame it is False GH#46560 res = df._reduce( nanops.nanargmin, "argmin", axis=axis, skipna=skipna, - numeric_only=numeric_only, + numeric_only=numeric_only_arg, ) indices = res._values index = df._get_axis(axis) @@ -1605,7 +1632,9 @@ def func(df): return df._constructor_sliced(result, index=res.index) func.__name__ = "idxmin" - return self._python_apply_general(func, self._obj_with_exclusions) + result = self._python_apply_general(func, self._obj_with_exclusions) + self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only) + return result boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ddf460a85621b..d08d25ed58c3c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -939,8 +939,15 @@ def wrapper(*args, **kwargs): if kwargs.get("axis", None) is None: kwargs["axis"] = self.axis + numeric_only = 
kwargs.get("numeric_only", lib.no_default) + def curried(x): - return f(x, *args, **kwargs) + with warnings.catch_warnings(): + # Catch any warnings from dispatch to DataFrame; we'll emit + # a warning for groupby below + match = "The default value of numeric_only " + warnings.filterwarnings("ignore", match, FutureWarning) + return f(x, *args, **kwargs) # preserve the name so we can detect it when calling plot methods, # to avoid duplicates @@ -956,6 +963,13 @@ def curried(x): curried, self._obj_with_exclusions, is_transform=is_transform ) + if self._selected_obj.ndim != 1 and self.axis != 1: + missing = self._obj_with_exclusions.columns.difference(result.columns) + if len(missing) > 0: + warn_dropping_nuisance_columns_deprecated( + type(self), name, numeric_only + ) + if self.grouper.has_dropped_na and is_transform: # result will have dropped rows due to nans, fill with null # and ensure index is ordered same as the input @@ -1223,7 +1237,9 @@ def _wrap_applied_output( ): raise AbstractMethodError(self) - def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: + def _resolve_numeric_only( + self, numeric_only: bool | lib.NoDefault, axis: int + ) -> bool: """ Determine subclass-specific default value for 'numeric_only'. @@ -1233,6 +1249,8 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: Parameters ---------- numeric_only : bool or lib.no_default + axis : int + Axis passed to the groupby op (not self.axis). Returns ------- @@ -1243,7 +1261,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # i.e. not explicitly passed by user if self.obj.ndim == 2: # i.e. 
DataFrameGroupBy - numeric_only = True + numeric_only = axis != 1 # GH#42395 GH#43108 GH#43154 # Regression from 1.2.5 to 1.3 caused object columns to be dropped if self.axis: @@ -1253,13 +1271,33 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: check = obj._get_numeric_data() if len(obj.columns) and not len(check.columns) and not obj.empty: numeric_only = False - # TODO: v1.4+ Add FutureWarning else: numeric_only = False return numeric_only + def _maybe_warn_numeric_only_depr( + self, how: str, result: DataFrame | Series, numeric_only: bool | lib.NoDefault + ) -> None: + """Emit warning on numeric_only behavior deprecation when appropriate. + + Parameters + ---------- + how : str + Groupby kernel name. + result : + Result of the groupby operation. + numeric_only : bool or lib.no_default + Argument as passed by user. + """ + if ( + self._obj_with_exclusions.ndim != 1 + and result.ndim > 1 + and len(result.columns) < len(self._obj_with_exclusions.columns) + ): + warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) + # ----------------------------------------------------------------- # numba @@ -1520,7 +1558,9 @@ def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs): except TypeError: if raise_on_typeerror: raise - warn_dropping_nuisance_columns_deprecated(type(self), "agg") + warn_dropping_nuisance_columns_deprecated( + type(self), "agg", numeric_only=False + ) continue key = base.OutputKey(label=name, position=idx) @@ -1534,7 +1574,7 @@ def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs): @final def _agg_general( self, - numeric_only: bool = True, + numeric_only: bool | lib.NoDefault = True, min_count: int = -1, *, alias: str, @@ -1596,17 +1636,19 @@ def _cython_agg_general( self, how: str, alt: Callable, - numeric_only: bool, + numeric_only: bool | lib.NoDefault, min_count: int = -1, ignore_failures: bool = True, ): # Note: we never get here with how="ohlc" for 
DataFrameGroupBy; # that goes through SeriesGroupBy + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) data = self._get_data_to_aggregate() is_ser = data.ndim == 1 - if numeric_only: + orig_len = len(data) + if numeric_only_bool: if is_ser and not is_numeric_dtype(self._selected_obj.dtype): # GH#41291 match Series behavior kwd_name = "numeric_only" @@ -1636,8 +1678,8 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures) - if not is_ser and len(new_mgr) < len(data): - warn_dropping_nuisance_columns_deprecated(type(self), how) + if not is_ser and len(new_mgr) < orig_len: + warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) res = self._wrap_agged_manager(new_mgr) if is_ser: @@ -1995,7 +2037,7 @@ def mean( 2 4.0 Name: B, dtype: float64 """ - numeric_only_bool = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) if maybe_use_numba(engine): from pandas.core._numba.kernels import sliding_mean @@ -2005,7 +2047,7 @@ def mean( result = self._cython_agg_general( "mean", alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool), - numeric_only=numeric_only_bool, + numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @@ -2029,12 +2071,12 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): Series or DataFrame Median of values within each group. 
""" - numeric_only_bool = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) result = self._cython_agg_general( "median", alt=lambda x: Series(x).median(numeric_only=numeric_only_bool), - numeric_only=numeric_only_bool, + numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @@ -2090,7 +2132,7 @@ def std( return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) else: - return self._get_cythonized_result( + result = self._get_cythonized_result( libgroupby.group_var, cython_dtype=np.dtype(np.float64), numeric_only=numeric_only, @@ -2098,6 +2140,8 @@ def std( post_processing=lambda vals, inference: np.sqrt(vals), ddof=ddof, ) + self._maybe_warn_numeric_only_depr("std", result, numeric_only) + return result @final @Substitution(name="groupby") @@ -2151,12 +2195,12 @@ def var( return self._numba_agg_general(sliding_var, engine_kwargs, ddof) else: - numeric_only_bool = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) if ddof == 1: return self._cython_agg_general( "var", alt=lambda x: Series(x).var(ddof=ddof), - numeric_only=numeric_only_bool, + numeric_only=numeric_only, ignore_failures=numeric_only is lib.no_default, ) else: @@ -2191,6 +2235,8 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default Standard error of the mean of values within each group. """ result = self.std(ddof=ddof, numeric_only=numeric_only) + self._maybe_warn_numeric_only_depr("sem", result, numeric_only) + if result.ndim == 1: result /= np.sqrt(self.count()) else: @@ -2251,8 +2297,6 @@ def sum( engine_kwargs, ) else: - numeric_only = self._resolve_numeric_only(numeric_only) - # If we are grouping on categoricals we want unobserved categories to # return zero, rather than the default of NaN which the reindexing in # _agg_general() returns. 
GH #31422 @@ -2271,8 +2315,6 @@ def sum( def prod( self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 ): - numeric_only = self._resolve_numeric_only(numeric_only) - return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) @@ -3048,7 +3090,7 @@ def quantile( a 2.0 b 3.0 """ - numeric_only_bool = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: if is_object_dtype(vals): @@ -3151,7 +3193,9 @@ def blk_func(values: ArrayLike) -> ArrayLike: and not is_ser and len(res_mgr.items) != len(mgr.items) ): - warn_dropping_nuisance_columns_deprecated(type(self), "quantile") + warn_dropping_nuisance_columns_deprecated( + type(self), "quantile", numeric_only + ) if len(res_mgr.items) == 0: # re-call grouped_reduce to get the desired exception message @@ -3445,7 +3489,7 @@ def cumsum(self, axis=0, *args, **kwargs): @final @Substitution(name="groupby") @Appender(_common_see_also) - def cummin(self, axis=0, **kwargs): + def cummin(self, axis=0, numeric_only=False, **kwargs): """ Cumulative min for each group. @@ -3458,12 +3502,14 @@ def cummin(self, axis=0, **kwargs): f = lambda x: np.minimum.accumulate(x, axis) return self._python_apply_general(f, self._selected_obj, is_transform=True) - return self._cython_transform("cummin", numeric_only=False, skipna=skipna) + return self._cython_transform( + "cummin", numeric_only=numeric_only, skipna=skipna + ) @final @Substitution(name="groupby") @Appender(_common_see_also) - def cummax(self, axis=0, **kwargs): + def cummax(self, axis=0, numeric_only=False, **kwargs): """ Cumulative max for each group. 
@@ -3476,7 +3522,9 @@ def cummax(self, axis=0, **kwargs): f = lambda x: np.maximum.accumulate(x, axis) return self._python_apply_general(f, self._selected_obj, is_transform=True) - return self._cython_transform("cummax", numeric_only=False, skipna=skipna) + return self._cython_transform( + "cummax", numeric_only=numeric_only, skipna=skipna + ) @final def _get_cythonized_result( @@ -3530,7 +3578,7 @@ def _get_cythonized_result( ------- `Series` or `DataFrame` with filled values """ - numeric_only = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) if post_processing and not callable(post_processing): raise ValueError("'post_processing' must be a callable!") @@ -3599,15 +3647,16 @@ def blk_func(values: ArrayLike) -> ArrayLike: # Operate block-wise instead of column-by-column is_ser = obj.ndim == 1 mgr = self._get_data_to_aggregate() + orig_mgr_len = len(mgr) - if numeric_only: + if numeric_only_bool: mgr = mgr.get_numeric_data() res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True) - if not is_ser and len(res_mgr.items) != len(mgr.items): + if not is_ser and len(res_mgr.items) != orig_mgr_len: howstr = how.replace("group_", "") - warn_dropping_nuisance_columns_deprecated(type(self), howstr) + warn_dropping_nuisance_columns_deprecated(type(self), howstr, numeric_only) if len(res_mgr.items) == 0: # We re-call grouped_reduce to get the right exception message @@ -4153,13 +4202,27 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde return mi -def warn_dropping_nuisance_columns_deprecated(cls, how: str) -> None: - warnings.warn( - "Dropping invalid columns in " - f"{cls.__name__}.{how} is deprecated. " - "In a future version, a TypeError will be raised. 
" - f"Before calling .{how}, select only columns which " - "should be valid for the function.", - FutureWarning, - stacklevel=find_stack_level(), - ) +def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None: + if how == "add": + how = "sum" + if numeric_only is not lib.no_default and not numeric_only: + # numeric_only was specified and falsey but still dropped nuisance columns + warnings.warn( + "Dropping invalid columns in " + f"{cls.__name__}.{how} is deprecated. " + "In a future version, a TypeError will be raised. " + f"Before calling .{how}, select only columns which " + "should be valid for the function.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif numeric_only is lib.no_default: + warnings.warn( + "The default value of numeric_only in " + f"{cls.__name__}.{how} is deprecated. " + "In a future version, numeric_only will default to False. " + f"Either specify numeric_only or select only columns which " + "should be valid for the function.", + FutureWarning, + stacklevel=find_stack_level(), + ) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 336865d32167d..711f1835446a5 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -1,5 +1,7 @@ import pytest +from pandas.core.dtypes.common import is_numeric_dtype + import pandas as pd import pandas._testing as tm from pandas.tests.extension.base.base import BaseExtensionTests @@ -96,7 +98,15 @@ def test_in_numeric_groupby(self, data_for_grouping): "C": [1, 1, 1, 1, 1, 1, 1, 1], } ) - result = df.groupby("A").sum().columns + + dtype = data_for_grouping.dtype + if is_numeric_dtype(dtype) or dtype.name == "decimal": + warn = None + else: + warn = FutureWarning + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + result = df.groupby("A").sum().columns if data_for_grouping.dtype._is_numeric: expected = pd.Index(["B", "C"]) diff --git 
a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index ba89a76a7f8c2..fedcc0e2a2284 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1785,7 +1785,9 @@ def test_stack_multiple_bug(self): multi = df.set_index(["DATE", "ID"]) multi.columns.name = "Params" unst = multi.unstack("ID") - down = unst.resample("W-THU").mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + down = unst.resample("W-THU").mean() rs = down.stack("ID") xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID") diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 2b248afb42057..b4a3a60e72139 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -71,7 +71,9 @@ def test_metadata_propagation_indiv_groupby(self): "D": np.random.randn(8), } ) - result = df.groupby("A").sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").sum() tm.assert_metadata_equivalent(df, result) def test_metadata_propagation_indiv_resample(self): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index bdb33bff5eadd..37b02571158b9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -238,7 +238,10 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype [[1, 2, 3, 4, 5, 6]] * 3, columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), ).astype({("a", "j"): dtype, ("b", "j"): dtype}) - result = df.groupby(level=1, axis=1).agg(func) + warn = FutureWarning if func == "std" else None + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + result = df.groupby(level=1, axis=1).agg(func) expected = DataFrame([expected] * 
3, columns=["i", "j", "k"]).astype( result_dtype_dict ) @@ -262,7 +265,10 @@ def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): columns=Index([10, 20, 10, 20], name="x"), dtype="int64", ).astype({10: "Int64"}) - result = df.groupby("x", axis=1).agg(func) + warn = FutureWarning if func == "std" else None + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + result = df.groupby("x", axis=1).agg(func) expected = DataFrame( data=expected_data, index=Index([0, 1, 0], name="y"), diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index 7c64d82608c9e..e541abb368a02 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -187,7 +187,9 @@ def test_regression_allowlist_methods(raw_frame, op, level, axis, skipna, sort): if op in AGG_FUNCTIONS_WITH_SKIPNA: grouped = frame.groupby(level=level, axis=axis, sort=sort) - with tm.assert_produces_warning(warn, match="The 'mad' method is deprecated"): + with tm.assert_produces_warning( + warn, match="The 'mad' method is deprecated", raise_on_extra_warnings=False + ): result = getattr(grouped, op)(skipna=skipna) with tm.assert_produces_warning(FutureWarning): expected = getattr(frame, op)(level=level, axis=axis, skipna=skipna) @@ -196,8 +198,8 @@ def test_regression_allowlist_methods(raw_frame, op, level, axis, skipna, sort): tm.assert_frame_equal(result, expected) else: grouped = frame.groupby(level=level, axis=axis, sort=sort) - result = getattr(grouped, op)() with tm.assert_produces_warning(FutureWarning): + result = getattr(grouped, op)() expected = getattr(frame, op)(level=level, axis=axis) if sort: expected = expected.sort_index(axis=axis, level=level) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index abe1b8f13e32e..004e55f4d161f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ 
b/pandas/tests/groupby/test_categorical.py @@ -103,7 +103,9 @@ def test_basic(): # TODO: split this test gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) - result = gb.sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 @@ -314,6 +316,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:.*value of numeric_only.*:FutureWarning") def test_observed(observed): # multiple groupers, don't re-expand the output space # of the grouper @@ -807,8 +810,12 @@ def test_preserve_categorical_dtype(): } ) for col in ["C1", "C2"]: - result1 = df.groupby(by=col, as_index=False, observed=False).mean() - result2 = df.groupby(by=col, as_index=True, observed=False).mean().reset_index() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby(by=col, as_index=False, observed=False).mean() + result2 = ( + df.groupby(by=col, as_index=True, observed=False).mean().reset_index() + ) expected = exp_full.reindex(columns=result1.columns) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index c99405dfccb66..206d37e1a800e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas._libs import lib from pandas.errors import UnsupportedFunctionCall import pandas as pd @@ -259,7 +260,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # these have numeric_only kwarg, but default to False warn = FutureWarning - with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + 
with tm.assert_produces_warning( + warn, match="Dropping invalid columns", raise_on_extra_warnings=False + ): result = getattr(gb, method)() tm.assert_index_equal(result.columns, expected_columns_numeric) @@ -297,24 +300,26 @@ def gni(self, df): return gni # TODO: non-unique columns, as_index=False - @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") def test_idxmax(self, gb): # object dtype so idxmax goes through _aggregate_item_by_item # GH#5610 # non-cython calls should not include the grouper expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3]) expected.index.name = "A" - result = gb.idxmax() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.idxmax() tm.assert_frame_equal(result, expected) - @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") def test_idxmin(self, gb): # object dtype so idxmax goes through _aggregate_item_by_item # GH#5610 # non-cython calls should not include the grouper expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3]) expected.index.name = "A" - result = gb.idxmin() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.idxmin() tm.assert_frame_equal(result, expected) def test_mad(self, gb, gni): @@ -1238,3 +1243,114 @@ def test_groupby_sum_timedelta_with_nat(): res = gb["b"].sum(min_count=2) expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) tm.assert_series_equal(res, expected) + + +@pytest.mark.parametrize( + "kernel, numeric_only_default, drops_nuisance, has_arg", + [ + ("all", False, False, False), + ("any", False, False, False), + ("bfill", False, False, False), + ("corr", True, False, True), + ("corrwith", True, False, True), + ("cov", True, False, True), + ("cummax", False, True, True), + ("cummin", False, True, True), + ("cumprod", True, True, True), + ("cumsum", True, True, True), + ("diff", 
False, False, False), + ("ffill", False, False, False), + ("fillna", False, False, False), + ("first", False, False, True), + ("idxmax", True, False, True), + ("idxmin", True, False, True), + ("last", False, False, True), + ("max", False, True, True), + ("mean", True, True, True), + ("median", True, True, True), + ("min", False, True, True), + ("nth", False, False, False), + ("nunique", False, False, False), + ("pct_change", False, False, False), + ("prod", True, True, True), + ("quantile", True, False, True), + ("sem", True, True, True), + ("skew", True, False, True), + ("std", True, True, True), + ("sum", True, True, True), + ("var", True, False, True), + ], +) +@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default]) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_deprecate_numeric_only( + kernel, numeric_only_default, drops_nuisance, has_arg, numeric_only, keys +): + # GH#46072 + # drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False + # has_arg: Whether the op has a numeric_only arg + df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]}) + + if kernel == "corrwith": + args = (df,) + elif kernel == "nth" or kernel == "fillna": + args = (0,) + else: + args = () + kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only} + + gb = df.groupby(keys) + method = getattr(gb, kernel) + if has_arg and ( + # Cases where b does not appear in the result + numeric_only is True + or (numeric_only is lib.no_default and numeric_only_default) + or drops_nuisance + ): + if numeric_only is True or (not numeric_only_default and not drops_nuisance): + warn = None + else: + warn = FutureWarning + if numeric_only is lib.no_default and numeric_only_default: + msg = f"The default value of numeric_only in DataFrameGroupBy.{kernel}" + else: + msg = f"Dropping invalid columns in DataFrameGroupBy.{kernel}" + with tm.assert_produces_warning(warn, match=msg): + result = 
method(*args, **kwargs) + + assert "b" not in result.columns + elif ( + # kernels that work on any dtype and have numeric_only arg + kernel in ("first", "last", "corrwith") + or ( + # kernels that work on any dtype and don't have numeric_only arg + kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") + and numeric_only is lib.no_default + ) + ): + result = method(*args, **kwargs) + assert "b" in result.columns + elif has_arg: + assert numeric_only is not True + assert numeric_only is not lib.no_default or numeric_only_default is False + assert not drops_nuisance + # kernels that are successful on any dtype were above; this will fail + msg = ( + "(not allowed for this dtype" + "|must be a string or a number" + "|cannot be performed against 'object' dtypes" + "|must be a string or a real number)" + ) + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) + elif not has_arg and numeric_only is not lib.no_default: + with pytest.raises( + TypeError, match="got an unexpected keyword argument 'numeric_only'" + ): + method(*args, **kwargs) + else: + assert kernel in ("diff", "pct_change") + assert numeric_only is lib.no_default + # Doesn't have numeric_only argument and fails on nuisance columns + with pytest.raises(TypeError, match=r"unsupported operand type"): + method(*args, **kwargs) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 016e817e43402..61951292d55a8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -474,13 +474,17 @@ def test_frame_groupby_columns(tsframe): def test_frame_set_name_single(df): grouped = df.groupby("A") - result = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.mean() assert result.index.name == "A" - result = df.groupby("A", as_index=False).mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = 
df.groupby("A", as_index=False).mean() assert result.index.name != "A" - result = grouped.agg(np.mean) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.agg(np.mean) assert result.index.name == "A" result = grouped.agg({"C": np.mean, "D": np.std}) @@ -503,8 +507,10 @@ def test_multi_func(df): col2 = df["B"] grouped = df.groupby([col1.get, col2.get]) - agged = grouped.mean() - expected = df.groupby(["A", "B"]).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + agged = grouped.mean() + expected = df.groupby(["A", "B"]).mean() # TODO groupby get drops names tm.assert_frame_equal( @@ -661,13 +667,16 @@ def test_groupby_as_index_agg(df): # single-key - result = grouped.agg(np.mean) - expected = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.agg(np.mean) + expected = grouped.mean() tm.assert_frame_equal(result, expected) result2 = grouped.agg({"C": np.mean, "D": np.sum}) - expected2 = grouped.mean() - expected2["D"] = grouped.sum()["D"] + with tm.assert_produces_warning(FutureWarning, match=msg): + expected2 = grouped.mean() + expected2["D"] = grouped.sum()["D"] tm.assert_frame_equal(result2, expected2) grouped = df.groupby("A", as_index=True) @@ -754,8 +763,10 @@ def test_as_index_series_return_frame(df): grouped = df.groupby("A", as_index=False) grouped2 = df.groupby(["A", "B"], as_index=False) - result = grouped["C"].agg(np.sum) - expected = grouped.agg(np.sum).loc[:, ["A", "C"]] + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped["C"].agg(np.sum) + expected = grouped.agg(np.sum).loc[:, ["A", "C"]] assert isinstance(result, DataFrame) tm.assert_frame_equal(result, expected) @@ -765,7 +776,8 @@ def test_as_index_series_return_frame(df): tm.assert_frame_equal(result2, expected2) result = grouped["C"].sum() - expected 
= grouped.sum().loc[:, ["A", "C"]] + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = grouped.sum().loc[:, ["A", "C"]] assert isinstance(result, DataFrame) tm.assert_frame_equal(result, expected) @@ -789,8 +801,10 @@ def test_groupby_as_index_cython(df): # single-key grouped = data.groupby("A", as_index=False) - result = grouped.mean() - expected = data.groupby(["A"]).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.mean() + expected = data.groupby(["A"]).mean() expected.insert(0, "A", expected.index) expected.index = np.arange(len(expected)) tm.assert_frame_equal(result, expected) @@ -859,15 +873,18 @@ def test_groupby_multi_corner(df): def test_omit_nuisance(df): grouped = df.groupby("A") - agged = grouped.agg(np.mean) - exp = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + agged = grouped.agg(np.mean) + exp = grouped.mean() tm.assert_frame_equal(agged, exp) df = df.loc[:, ["A", "C", "D"]] df["E"] = datetime.now() grouped = df.groupby("A") - result = grouped.agg(np.sum) - expected = grouped.sum() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.agg(np.sum) + expected = grouped.sum() tm.assert_frame_equal(result, expected) # won't work with axis = 1 @@ -898,7 +915,7 @@ def test_keep_nuisance_agg(df, agg_function): @pytest.mark.parametrize("numeric_only", [lib.no_default, True, False]) def test_omit_nuisance_agg(df, agg_function, numeric_only): # GH 38774, GH 38815 - if not numeric_only and agg_function != "sum": + if numeric_only is lib.no_default or (not numeric_only and agg_function != "sum"): # sum doesn't drop strings warn = FutureWarning else: @@ -913,7 +930,13 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): with pytest.raises(klass, match="could not convert string to float"): getattr(grouped, agg_function)(numeric_only=numeric_only) 
else: - with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + if numeric_only is lib.no_default: + msg = ( + f"The default value of numeric_only in DataFrameGroupBy.{agg_function}" + ) + else: + msg = "Dropping invalid columns" + with tm.assert_produces_warning(warn, match=msg): result = getattr(grouped, agg_function)(numeric_only=numeric_only) if ( (numeric_only is lib.no_default or not numeric_only) @@ -923,9 +946,18 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): columns = ["A", "B", "C", "D"] else: columns = ["A", "C", "D"] - expected = getattr(df.loc[:, columns].groupby("A"), agg_function)( - numeric_only=numeric_only - ) + if agg_function == "sum" and numeric_only is False: + # sum doesn't drop nuisance string columns + warn = None + elif agg_function in ("sum", "std", "var", "sem") and numeric_only is not True: + warn = FutureWarning + else: + warn = None + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + expected = getattr(df.loc[:, columns].groupby("A"), agg_function)( + numeric_only=numeric_only + ) tm.assert_frame_equal(result, expected) @@ -941,8 +973,10 @@ def test_omit_nuisance_warnings(df): def test_omit_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) - agged = grouped.agg(np.mean) - exp = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + agged = grouped.agg(np.mean) + exp = grouped.mean() tm.assert_frame_equal(agged, exp) @@ -959,8 +993,10 @@ def test_empty_groups_corner(mframe): ) grouped = df.groupby(["k1", "k2"]) - result = grouped.agg(np.mean) - expected = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.agg(np.mean) + expected = grouped.mean() tm.assert_frame_equal(result, expected) grouped = mframe[3:5].groupby(level=0) @@ -982,7 +1018,9 @@ def 
test_wrap_aggregated_output_multindex(mframe): df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - agged = df.groupby(keys).agg(np.mean) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + agged = df.groupby(keys).agg(np.mean) assert isinstance(agged.columns, MultiIndex) def aggfun(ser): @@ -1143,15 +1181,19 @@ def test_groupby_with_hier_columns(): # add a nuisance column sorted_columns, _ = columns.sortlevel(0) df["A", "foo"] = "bar" - result = df.groupby(level=0).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(level=0).mean() tm.assert_index_equal(result.columns, df.columns[:-1]) def test_grouping_ndarray(df): grouped = df.groupby(df["A"].values) - result = grouped.sum() - expected = df.groupby("A").sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.sum() + expected = df.groupby("A").sum() tm.assert_frame_equal( result, expected, check_names=False ) # Note: no names when grouping by value @@ -1179,8 +1221,10 @@ def test_groupby_wrong_multi_labels(): def test_groupby_series_with_name(df): - result = df.groupby(df["A"]).mean() - result2 = df.groupby(df["A"], as_index=False).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(df["A"]).mean() + result2 = df.groupby(df["A"], as_index=False).mean() assert result.index.name == "A" assert "A" in result2 @@ -1331,8 +1375,10 @@ def test_groupby_unit64_float_conversion(): def test_groupby_list_infer_array_like(df): - result = df.groupby(list(df["A"])).mean() - expected = df.groupby(df["A"]).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(list(df["A"])).mean() + expected = df.groupby(df["A"]).mean() 
tm.assert_frame_equal(result, expected, check_names=False) with pytest.raises(KeyError, match=r"^'foo'$"): @@ -1445,7 +1491,9 @@ def test_groupby_2d_malformed(): d["zeros"] = [0, 0] d["ones"] = [1, 1] d["label"] = ["l1", "l2"] - tmp = d.groupby(["group"]).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + tmp = d.groupby(["group"]).mean() res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -1611,10 +1659,13 @@ def f(group): def test_no_dummy_key_names(df): # see gh-1291 - result = df.groupby(df["A"].values).sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(df["A"].values).sum() assert result.index.name is None - result = df.groupby([df["A"].values, df["B"].values]).sum() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([df["A"].values, df["B"].values]).sum() assert result.index.names == (None, None) @@ -2634,7 +2685,9 @@ def test_groupby_aggregation_numeric_with_non_numeric_dtype(): ) gb = df.groupby(by=["x"]) - result = gb.sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.sum() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 54cde30ceac92..b665843728165 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -112,5 +112,7 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - result = df.groupby("Buyer").resample("5D").sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = 
df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index c6e4bec3f7b2c..85602fdf7274a 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -59,8 +59,10 @@ def test_column_select_via_attr(self, df): tm.assert_series_equal(result, expected) df["mean"] = 1.5 - result = df.groupby("A").mean() - expected = df.groupby("A").agg(np.mean) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").mean() + expected = df.groupby("A").agg(np.mean) tm.assert_frame_equal(result, expected) def test_getitem_list_of_columns(self): @@ -284,25 +286,30 @@ def test_grouper_column_and_index(self): {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]}, index=idx, ) - result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean() - expected = df_multi.reset_index().groupby(["B", "inner"]).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean() + expected = df_multi.reset_index().groupby(["B", "inner"]).mean() tm.assert_frame_equal(result, expected) # Test the reverse grouping order - result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean() - expected = df_multi.reset_index().groupby(["inner", "B"]).mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean() + expected = df_multi.reset_index().groupby(["inner", "B"]).mean() tm.assert_frame_equal(result, expected) # Grouping a single-index frame by a column and the index should # be equivalent to resetting the index and grouping by two columns df_single = df_multi.reset_index("outer") - result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean() - expected = 
df_single.reset_index().groupby(["B", "inner"]).mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean() + expected = df_single.reset_index().groupby(["B", "inner"]).mean() tm.assert_frame_equal(result, expected) # Test the reverse grouping order - result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean() - expected = df_single.reset_index().groupby(["inner", "B"]).mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean() + expected = df_single.reset_index().groupby(["inner", "B"]).mean() tm.assert_frame_equal(result, expected) def test_groupby_levels_and_columns(self): @@ -376,8 +383,10 @@ def test_empty_groups(self, df): def test_groupby_grouper(self, df): grouped = df.groupby("A") - result = df.groupby(grouped.grouper).mean() - expected = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(grouped.grouper).mean() + expected = grouped.mean() tm.assert_frame_equal(result, expected) def test_groupby_dict_mapping(self): diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 971a447b84cae..501a21981a148 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -47,8 +47,11 @@ def series(): ], ) def test_grouper_index_level_as_string(frame, key_strs, groupers): - result = frame.groupby(key_strs).mean() - expected = frame.groupby(groupers).mean() + warn = FutureWarning if "B" not in key_strs or "outer" in frame.columns else None + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + result = frame.groupby(key_strs).mean() + expected = frame.groupby(groupers).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_pipe.py 
b/pandas/tests/groupby/test_pipe.py index 1229251f88c7d..4f58bcb5ee763 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -60,7 +60,9 @@ def f(dfgb, arg1): ) def g(dfgb, arg2): - return dfgb.sum() / dfgb.sum().sum() + arg2 + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + return dfgb.sum() / dfgb.sum().sum() + arg2 def h(df, arg3): return df.x + df.y - arg3 diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 0f7e71c99584d..20328426a69b2 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -246,9 +246,10 @@ def test_groupby_quantile_nullable_array(values, q): def test_groupby_quantile_skips_invalid_dtype(q, numeric_only): df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) - if numeric_only is None or numeric_only: + if numeric_only is lib.no_default or numeric_only: warn = FutureWarning if numeric_only is lib.no_default else None - with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + msg = "The default value of numeric_only in DataFrameGroupBy.quantile" + with tm.assert_produces_warning(warn, match=msg): result = df.groupby("a").quantile(q, numeric_only=numeric_only) expected = df.groupby("a")[["b"]].quantile(q) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 7c9d6e7a73087..ae725cbb2b588 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -105,14 +105,18 @@ def test_groupby_with_timegrouper(self): ) expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64") - result1 = df.resample("5D").sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.resample("5D").sum() tm.assert_frame_equal(result1, expected) df_sorted = df.sort_index() - result2 = 
df_sorted.groupby(Grouper(freq="5D")).sum() + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df_sorted.groupby(Grouper(freq="5D")).sum() tm.assert_frame_equal(result2, expected) - result3 = df.groupby(Grouper(freq="5D")).sum() + with tm.assert_produces_warning(FutureWarning, match=msg): + result3 = df.groupby(Grouper(freq="5D")).sum() tm.assert_frame_equal(result3, expected) @pytest.mark.parametrize("should_sort", [True, False]) @@ -186,7 +190,9 @@ def test_timegrouper_with_reg_groups(self): } ).set_index(["Date", "Buyer"]) - result = df.groupby([Grouper(freq="A"), "Buyer"]).sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([Grouper(freq="A"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) expected = DataFrame( @@ -201,7 +207,8 @@ def test_timegrouper_with_reg_groups(self): ], } ).set_index(["Date", "Buyer"]) - result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) df_original = DataFrame( @@ -239,10 +246,13 @@ def test_timegrouper_with_reg_groups(self): } ).set_index(["Date", "Buyer"]) - result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum() + warn_msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum() expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -258,7 +268,8 @@ def test_timegrouper_with_reg_groups(self): # passing the name df = df.reset_index() - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() + with 
tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): @@ -266,9 +277,11 @@ def test_timegrouper_with_reg_groups(self): # passing the level df = df.set_index("Date") - result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum() tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): @@ -277,7 +290,8 @@ def test_timegrouper_with_reg_groups(self): # multi names df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -306,18 +320,22 @@ def test_timegrouper_with_reg_groups(self): [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date" ), ) - result = df.groupby(Grouper(freq="1M")).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby(Grouper(freq="1M")).sum() tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M")]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M")]).sum() tm.assert_frame_equal(result, expected) expected.index = expected.index.shift(1) assert expected.index.freq == offsets.MonthEnd() - result = df.groupby(Grouper(freq="1M", 
key="Date")).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby(Grouper(freq="1M", key="Date")).sum() tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", key="Date")]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M", key="Date")]).sum() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"]) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 0492b143eaf1f..b325edaf2b1ea 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -203,15 +203,24 @@ def test_transform_axis_1_reducer(request, reduction_func): ): marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#45986") request.node.add_marker(marker) - warn = FutureWarning if reduction_func == "mad" else None + if reduction_func == "mad": + warn = FutureWarning + msg = "The 'mad' method is deprecated" + elif reduction_func in ("sem", "std"): + warn = FutureWarning + msg = "The default value of numeric_only" + else: + warn = None + msg = "" df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) - with tm.assert_produces_warning(warn, match="The 'mad' method is deprecated"): + with tm.assert_produces_warning(warn, match=msg): result = df.groupby([0, 0, 1], axis=1).transform(reduction_func) if reduction_func == "size": # size doesn't behave in the same manner; hardcode expected result expected = DataFrame(2 * [[2, 2, 1]], index=df.index, columns=df.columns) else: + warn = FutureWarning if reduction_func == "mad" else None with tm.assert_produces_warning(warn, match="The 'mad' method is deprecated"): expected = df.T.groupby([0, 0, 1]).transform(reduction_func).T tm.assert_equal(result, expected) @@ -462,8 +471,10 @@ def test_transform_exclude_nuisance(df): def 
test_transform_function_aliases(df): - result = df.groupby("A").transform("mean") - expected = df.groupby("A").transform(np.mean) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").transform("mean") + expected = df.groupby("A").transform(np.mean) tm.assert_frame_equal(result, expected) result = df.groupby("A")["C"].transform("mean") @@ -774,8 +785,15 @@ def test_cython_transform_frame(op, args, targop): expected = gb.apply(targop) expected = expected.sort_index(axis=1) - tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1)) - tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) + + warn = None if op == "shift" else FutureWarning + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + result = gb.transform(op, *args).sort_index(axis=1) + tm.assert_frame_equal(result, expected) + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, op)(*args).sort_index(axis=1) + tm.assert_frame_equal(result, expected) # individual columns for c in df: if ( diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index b5bae4759090a..21ef078bcf418 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -90,9 +90,10 @@ def test_groupby_resample_on_api(): } ) - expected = df.set_index("dates").groupby("key").resample("D").mean() - - result = df.groupby("key").resample("D", on="dates").mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.set_index("dates").groupby("key").resample("D").mean() + result = df.groupby("key").resample("D", on="dates").mean() tm.assert_frame_equal(result, expected) @@ -196,7 +197,9 @@ def tests_skip_nuisance(test_frame): tm.assert_frame_equal(result, expected) expected = r[["A", "B", "C"]].sum() - result = 
r.sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = r.sum() tm.assert_frame_equal(result, expected) @@ -643,10 +646,15 @@ def test_selection_api_validation(): exp = df_exp.resample("2D").sum() exp.index.name = "date" - tm.assert_frame_equal(exp, df.resample("2D", on="date").sum()) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.resample("2D", on="date").sum() + tm.assert_frame_equal(exp, result) exp.index.name = "d" - tm.assert_frame_equal(exp, df.resample("2D", level="d").sum()) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.resample("2D", level="d").sum() + tm.assert_frame_equal(exp, result) @pytest.mark.parametrize( @@ -809,9 +817,13 @@ def test_frame_downsample_method(method, numeric_only, expected_data): func = getattr(resampled, method) if method == "prod" and numeric_only is not True: warn = FutureWarning + msg = "Dropping invalid columns in DataFrameGroupBy.prod is deprecated" + elif method == "sum" and numeric_only is lib.no_default: + warn = FutureWarning + msg = "The default value of numeric_only in DataFrameGroupBy.sum is deprecated" else: warn = None - msg = "Dropping invalid columns in DataFrameGroupBy.prod is deprecated" + msg = "" with tm.assert_produces_warning(warn, match=msg): result = func(numeric_only=numeric_only) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index cae2d77dfbd3f..5392ec88544a1 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -408,7 +408,9 @@ def test_resample_groupby_agg(): df["date"] = pd.to_datetime(df["date"]) resampled = df.groupby("cat").resample("Y", on="date") - expected = resampled.sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = resampled.sum() 
result = resampled.agg({"num": "sum"}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 5d6df078ee8c3..905c2af2d22a5 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -553,7 +553,9 @@ def test_mixed_type_join_with_suffix(self): df.insert(5, "dt", "foo") grouped = df.groupby("id") - mn = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + mn = grouped.mean() cn = grouped.count() # it works! diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 31f720b9ec336..0d3b9f4561b55 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -146,8 +146,10 @@ def test_pivot_table_nocols(self): df = DataFrame( {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} ) - rs = df.pivot_table(columns="cols", aggfunc=np.sum) - xp = df.pivot_table(index="cols", aggfunc=np.sum).T + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = df.pivot_table(columns="cols", aggfunc=np.sum) + xp = df.pivot_table(index="cols", aggfunc=np.sum).T tm.assert_frame_equal(rs, xp) rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"}) @@ -903,12 +905,19 @@ def test_no_col(self): # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + table = self.data.pivot_table( + index=["AA", "BB"], margins=True, aggfunc=np.mean + ) for value_col in table.columns: totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") + with 
tm.assert_produces_warning(FutureWarning, match=msg): + table = self.data.pivot_table( + index=["AA", "BB"], margins=True, aggfunc="mean" + ) for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] assert totals == self.data[item].mean() @@ -964,7 +973,9 @@ def test_margin_with_only_columns_defined( } ) - result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns) tm.assert_frame_equal(result, expected) @@ -1990,8 +2001,11 @@ def test_pivot_string_as_func(self): def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes - result = pivot_table(self.data, index="A", columns="B", aggfunc=f) - expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) + + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pivot_table(self.data, index="A", columns="B", aggfunc=f) + expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) tm.assert_frame_equal(result, expected) @pytest.mark.slow From ec2d4af00da57d54ce5fe6d63cbbbecd274f42e4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 18 May 2022 06:21:24 -0700 Subject: [PATCH 06/11] DOC: Clarify decay argument validation in ewm when times is provided (#47026) --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/window/ewm.py | 15 +++++++-------- pandas/tests/window/conftest.py | 2 +- pandas/tests/window/test_ewm.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index af30add139222..9e3a2b608855a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -150,6 +150,7 @@ Other enhancements - Added 
``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) +- ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 32cb4938344c4..922d194f04c55 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -134,8 +134,9 @@ class ExponentialMovingWindow(BaseWindow): r""" Provide exponentially weighted (EW) calculations. - Exactly one parameter: ``com``, ``span``, ``halflife``, or ``alpha`` must be - provided. + Exactly one of ``com``, ``span``, ``halflife``, or ``alpha`` must be + provided if ``times`` is not provided. If ``times`` is provided, + ``halflife`` and one of ``com``, ``span`` or ``alpha`` may be provided. Parameters ---------- @@ -155,7 +156,7 @@ class ExponentialMovingWindow(BaseWindow): :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for :math:`halflife > 0`. - If ``times`` is specified, the time unit (str or timedelta) over which an + If ``times`` is specified, a timedelta convertible unit over which an observation decays to half its value. Only applicable to ``mean()``, and halflife value will not apply to the other functions. 
@@ -389,10 +390,8 @@ def __init__( raise ValueError("times must be datetime64[ns] dtype.") if len(self.times) != len(obj): raise ValueError("times must be the same length as the object.") - if not isinstance(self.halflife, (str, datetime.timedelta)): - raise ValueError( - "halflife must be a string or datetime.timedelta object" - ) + if not isinstance(self.halflife, (str, datetime.timedelta, np.timedelta64)): + raise ValueError("halflife must be a timedelta convertible object") if isna(self.times).any(): raise ValueError("Cannot convert NaT values to integer") self._deltas = _calculate_deltas(self.times, self.halflife) @@ -404,7 +403,7 @@ def __init__( self._com = 1.0 else: if self.halflife is not None and isinstance( - self.halflife, (str, datetime.timedelta) + self.halflife, (str, datetime.timedelta, np.timedelta64) ): raise ValueError( "halflife can only be a timedelta convertible argument if " diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index f42a1a5449c5c..8977d1a0d9d1b 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -102,7 +102,7 @@ def engine_and_raw(request): return request.param -@pytest.fixture(params=["1 day", timedelta(days=1)]) +@pytest.fixture(params=["1 day", timedelta(days=1), np.timedelta64(1, "D")]) def halflife_with_times(request): """Halflife argument for EWM when times is specified.""" return request.param diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index b1e8b43258750..66cd36d121750 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -90,7 +90,7 @@ def test_ewma_times_not_same_length(): def test_ewma_halflife_not_correct_type(): - msg = "halflife must be a string or datetime.timedelta object" + msg = "halflife must be a timedelta convertible object" with pytest.raises(ValueError, match=msg): Series(range(5)).ewm(halflife=1, times=np.arange(5).astype("datetime64[ns]")) From 
e51455a09f4bbe849933255bbe8c811ed32e4809 Mon Sep 17 00:00:00 2001 From: Shuangchi He <34329208+Yulv-git@users.noreply.github.com> Date: Wed, 18 May 2022 21:22:07 +0800 Subject: [PATCH 07/11] DOC: Fix some typos in pandas/. (#47022) --- pandas/_libs/src/ujson/lib/ultrajson.h | 2 +- pandas/_libs/src/ujson/lib/ultrajsondec.c | 2 +- pandas/_libs/src/ujson/lib/ultrajsonenc.c | 2 +- pandas/_libs/src/ujson/python/JSONtoObj.c | 2 +- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- pandas/_libs/src/ujson/python/ujson.c | 2 +- pandas/_libs/src/ujson/python/version.h | 2 +- pandas/core/array_algos/putmask.py | 2 +- pandas/core/indexes/multi.py | 2 +- pandas/core/internals/managers.py | 2 +- pandas/tests/indexes/multi/conftest.py | 2 +- pandas/tests/indexing/test_iloc.py | 2 +- pandas/tests/io/formats/style/test_matplotlib.py | 4 ++-- 13 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 5b5995a671b2c..71df0c5a186b7 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -29,7 +29,7 @@ Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from from TCL library +Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. 
diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index fee552672b8b6..c7779b8b428ae 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -32,7 +32,7 @@ Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from from TCL library +Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 4469631b7b3f7..5d90710441a94 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -32,7 +32,7 @@ Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from from TCL library +Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index 14683f4c28cbe..c58f25b8f99ea 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -29,7 +29,7 @@ Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. 
-Numeric decoder derived from from TCL library +Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 7de47749e500c..73d2a1f786f8b 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -30,7 +30,7 @@ Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from from TCL library +Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. diff --git a/pandas/_libs/src/ujson/python/ujson.c b/pandas/_libs/src/ujson/python/ujson.c index a8fdb4f55bfca..def06cdf2db84 100644 --- a/pandas/_libs/src/ujson/python/ujson.c +++ b/pandas/_libs/src/ujson/python/ujson.c @@ -29,7 +29,7 @@ Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from from TCL library +Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. 
diff --git a/pandas/_libs/src/ujson/python/version.h b/pandas/_libs/src/ujson/python/version.h index 3f38642b6df87..15c55309d6270 100644 --- a/pandas/_libs/src/ujson/python/version.h +++ b/pandas/_libs/src/ujson/python/version.h @@ -29,7 +29,7 @@ Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from from TCL library +Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index d2e6b6e935ed5..84160344437b5 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -1,5 +1,5 @@ """ -EA-compatible analogue to to np.putmask +EA-compatible analogue to np.putmask """ from __future__ import annotations diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fc89a03a2fbda..fb54ca749c25b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -721,7 +721,7 @@ def _values(self) -> np.ndarray: if isinstance(vals, ABCDatetimeIndex): # TODO: this can be removed after Timestamp.freq is removed # The astype(object) below does not remove the freq from - # the underlying Timestamps so we remove it here to to match + # the underlying Timestamps so we remove it here to match # the behavior of self._get_level_values vals = vals.copy() vals.freq = None diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ded525cd099fc..2e638f5b0fb3d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1892,7 +1892,7 @@ def create_block_manager_from_blocks( # If verify_integrity=False, then caller is responsible for checking # 
all(x.shape[-1] == len(axes[1]) for x in blocks) # sum(x.shape[0] for x in blocks) == len(axes[0]) - # set(x for for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0]))) + # set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0]))) # all(blk.ndim == 2 for blk in blocks) # This allows us to safely pass verify_integrity=False diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py index 9d0a2fa81b53b..3cc4fa4713831 100644 --- a/pandas/tests/indexes/multi/conftest.py +++ b/pandas/tests/indexes/multi/conftest.py @@ -8,7 +8,7 @@ ) -# Note: identical the the "multi" entry in the top-level "index" fixture +# Note: identical to the "multi" entry in the top-level "index" fixture @pytest.fixture def idx(): # a MultiIndex used to test the general functionality of the diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 426192ab46914..19ea6753c616c 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -1180,7 +1180,7 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr = interval_range(1, 10.0)._values df = DataFrame(arr) - # ser should be a *view* on the the DataFrame data + # ser should be a *view* on the DataFrame data ser = df.iloc[2] # if we have a view, then changing arr[2] should also change ser[0] diff --git a/pandas/tests/io/formats/style/test_matplotlib.py b/pandas/tests/io/formats/style/test_matplotlib.py index a350b6fe7546d..8d9f075d8674d 100644 --- a/pandas/tests/io/formats/style/test_matplotlib.py +++ b/pandas/tests/io/formats/style/test_matplotlib.py @@ -216,7 +216,7 @@ def test_background_gradient_gmap_array_raises(gmap, axis): ], ) def test_background_gradient_gmap_dataframe_align(styler_blank, gmap, subset, exp_gmap): - # test gmap given as DataFrame that it aligns to the the data including subset + # test gmap given as DataFrame that it aligns to the data including subset expected = 
styler_blank.background_gradient(axis=None, gmap=exp_gmap, subset=subset) result = styler_blank.background_gradient(axis=None, gmap=gmap, subset=subset) assert expected._compute().ctx == result._compute().ctx @@ -232,7 +232,7 @@ def test_background_gradient_gmap_dataframe_align(styler_blank, gmap, subset, ex ], ) def test_background_gradient_gmap_series_align(styler_blank, gmap, axis, exp_gmap): - # test gmap given as Series that it aligns to the the data including subset + # test gmap given as Series that it aligns to the data including subset expected = styler_blank.background_gradient(axis=None, gmap=exp_gmap)._compute() result = styler_blank.background_gradient(axis=axis, gmap=gmap)._compute() assert expected.ctx == result.ctx From 64f8fe03167d3601ba4270b433d6801a3f5dc6c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 18 May 2022 12:31:29 -0400 Subject: [PATCH 08/11] remove two more casts --- pandas/core/groupby/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2acf5c826eb57..090554f2eafe5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1580,7 +1580,7 @@ def idxmax( # DataFrame.idxmax for backwards compatibility numeric_only_arg = None if axis == 0 else False else: - numeric_only_arg = cast(bool, numeric_only) + numeric_only_arg = numeric_only def func(df): res = df._reduce( @@ -1616,7 +1616,7 @@ def idxmin( # DataFrame.idxmin for backwards compatibility numeric_only_arg = None if axis == 0 else False else: - numeric_only_arg = cast(bool, numeric_only) + numeric_only_arg = numeric_only def func(df): res = df._reduce( From 8a7ec8772003e5f9710cbed472d92ca9a3bd22b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Fri, 20 May 2022 08:39:28 -0400 Subject: [PATCH 09/11] avoid cast-like annotation --- pandas/core/arrays/interval.py | 6 ++---- 1 file changed, 2 insertions(+), 4 
deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 0979e97b84a99..4c81fe8b61a1f 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -956,9 +956,7 @@ def _concat_same_type( ------- IntervalArray """ - closed_set: set[IntervalClosedType] = { - interval.closed for interval in to_concat - } + closed_set = {interval.closed for interval in to_concat} if len(closed_set) != 1: raise ValueError("Intervals must all be closed on the same side.") closed = closed_set.pop() @@ -1330,7 +1328,7 @@ def overlaps(self, other): # --------------------------------------------------------------------- @property - def closed(self): + def closed(self) -> IntervalClosedType: """ Whether the intervals are closed on the left-side, right-side, both or neither. From 8bce6b0374d048caafcd8a09b0bf19d5898c0eab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 24 May 2022 14:36:53 -0400 Subject: [PATCH 10/11] left/right --- pandas/_typing.py | 3 ++- pandas/core/indexes/datetimes.py | 3 ++- pandas/core/reshape/tile.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index e71859e91785e..4f4adc1e74576 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -313,7 +313,8 @@ def closed(self) -> bool: XMLParsers = Literal["lxml", "etree"] # Interval closed type -IntervalClosedType = Literal["left", "right", "both", "neither"] +IntervalLeftRight = Literal["left", "right"] +IntervalClosedType = IntervalLeftRight | Literal["both", "neither"] # datetime and NaTType DatetimeNaTType = Union[datetime, "NaTType"] diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d74aa55c340e4..e7b810dacdf57 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -34,6 +34,7 @@ Dtype, DtypeObj, IntervalClosedType, + IntervalLeftRight, npt, ) from pandas.util._decorators import ( @@ -1087,7 +1088,7 
@@ def bdate_range( name: Hashable = None, weekmask=None, holidays=None, - closed: Literal["left", "right"] | lib.NoDefault | None = lib.no_default, + closed: IntervalLeftRight | lib.NoDefault | None = lib.no_default, inclusive: IntervalClosedType | None = None, **kwargs, ) -> DatetimeIndex: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index ae948eda93e7c..94705790e40bd 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -16,6 +16,7 @@ Timestamp, ) from pandas._libs.lib import infer_dtype +from pandas._typing import IntervalLeftRight from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -560,7 +561,7 @@ def _format_labels( bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None ): """based on the dtype, return our labels""" - closed: Literal["right", "left"] = "right" if right else "left" + closed: IntervalLeftRight = "right" if right else "left" formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta] From 71d5b12244e8d36e5e3e013105e4882d72247d4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 24 May 2022 14:45:20 -0400 Subject: [PATCH 11/11] cannot use | --- pandas/_typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 4f4adc1e74576..a85820a403fde 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -314,7 +314,7 @@ def closed(self) -> bool: # Interval closed type IntervalLeftRight = Literal["left", "right"] -IntervalClosedType = IntervalLeftRight | Literal["both", "neither"] +IntervalClosedType = Union[IntervalLeftRight, Literal["both", "neither"]] # datetime and NaTType DatetimeNaTType = Union[datetime, "NaTType"]