From 65fe4e460b607c41d62408fb664d35e21ca4f52b Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Oct 2020 10:29:06 -0700 Subject: [PATCH 01/11] REF: avoid special case in DTA/TDA.median, flesh out tests --- pandas/core/arrays/datetimelike.py | 21 +++++----- pandas/core/nanops.py | 7 +++- pandas/tests/arrays/test_datetimelike.py | 52 +++++++++++++++++++++++- pandas/tests/arrays/test_datetimes.py | 2 +- 4 files changed, 67 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4523ea1030ef1..b5f71e5bbe853 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1359,21 +1359,20 @@ def median(self, axis: Optional[int] = None, skipna: bool = True, *args, **kwarg if axis is not None and abs(axis) >= self.ndim: raise ValueError("abs(axis) must be less than ndim") - if self.size == 0: - if self.ndim == 1 or axis is None: - return NaT - shape = list(self.shape) - del shape[axis] - shape = [1 if x == 0 else x for x in shape] - result = np.empty(shape, dtype="i8") - result.fill(iNaT) + if is_period_dtype(self.dtype): + # pass datetime64 values to nanops to get correct NaT semantics + result = nanops.nanmedian( + self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna + ) + result = result.view("i8") + if axis is None or self.ndim == 1: + return self._box_func(result) return self._from_backing_data(result) - mask = self.isna() - result = nanops.nanmedian(self.asi8, axis=axis, skipna=skipna, mask=mask) + result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) if axis is None or self.ndim == 1: return self._box_func(result) - return self._from_backing_data(result.astype("i8")) + return self._from_backing_data(result) class DatelikeOps(DatetimeLikeArrayMixin): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index c7b6e132f9a74..0a4658fceba4a 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -339,7 +339,12 @@ def _wrap_results(result, dtype: DtypeObj, fill_value=None): assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan - result = Timestamp(result, tz=tz) + if tz is not None: + result = Timestamp(result, tz=tz) + elif isna(result): + result = np.datetime64("NaT", "ns") + else: + result = np.int64(result).view("datetime64[ns]") else: # If we have float dtype, taking a view will give the wrong result result = result.astype(dtype) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 463196eaa36bf..f621479e4f311 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -4,7 +4,7 @@ import pytest import pytz -from pandas._libs import OutOfBoundsDatetime, Timestamp +from pandas._libs import NaT, OutOfBoundsDatetime, Timestamp from pandas.compat.numpy import np_version_under1p18 import pandas as pd @@ -456,6 +456,54 @@ def test_shift_fill_int_deprecated(self): expected[1:] = arr[:-1] tm.assert_equal(result, expected) + def test_median(self, arr1d): + arr = arr1d + if len(arr) % 2 == 0: + # make it easier to define `expected` + arr = arr[:-1] + + expected = arr[len(arr) // 2] + + result = arr.median() + assert type(result) is type(expected) + assert result == expected + + arr[len(arr) // 2] = NaT + if not isinstance(expected, Period): + expected = arr[len(arr) // 2 - 1 : len(arr) // 2 + 2].mean() + + assert arr.median(skipna=False) is NaT + + result = arr.median() + assert type(result) is type(expected) + assert result == expected + + assert arr[:0].median() is NaT + assert arr[:0].median(skipna=False) is NaT + + # 2d Case + arr2 = arr.reshape(-1, 1) + + result = arr2.median(axis=None) + assert type(result) is type(expected) + assert result == expected + + assert arr2.median(axis=None, skipna=False) is NaT + + result = arr2.median(axis=0) + expected2 = type(arr)._from_sequence([expected], dtype=arr.dtype) + tm.assert_equal(result, expected2) + + result = arr2.median(axis=0, skipna=False) + expected2 = type(arr)._from_sequence([NaT], dtype=arr.dtype) + tm.assert_equal(result, expected2) + + result = arr2.median(axis=1) + tm.assert_equal(result, arr) + + result = arr2.median(axis=1, skipna=False) + tm.assert_equal(result, arr) + class TestDatetimeArray(SharedTests): index_cls = pd.DatetimeIndex @@ -465,7 +513,7 @@ class TestDatetimeArray(SharedTests): @pytest.fixture def arr1d(self, tz_naive_fixture, freqstr): tz = tz_naive_fixture - dti = pd.date_range("2016-01-01 01:01:00", periods=3, freq=freqstr, tz=tz) + dti = pd.date_range("2016-01-01 01:01:00", periods=5, freq=freqstr, tz=tz) dta = dti._data return dta diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 78721fc2fe1c1..e7ac32e7c9ccc 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -514,7 +514,7 @@ def test_median_empty(self, skipna, tz): tm.assert_equal(result, expected) result = arr.median(axis=1, skipna=skipna) - expected = type(arr)._from_sequence([pd.NaT], dtype=arr.dtype) + expected = type(arr)._from_sequence([], dtype=arr.dtype) tm.assert_equal(result, expected) def test_median(self, arr1d): From c52b0bce6f4f38aa1e1eccd1d6142ee1cb8f526b Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Oct 2020 13:53:05 -0700 Subject: [PATCH 02/11] suppress warning --- pandas/core/nanops.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 0a4658fceba4a..aee5e5262073e 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -2,6 +2,7 @@ import itertools import operator from typing import Any, Optional, Tuple, Union, cast +import warnings import numpy as np @@ -643,7 +644,11 @@ def get_median(x): mask = notna(x) if not skipna and not mask.all(): return np.nan - return np.nanmedian(x[mask]) + with warnings.catch_warnings(): + # Suppress RuntimeWarning about All-NaN slice + warnings.filterwarnings("ignore", "All-NaN slice encountered") + res = np.nanmedian(x[mask]) + return res values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask) if not is_float_dtype(values.dtype): @@ -671,7 +676,11 @@ def get_median(x): ) # fastpath for the skipna case - return _wrap_results(np.nanmedian(values, axis), dtype) + with warnings.catch_warnings(): + # Suppress RuntimeWarning about All-NaN slice + warnings.filterwarnings("ignore", "All-NaN slice encountered") + res = np.nanmedian(values, axis) + return _wrap_results(res, dtype) # must return the correct shape, but median is not defined for the # empty set so return nans of shape "everything but the passed axis" From 434692ac4e51322f179d1d9398c835a71e23cc13 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 24 Oct 2020 12:59:31 -0700 Subject: [PATCH 03/11] checkpoint tests passing --- pandas/compat/numpy/function.py | 4 +-- pandas/core/arrays/datetimelike.py | 50 ++++++++++++++++------------ pandas/core/nanops.py | 17 +++++++++- pandas/tests/frame/test_analytics.py | 25 ++++++++++++++ 4 files changed, 71 insertions(+), 25 deletions(-) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index c074b06042e26..c2e91c7877d35 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -387,7 +387,7 @@ def validate_resampler_func(method: str, args, kwargs) -> None: raise TypeError("too many arguments passed in") -def validate_minmax_axis(axis: Optional[int]) -> None: +def validate_minmax_axis(axis: Optional[int], ndim: int = 1) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero or None, as otherwise it will be incorrectly ignored. @@ -395,12 +395,12 @@ def validate_minmax_axis(axis: Optional[int]) -> None: Parameters ---------- axis : int or None + ndim : int, default 1 Raises ------ ValueError """ - ndim = 1 # hard-coded for Index if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b5f71e5bbe853..25e57866480d8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1264,13 +1264,21 @@ def min(self, axis=None, skipna=True, *args, **kwargs): Series.min : Return the minimum value in a Series. """ nv.validate_min(args, kwargs) - nv.validate_minmax_axis(axis) + nv.validate_minmax_axis(axis, self.ndim) - result = nanops.nanmin(self.asi8, skipna=skipna, mask=self.isna()) - if isna(result): - # Period._from_ordinal does not handle np.nan gracefully - return NaT - return self._box_func(result) + # View as M8[ns] to get correct NaT/masking semantics for PeriodDtype + result = nanops.nanmin( + self._ndarray.view("M8[ns]"), skipna=skipna, mask=self.isna() + ) + if lib.is_scalar(result): + if isna(result): + # Period._from_ordinal does not handle NaT gracefully + return NaT + # nanops may unwantedly cast to Timestamp + result = getattr(result, "value", result) + return self._box_func(result) + result = result.astype("i8", copy=False) + return self._from_backing_data(result) def max(self, axis=None, skipna=True, *args, **kwargs): """ @@ -1286,23 +1294,21 @@ def max(self, axis=None, skipna=True, *args, **kwargs): # TODO: skipna is broken with max. # See https://github.com/pandas-dev/pandas/issues/24265 nv.validate_max(args, kwargs) - nv.validate_minmax_axis(axis) + nv.validate_minmax_axis(axis, self.ndim) - mask = self.isna() - if skipna: - values = self[~mask].asi8 - elif mask.any(): - return NaT - else: - values = self.asi8 - - if not len(values): - # short-circuit for empty max / min - return NaT - - result = nanops.nanmax(values, skipna=skipna) - # Don't have to worry about NA `result`, since no NA went in. - return self._box_func(result) + # View as M8[ns] to get correct NaT/masking semantics for PeriodDtype + result = nanops.nanmax( + self._ndarray.view("M8[ns]"), skipna=skipna, mask=self.isna() + ) + if lib.is_scalar(result): + if isna(result): + # Period._from_ordinal does not handle NaT gracefully + return NaT + # nanops may unwantedly cast to Timestamp + result = getattr(result, "value", result) + return self._box_func(result) + result = result.astype("i8", copy=False) + return self._from_backing_data(result) def mean(self, skipna=True): """ diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index aee5e5262073e..5bfa560040ac7 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -927,10 +927,15 @@ def reduction( mask: Optional[np.ndarray] = None, ) -> Dtype: + orig_values = values values, mask, dtype, dtype_max, fill_value = _get_values( values, skipna, fill_value_typ=fill_value_typ, mask=mask ) + datetimelike = orig_values.dtype.kind in ["m", "M"] + if datetimelike and mask is None: + mask = isna(orig_values) + if (axis is not None and values.shape[axis] == 0) or values.size == 0: try: result = getattr(values, meth)(axis, dtype=dtype_max) @@ -941,7 +946,17 @@ def reduction( result = getattr(values, meth)(axis) result = _wrap_results(result, dtype, fill_value) - return _maybe_null_out(result, axis, mask, values.shape) + result = _maybe_null_out(result, axis, mask, values.shape) + + if datetimelike and not skipna: + if axis is None or values.ndim == 1: + if mask.any(): + return orig_values.dtype.type("NaT") + else: + axis_mask = mask.any(axis=axis) + result[axis_mask] = orig_values.dtype.type("NaT") + + return result return reduction diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index ee4da37ce10f3..8e49037ae8b25 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1169,6 +1169,31 @@ def test_min_max_dt64_with_NaT(self): exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture): + # GH#36907 + tz = tz_naive_fixture + df = pd.DataFrame( + { + "a": [ + pd.Timestamp("2020-01-01 08:00:00", tz=tz), + pd.Timestamp("1920-02-01 09:00:00", tz=tz), + ], + "b": [pd.Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT], + } + ) + + res = df.min(axis=1, skipna=False) + expected = pd.Series([df.loc[0, "a"], pd.NaT]) + assert expected.dtype == df["a"].dtype + + tm.assert_series_equal(res, expected) + + res = df.max(axis=1, skipna=False) + expected = pd.Series([df.loc[0, "b"], pd.NaT]) + assert expected.dtype == df["a"].dtype + + tm.assert_series_equal(res, expected) + def test_min_max_dt64_api_consistency_with_NaT(self): # Calling the following sum functions returned an error for dataframes but # returned NaT for series. These tests check that the API is consistent in From bdad92ce718542b01b0fc7a93f8a1afd53fed210 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 25 Oct 2020 12:41:31 -0700 Subject: [PATCH 04/11] checkpoint tests passing --- pandas/core/arrays/datetimelike.py | 34 ++++++++++++------------------ pandas/core/nanops.py | 1 + 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 25e57866480d8..437faedd53663 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1266,18 +1266,15 @@ def min(self, axis=None, skipna=True, *args, **kwargs): nv.validate_min(args, kwargs) nv.validate_minmax_axis(axis, self.ndim) - # View as M8[ns] to get correct NaT/masking semantics for PeriodDtype - result = nanops.nanmin( - self._ndarray.view("M8[ns]"), skipna=skipna, mask=self.isna() - ) + if is_period_dtype(self.dtype): + result = self.to_timestamp("S").min(axis=axis, skipna=skipna) + if result is not NaT: + result = result.to_period(self.freq) + return result + + result = nanops.nanmin(self._ndarray, skipna=skipna, mask=self.isna()) if lib.is_scalar(result): - if isna(result): - # Period._from_ordinal does not handle NaT gracefully - return NaT - # nanops may unwantedly cast to Timestamp - result = getattr(result, "value", result) return self._box_func(result) - result = result.astype("i8", copy=False) return self._from_backing_data(result) def max(self, axis=None, skipna=True, *args, **kwargs): @@ -1296,18 +1293,15 @@ def max(self, axis=None, skipna=True, *args, **kwargs): nv.validate_max(args, kwargs) nv.validate_minmax_axis(axis, self.ndim) - # View as M8[ns] to get correct NaT/masking semantics for PeriodDtype - result = nanops.nanmax( - self._ndarray.view("M8[ns]"), skipna=skipna, mask=self.isna() - ) + if is_period_dtype(self.dtype): + result = self.to_timestamp("S").max(axis=axis, skipna=skipna) + if result is not NaT: + result = result.to_period(self.freq) + return result + + result = nanops.nanmax(self._ndarray, skipna=skipna, mask=self.isna()) if lib.is_scalar(result): - if isna(result): - # Period._from_ordinal does not handle NaT gracefully - return NaT - # nanops may unwantedly cast to Timestamp - result = getattr(result, "value", result) return self._box_func(result) - result = result.astype("i8", copy=False) return self._from_backing_data(result) def mean(self, skipna=True): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 5bfa560040ac7..0d32911e9b19c 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -341,6 +341,7 @@ def _wrap_results(result, dtype: DtypeObj, fill_value=None): if result == fill_value: result = np.nan if tz is not None: + # we get here e.g. via nanmean when we call it on a DTA[tz] result = Timestamp(result, tz=tz) elif isna(result): result = np.datetime64("NaT", "ns") From 5bbc5d1a40f07acba5bae4f7777c4e67b0a7d066 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Oct 2020 08:50:19 -0700 Subject: [PATCH 05/11] follow-up clenaup --- pandas/core/nanops.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 0d32911e9b19c..97ad92f6adcdb 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -340,6 +340,7 @@ def _wrap_results(result, dtype: DtypeObj, fill_value=None): assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan + if tz is not None: # we get here e.g. via nanmean when we call it on a DTA[tz] result = Timestamp(result, tz=tz) @@ -934,8 +935,6 @@ def reduction( ) datetimelike = orig_values.dtype.kind in ["m", "M"] - if datetimelike and mask is None: - mask = isna(orig_values) if (axis is not None and values.shape[axis] == 0) or values.size == 0: try: @@ -950,12 +949,7 @@ def reduction( result = _maybe_null_out(result, axis, mask, values.shape) if datetimelike and not skipna: - if axis is None or values.ndim == 1: - if mask.any(): - return orig_values.dtype.type("NaT") - else: - axis_mask = mask.any(axis=axis) - result[axis_mask] = orig_values.dtype.type("NaT") + result = _mask_datetimelike_result(result, axis, mask, orig_values) return result From a69f762d664c11be2ec75b108968e431753b263e Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Oct 2020 10:30:31 -0700 Subject: [PATCH 06/11] cln: no need to pass mask --- pandas/core/arrays/datetimelike.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 437faedd53663..d7ace48136834 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1272,7 +1272,7 @@ def min(self, axis=None, skipna=True, *args, **kwargs): result = result.to_period(self.freq) return result - result = nanops.nanmin(self._ndarray, skipna=skipna, mask=self.isna()) + result = nanops.nanmin(self._ndarray, skipna=skipna) if lib.is_scalar(result): return self._box_func(result) return self._from_backing_data(result) @@ -1299,7 +1299,7 @@ def max(self, axis=None, skipna=True, *args, **kwargs): result = result.to_period(self.freq) return result - result = nanops.nanmax(self._ndarray, skipna=skipna, mask=self.isna()) + result = nanops.nanmax(self._ndarray, skipna=skipna) if lib.is_scalar(result): return self._box_func(result) return self._from_backing_data(result) From 121108e6bf8fb0224c1cdd0af7251601a66fe01a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Oct 2020 10:50:26 -0700 Subject: [PATCH 07/11] unnecessary mask arg --- pandas/core/arrays/datetimelike.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index d7ace48136834..65084f0b5f66e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1272,7 +1272,7 @@ def min(self, axis=None, skipna=True, *args, **kwargs): result = result.to_period(self.freq) return result - result = nanops.nanmin(self._ndarray, skipna=skipna) + result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) if lib.is_scalar(result): return self._box_func(result) return self._from_backing_data(result) @@ -1299,7 +1299,7 @@ def max(self, axis=None, skipna=True, *args, **kwargs): result = result.to_period(self.freq) return result - result = nanops.nanmax(self._ndarray, skipna=skipna) + result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) if lib.is_scalar(result): return self._box_func(result) return self._from_backing_data(result) From d3898841a4c231633617b4d4a246e7aef5e6af29 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Oct 2020 11:16:01 -0700 Subject: [PATCH 08/11] lint fixup --- pandas/tests/frame/test_analytics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 8e49037ae8b25..f2847315f4959 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1172,24 +1172,24 @@ def test_min_max_dt64_with_NaT(self): def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture): # GH#36907 tz = tz_naive_fixture - df = pd.DataFrame( + df = DataFrame( { "a": [ - pd.Timestamp("2020-01-01 08:00:00", tz=tz), - pd.Timestamp("1920-02-01 09:00:00", tz=tz), + Timestamp("2020-01-01 08:00:00", tz=tz), + Timestamp("1920-02-01 09:00:00", tz=tz), ], - "b": [pd.Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT], + "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT], } ) res = df.min(axis=1, skipna=False) - expected = pd.Series([df.loc[0, "a"], pd.NaT]) + expected = Series([df.loc[0, "a"], pd.NaT]) assert expected.dtype == df["a"].dtype tm.assert_series_equal(res, expected) res = df.max(axis=1, skipna=False) - expected = pd.Series([df.loc[0, "b"], pd.NaT]) + expected = Series([df.loc[0, "b"], pd.NaT]) assert expected.dtype == df["a"].dtype tm.assert_series_equal(res, expected) From 39afdfdb6ef5f8db042cb0a90217464aa64b1a08 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 26 Oct 2020 13:13:48 -0700 Subject: [PATCH 09/11] REF: avoid special-casing inside DTA/TDA.mean (#37422) --- pandas/core/arrays/datetimelike.py | 24 ++++-------- pandas/core/nanops.py | 3 +- pandas/tests/arrays/test_datetimes.py | 52 ++++++++++++++++++++++++++ pandas/tests/arrays/test_timedeltas.py | 36 +++++++++++++++++- 4 files changed, 97 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 65084f0b5f66e..7db018a825566 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1304,7 +1304,7 @@ def max(self, axis=None, skipna=True, *args, **kwargs): return self._box_func(result) return self._from_backing_data(result) - def mean(self, skipna=True): + def mean(self, skipna=True, axis: Optional[int] = 0): """ Return the mean value of the Array. @@ -1314,6 +1314,7 @@ def mean(self, skipna=True): ---------- skipna : bool, default True Whether to ignore any NaT elements. + axis : int, optional, default 0 Returns ------- @@ -1337,21 +1338,12 @@ def mean(self, skipna=True): "obj.to_timestamp(how='start').mean()" ) - mask = self.isna() - if skipna: - values = self[~mask] - elif mask.any(): - return NaT - else: - values = self - - if not len(values): - # short-circuit for empty max / min - return NaT - - result = nanops.nanmean(values.view("i8"), skipna=skipna) - # Don't have to worry about NA `result`, since no NA went in. - return self._box_func(result) + result = nanops.nanmean( + self._ndarray, axis=axis, skipna=skipna, mask=self.isna() + ) + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) def median(self, axis: Optional[int] = None, skipna: bool = True, *args, **kwargs): nv.validate_median(args, kwargs) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 97ad92f6adcdb..4da7ce0cb7b7c 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -394,8 +394,9 @@ def _na_for_min_count( if values.ndim == 1: return fill_value + elif axis is None: + return fill_value else: - assert axis is not None # assertion to make mypy happy result_shape = values.shape[:axis] + values.shape[axis + 1 :] result = np.full(result_shape, fill_value, dtype=values.dtype) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index e7ac32e7c9ccc..66a92dd6f1cff 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -9,6 +9,7 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd +from pandas import NaT import pandas._testing as tm from pandas.core.arrays import DatetimeArray from pandas.core.arrays.datetimes import sequence_to_dt64ns @@ -566,3 +567,54 @@ def test_median_2d(self, arr1d): result = arr.median(axis=1, skipna=False) expected = type(arr)._from_sequence([pd.NaT], dtype=arr.dtype) tm.assert_equal(result, expected) + + def test_mean(self, arr1d): + arr = arr1d + + # manually verified result + expected = arr[0] + 0.4 * pd.Timedelta(days=1) + + result = arr.mean() + assert result == expected + result = arr.mean(skipna=False) + assert result is pd.NaT + + result = arr.dropna().mean(skipna=False) + assert result == expected + + result = arr.mean(axis=0) + assert result == expected + + def test_mean_2d(self): + dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific") + dta = dti._data.reshape(3, 2) + + result = dta.mean(axis=0) + expected = dta[1] + tm.assert_datetime_array_equal(result, expected) + + result = dta.mean(axis=1) + expected = dta[:, 0] + pd.Timedelta(hours=12) + tm.assert_datetime_array_equal(result, expected) + + result = dta.mean(axis=None) + expected = dti.mean() + assert result == expected + + @pytest.mark.parametrize("skipna", [True, False]) + def test_mean_empty(self, arr1d, skipna): + arr = arr1d[:0] + + assert arr.mean(skipna=skipna) is NaT + + arr2d = arr.reshape(0, 3) + result = arr2d.mean(axis=0, skipna=skipna) + expected = DatetimeArray._from_sequence([NaT, NaT, NaT], dtype=arr.dtype) + tm.assert_datetime_array_equal(result, expected) + + result = arr2d.mean(axis=1, skipna=skipna) + expected = arr # i.e. 1D, empty + tm.assert_datetime_array_equal(result, expected) + + result = arr2d.mean(axis=None, skipna=skipna) + assert result is NaT diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index a5a74b16ed1cd..95265a958c35d 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -177,7 +177,7 @@ def test_neg_freq(self): class TestReductions: - @pytest.mark.parametrize("name", ["std", "min", "max", "median"]) + @pytest.mark.parametrize("name", ["std", "min", "max", "median", "mean"]) @pytest.mark.parametrize("skipna", [True, False]) def test_reductions_empty(self, name, skipna): tdi = pd.TimedeltaIndex([]) @@ -334,3 +334,37 @@ def test_median(self): result = tdi.median(skipna=False) assert result is pd.NaT + + def test_mean(self): + tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"]) + arr = tdi._data + + # manually verified result + expected = pd.Timedelta(arr.dropna()._ndarray.mean()) + + result = arr.mean() + assert result == expected + result = arr.mean(skipna=False) + assert result is pd.NaT + + result = arr.dropna().mean(skipna=False) + assert result == expected + + result = arr.mean(axis=0) + assert result == expected + + def test_mean_2d(self): + tdi = pd.timedelta_range("14 days", periods=6) + tda = tdi._data.reshape(3, 2) + + result = tda.mean(axis=0) + expected = tda[1] + tm.assert_timedelta_array_equal(result, expected) + + result = tda.mean(axis=1) + expected = tda[:, 0] + pd.Timedelta(hours=12) + tm.assert_timedelta_array_equal(result, expected) + + result = tda.mean(axis=None) + expected = tdi.mean() + assert result == expected From 857572d69e421f94b42722749e0693811206bcb5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 26 Oct 2020 15:05:20 -0700 Subject: [PATCH 10/11] TST/REF: misplaced tests in frame.test_dtypes (#37424) --- .../tests/frame/{ => methods}/test_dtypes.py | 104 +----------------- .../methods/test_is_homogeneous_dtype.py | 49 +++++++++ pandas/tests/frame/test_constructors.py | 25 +++++ pandas/tests/frame/test_npfuncs.py | 16 +++ pandas/tests/indexing/test_iloc.py | 9 ++ pandas/tests/indexing/test_loc.py | 13 +++ 6 files changed, 113 insertions(+), 103 deletions(-) rename pandas/tests/frame/{ => methods}/test_dtypes.py (53%) create mode 100644 pandas/tests/frame/methods/test_is_homogeneous_dtype.py create mode 100644 pandas/tests/frame/test_npfuncs.py diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py similarity index 53% rename from pandas/tests/frame/test_dtypes.py rename to pandas/tests/frame/methods/test_dtypes.py index 1add4c0db2e53..0105eef435121 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -1,7 +1,6 @@ from datetime import timedelta import numpy as np -import pytest from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -89,16 +88,7 @@ def test_dtypes_gh8722(self, float_string_frame): result = df.dtypes tm.assert_series_equal(result, Series({0: np.dtype("int64")})) - def test_singlerow_slice_categoricaldtype_gives_series(self): - # GH29521 - df = DataFrame({"x": pd.Categorical("a b c d e".split())}) - result = df.iloc[0] - raw_cat = pd.Categorical(["a"], categories=["a", "b", "c", "d", "e"]) - expected = Series(raw_cat, index=["x"], name=0, dtype="category") - - tm.assert_series_equal(result, expected) - - def test_timedeltas(self): + def test_dtypes_timedeltas(self): df = DataFrame( dict( A=Series(date_range("2012-1-1", periods=3, freq="D")), @@ -136,95 +126,3 @@ def test_timedeltas(self): index=list("ABCD"), ) tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "input_vals", - [ - ([1, 2]), - (["1", "2"]), - (list(pd.date_range("1/1/2011", periods=2, freq="H"))), - (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), - ([pd.Interval(left=0, right=5)]), - ], - ) - def test_constructor_list_str(self, input_vals, string_dtype): - # GH 16605 - # Ensure that data elements are converted to strings when - # dtype is str, 'str', or 'U' - - result = DataFrame({"A": input_vals}, dtype=string_dtype) - expected = DataFrame({"A": input_vals}).astype({"A": string_dtype}) - tm.assert_frame_equal(result, expected) - - def test_constructor_list_str_na(self, string_dtype): - - result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) - expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "data, expected", - [ - # empty - (DataFrame(), True), - # multi-same - (DataFrame({"A": [1, 2], "B": [1, 2]}), True), - # multi-object - ( - DataFrame( - { - "A": np.array([1, 2], dtype=object), - "B": np.array(["a", "b"], dtype=object), - } - ), - True, - ), - # multi-extension - ( - DataFrame( - {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["a", "b"])} - ), - True, - ), - # differ types - (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False), - # differ sizes - ( - DataFrame( - { - "A": np.array([1, 2], dtype=np.int32), - "B": np.array([1, 2], dtype=np.int64), - } - ), - False, - ), - # multi-extension differ - ( - DataFrame( - {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["b", "c"])} - ), - False, - ), - ], - ) - def test_is_homogeneous_type(self, data, expected): - assert data._is_homogeneous_type is expected - - def test_asarray_homogenous(self): - df = DataFrame({"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])}) - result = np.asarray(df) - # may change from object in the future - expected = np.array([[1, 1], [2, 2]], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - def test_str_to_small_float_conversion_type(self): - # GH 20388 - np.random.seed(13) - col_data = [str(np.random.random() * 1e-12) for _ in range(5)] - result = DataFrame(col_data, columns=["A"]) - expected = DataFrame(col_data, columns=["A"], dtype=object) - tm.assert_frame_equal(result, expected) - # change the dtype of the elements from object to float one by one - result.loc[result.index, "A"] = [float(x) for x in col_data] - expected = DataFrame(col_data, columns=["A"], dtype=float) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py new file mode 100644 index 0000000000000..0fca4e988b775 --- /dev/null +++ b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -0,0 +1,49 @@ +import numpy as np +import pytest + +from pandas import Categorical, DataFrame + + +@pytest.mark.parametrize( + "data, expected", + [ + # empty + (DataFrame(), True), + # multi-same + (DataFrame({"A": [1, 2], "B": [1, 2]}), True), + # multi-object + ( + DataFrame( + { + "A": np.array([1, 2], dtype=object), + "B": np.array(["a", "b"], dtype=object), + } + ), + True, + ), + # multi-extension + ( + DataFrame({"A": Categorical(["a", "b"]), "B": Categorical(["a", "b"])}), + True, + ), + # differ types + (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False), + # differ sizes + ( + DataFrame( + { + "A": np.array([1, 2], dtype=np.int32), + "B": np.array([1, 2], dtype=np.int64), + } + ), + False, + ), + # multi-extension differ + ( + DataFrame({"A": Categorical(["a", "b"]), "B": Categorical(["b", "c"])}), + False, + ), + ], +) +def test_is_homogeneous_type(data, expected): + assert data._is_homogeneous_type is expected diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1521f66a6bc61..bbcc286d89986 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2697,6 +2697,31 @@ def test_frame_ctor_datetime64_column(self): df = DataFrame({"A": np.random.randn(len(rng)), "B": dates}) assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]")) + @pytest.mark.parametrize( + "input_vals", + [ + ([1, 2]), + (["1", "2"]), + (list(date_range("1/1/2011", periods=2, freq="H"))), + (list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + ([pd.Interval(left=0, right=5)]), + ], + ) + def test_constructor_list_str(self, input_vals, string_dtype): + # GH#16605 + # Ensure that data elements are converted to strings when + # dtype is str, 'str', or 'U' + + result = DataFrame({"A": input_vals}, dtype=string_dtype) + expected = DataFrame({"A": input_vals}).astype({"A": string_dtype}) + tm.assert_frame_equal(result, expected) + + def test_constructor_list_str_na(self, string_dtype): + + result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) + expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object) + tm.assert_frame_equal(result, expected) + class TestDataFrameConstructorWithDatetimeTZ: def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): diff --git a/pandas/tests/frame/test_npfuncs.py b/pandas/tests/frame/test_npfuncs.py new file mode 100644 index 0000000000000..a3b4c659a4124 --- /dev/null +++ b/pandas/tests/frame/test_npfuncs.py @@ -0,0 +1,16 @@ +""" +Tests for np.foo applied to DataFrame, not necessarily ufuncs. +""" +import numpy as np + +from pandas import Categorical, DataFrame +import pandas._testing as tm + + +class TestAsArray: + def test_asarray_homogenous(self): + df = DataFrame({"A": Categorical([1, 2]), "B": Categorical([1, 2])}) + result = np.asarray(df) + # may change from object in the future + expected = np.array([[1, 1], [2, 2]], dtype="object") + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 31abe45215432..4ef6463fd9e31 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -739,6 +739,15 @@ def test_iloc_with_boolean_operation(self): expected = DataFrame([[0.0, 4.0], [8.0, 12.0], [4.0, 5.0], [6.0, np.nan]]) tm.assert_frame_equal(result, expected) + def test_iloc_getitem_singlerow_slice_categoricaldtype_gives_series(self): + # GH#29521 + df = DataFrame({"x": pd.Categorical("a b c d e".split())}) + result = df.iloc[0] + raw_cat = pd.Categorical(["a"], categories=["a", "b", "c", "d", "e"]) + expected = Series(raw_cat, index=["x"], name=0, dtype="category") + + tm.assert_series_equal(result, expected) + class TestILocSetItemDuplicateColumns: def test_iloc_setitem_scalar_duplicate_columns(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 3b915f13c7568..dd9657ad65ce7 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -978,6 +978,19 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) + def test_loc_setitem_str_to_small_float_conversion_type(self): + # GH#20388 + np.random.seed(13) + col_data = [str(np.random.random() * 1e-12) for _ in range(5)] + result = DataFrame(col_data, columns=["A"]) + expected = DataFrame(col_data, columns=["A"], dtype=object) + tm.assert_frame_equal(result, expected) + + # change the dtype of the elements from object to float one by one + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float) + tm.assert_frame_equal(result, expected) + class TestLocWithMultiIndex: @pytest.mark.parametrize( From 198c4de4984538d5d644a5c0f0d00412a7d21a5f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 5 Nov 2020 19:43:30 -0800 Subject: [PATCH 11/11] REF: implement _wrap_reduction_result --- pandas/core/arrays/_mixins.py | 7 ++++- pandas/core/arrays/categorical.py | 4 +-- pandas/core/arrays/datetimelike.py | 16 +++------- pandas/core/arrays/numpy_.py | 48 +++++++++++++++++++----------- pandas/core/arrays/string_.py | 17 +++++++++++ pandas/core/arrays/timedeltas.py | 4 +-- 6 files changed, 60 insertions(+), 36 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 63c414d96c8de..c1b5897164d76 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -1,4 +1,4 @@ -from typing import Any, Sequence, TypeVar +from typing import Any, Optional, Sequence, TypeVar import numpy as np @@ -254,6 +254,11 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): msg = f"'{type(self).__name__}' does not implement reduction '{name}'" raise TypeError(msg) + def _wrap_reduction_result(self, axis: Optional[int], result): + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) + # ------------------------------------------------------------------------ def __repr__(self) -> str: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index edbf24ca87f5c..57d934a633911 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1957,7 +1957,7 @@ def min(self, *, skipna=True, **kwargs): return np.nan else: pointer = self._codes.min() - return self.categories[pointer] + return self._wrap_reduction_result(None, pointer) @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") def max(self, *, skipna=True, **kwargs): @@ -1993,7 +1993,7 @@ def max(self, *, skipna=True, **kwargs): return np.nan else: pointer = self._codes.max() - return self.categories[pointer] + return self._wrap_reduction_result(None, pointer) def mode(self, dropna=True): """ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 7a0d88f29b9b0..8d90035491d28 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1283,9 +1283,7 @@ def min(self, *, axis=None, skipna=True, **kwargs): return self._from_backing_data(result) result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) - if lib.is_scalar(result): - return self._box_func(result) - return self._from_backing_data(result) + return self._wrap_reduction_result(axis, result) def max(self, *, axis=None, skipna=True, **kwargs): """ @@ -1316,9 +1314,7 @@ def max(self, *, axis=None, skipna=True, **kwargs): return self._from_backing_data(result) result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) - if lib.is_scalar(result): - return self._box_func(result) - return self._from_backing_data(result) + return self._wrap_reduction_result(axis, result) def mean(self, *, skipna=True, axis: Optional[int] = 0): """ @@ -1357,9 +1353,7 @@ def mean(self, *, skipna=True, axis: Optional[int] = 0): result = nanops.nanmean( self._ndarray, axis=axis, skipna=skipna, mask=self.isna() ) - if axis is None or self.ndim == 1: - return self._box_func(result) - return self._from_backing_data(result) + return self._wrap_reduction_result(axis, result) def median(self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs): nv.validate_median((), kwargs) @@ -1378,9 +1372,7 @@ def median(self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs): return self._from_backing_data(result) result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) - if axis is None or self.ndim == 1: - return self._box_func(result) - return self._from_backing_data(result) + return self._wrap_reduction_result(axis, result) class DatelikeOps(DatetimeLikeArrayMixin): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index e1a424b719a4a..20fae20c395e6 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -12,7 +12,6 @@ from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops -from pandas.core.array_algos import masked_reductions from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -273,39 +272,46 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, int]: def any(self, *, axis=None, out=None, keepdims=False, skipna=True): nv.validate_any((), dict(out=out, keepdims=keepdims)) - return nanops.nanany(self._ndarray, axis=axis, skipna=skipna) + result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) def all(self, *, axis=None, out=None, keepdims=False, skipna=True): nv.validate_all((), dict(out=out, keepdims=keepdims)) - return nanops.nanall(self._ndarray, axis=axis, skipna=skipna) + result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) - def min(self, *, skipna: bool = True, **kwargs) -> Scalar: + def min(self, *, axis=None, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) - return masked_reductions.min( - values=self.to_numpy(), mask=self.isna(), skipna=skipna + result = nanops.nanmin( + values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna ) + return self._wrap_reduction_result(axis, result) - def max(self, *, skipna: bool = True, **kwargs) -> Scalar: + def max(self, *, axis=None, skipna: bool = True, **kwargs) -> Scalar: nv.validate_max((), kwargs) - return masked_reductions.max( - values=self.to_numpy(), mask=self.isna(), skipna=skipna + result = nanops.nanmax( + values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna ) + return self._wrap_reduction_result(axis, result) def sum(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: nv.validate_sum((), kwargs) - return nanops.nansum( + result = nanops.nansum( self._ndarray, axis=axis, skipna=skipna, min_count=min_count ) + return self._wrap_reduction_result(axis, result) def prod(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: nv.validate_prod((), kwargs) - return nanops.nanprod( + result = nanops.nanprod( self._ndarray, axis=axis, skipna=skipna, min_count=min_count ) + return self._wrap_reduction_result(axis, result) def mean(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims)) - return nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) + result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) def median( self, *, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True @@ -313,7 +319,8 @@ def median( nv.validate_median( (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims) ) - return nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) + result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) def std( self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True @@ -321,7 +328,8 @@ def std( nv.validate_stat_ddof_func( (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" ) - return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + return self._wrap_reduction_result(axis, result) def var( self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True @@ -329,7 +337,8 @@ def var( nv.validate_stat_ddof_func( (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="var" ) - return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + return self._wrap_reduction_result(axis, result) def sem( self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True @@ -337,19 +346,22 @@ def sem( nv.validate_stat_ddof_func( (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="sem" ) - return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + return self._wrap_reduction_result(axis, result) def kurt(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_stat_ddof_func( (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="kurt" ) - return nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) + result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) def skew(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_stat_ddof_func( (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="skew" ) - return nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) + result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) # ------------------------------------------------------------------------ # Additional Methods diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8231a5fa0509b..a51dd1098a359 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -3,6 +3,8 @@ import numpy as np from pandas._libs import lib, missing as libmissing +from pandas._typing import Scalar +from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype from pandas.core.dtypes.common import ( @@ -15,6 +17,7 @@ ) from pandas.core import ops +from pandas.core.array_algos import masked_reductions from pandas.core.arrays import IntegerArray, PandasArray from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array @@ -301,6 +304,20 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: + nv.validate_min((), kwargs) + result = masked_reductions.min( + values=self.to_numpy(), mask=self.isna(), skipna=skipna + ) + return self._wrap_reduction_result(axis, result) + + def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: + nv.validate_max((), kwargs) + result = masked_reductions.max( + values=self.to_numpy(), mask=self.isna(), skipna=skipna + ) + return self._wrap_reduction_result(axis, result) + def value_counts(self, dropna=False): from pandas import value_counts diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 8a87df18b6adb..c227c071546ce 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -381,9 +381,7 @@ def sum( result = nanops.nansum( self._ndarray, axis=axis, skipna=skipna, min_count=min_count ) - if axis is None or self.ndim == 1: - return self._box_func(result) - return self._from_backing_data(result) + return self._wrap_reduction_result(axis, result) def std( self,