From 08ff20f7d20834bbbbff22abcbc5dbe34a7e6b64 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 24 Oct 2020 12:59:31 -0700 Subject: [PATCH 1/8] checkpoint tests passing --- pandas/compat/numpy/function.py | 4 +- pandas/core/arrays/datetimelike.py | 57 ++++++++++++++++------------ pandas/core/nanops.py | 17 ++++++++- pandas/tests/frame/test_analytics.py | 25 ++++++++++++ 4 files changed, 76 insertions(+), 27 deletions(-) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index c074b06042e26..c2e91c7877d35 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -387,7 +387,7 @@ def validate_resampler_func(method: str, args, kwargs) -> None: raise TypeError("too many arguments passed in") -def validate_minmax_axis(axis: Optional[int]) -> None: +def validate_minmax_axis(axis: Optional[int], ndim: int = 1) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero or None, as otherwise it will be incorrectly ignored. @@ -395,12 +395,12 @@ def validate_minmax_axis(axis: Optional[int]) -> None: Parameters ---------- axis : int or None + ndim : int, default 1 Raises ------ ValueError """ - ndim = 1 # hard-coded for Index if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4523ea1030ef1..476428e833b3e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1264,13 +1264,21 @@ def min(self, axis=None, skipna=True, *args, **kwargs): Series.min : Return the minimum value in a Series. """ nv.validate_min(args, kwargs) - nv.validate_minmax_axis(axis) + nv.validate_minmax_axis(axis, self.ndim) - result = nanops.nanmin(self.asi8, skipna=skipna, mask=self.isna()) - if isna(result): - # Period._from_ordinal does not handle np.nan gracefully - return NaT - return self._box_func(result) + # View as M8[ns] to get correct NaT/masking semantics for PeriodDtype + result = nanops.nanmin( + self._ndarray.view("M8[ns]"), skipna=skipna, mask=self.isna() + ) + if lib.is_scalar(result): + if isna(result): + # Period._from_ordinal does not handle NaT gracefully + return NaT + # nanops may unwantedly cast to Timestamp + result = getattr(result, "value", result) + return self._box_func(result) + result = result.astype("i8", copy=False) + return self._from_backing_data(result) def max(self, axis=None, skipna=True, *args, **kwargs): """ @@ -1286,23 +1294,21 @@ def max(self, axis=None, skipna=True, *args, **kwargs): # TODO: skipna is broken with max. # See https://github.com/pandas-dev/pandas/issues/24265 nv.validate_max(args, kwargs) - nv.validate_minmax_axis(axis) - - mask = self.isna() - if skipna: - values = self[~mask].asi8 - elif mask.any(): - return NaT - else: - values = self.asi8 + nv.validate_minmax_axis(axis, self.ndim) - if not len(values): - # short-circuit for empty max / min - return NaT - - result = nanops.nanmax(values, skipna=skipna) - # Don't have to worry about NA `result`, since no NA went in. - return self._box_func(result) + # View as M8[ns] to get correct NaT/masking semantics for PeriodDtype + result = nanops.nanmax( + self._ndarray.view("M8[ns]"), skipna=skipna, mask=self.isna() + ) + if lib.is_scalar(result): + if isna(result): + # Period._from_ordinal does not handle NaT gracefully + return NaT + # nanops may unwantedly cast to Timestamp + result = getattr(result, "value", result) + return self._box_func(result) + result = result.astype("i8", copy=False) + return self._from_backing_data(result) def mean(self, skipna=True): """ @@ -1359,7 +1365,10 @@ def median(self, axis: Optional[int] = None, skipna: bool = True, *args, **kwarg if axis is not None and abs(axis) >= self.ndim: raise ValueError("abs(axis) must be less than ndim") - if self.size == 0: + if not self.size and (self.ndim == 1 or axis is None): + return NaT + + if self.size == 0: # TODO: try to do this directly in nanmedian if self.ndim == 1 or axis is None: return NaT shape = list(self.shape) @@ -1369,7 +1378,7 @@ def median(self, axis: Optional[int] = None, skipna: bool = True, *args, **kwarg result.fill(iNaT) return self._from_backing_data(result) - mask = self.isna() + mask = self.isna() # TODO: can we use ndarray instead of asi8? result = nanops.nanmedian(self.asi8, axis=axis, skipna=skipna, mask=mask) if axis is None or self.ndim == 1: return self._box_func(result) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 83399a87e5667..2b4cefd46f4b7 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -917,10 +917,15 @@ def reduction( mask: Optional[np.ndarray] = None, ) -> Dtype: + orig_values = values values, mask, dtype, dtype_max, fill_value = _get_values( values, skipna, fill_value_typ=fill_value_typ, mask=mask ) + datetimelike = orig_values.dtype.kind in ["m", "M"] + if datetimelike and mask is None: + mask = isna(orig_values) + if (axis is not None and values.shape[axis] == 0) or values.size == 0: try: result = getattr(values, meth)(axis, dtype=dtype_max) @@ -931,7 +936,17 @@ def reduction( result = getattr(values, meth)(axis) result = _wrap_results(result, dtype, fill_value) - return _maybe_null_out(result, axis, mask, values.shape) + result = _maybe_null_out(result, axis, mask, values.shape) + + if datetimelike and not skipna: + if axis is None or values.ndim == 1: + if mask.any(): + return orig_values.dtype.type("NaT") + else: + axis_mask = mask.any(axis=axis) + result[axis_mask] = orig_values.dtype.type("NaT") + + return result return reduction diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index ddca67306d804..14dbcf6b10ade 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1153,6 +1153,31 @@ def test_min_max_dt64_with_NaT(self): exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture): + # GH#36907 + tz = tz_naive_fixture + df = pd.DataFrame( + { + "a": [ + pd.Timestamp("2020-01-01 08:00:00", tz=tz), + pd.Timestamp("1920-02-01 09:00:00", tz=tz), + ], + "b": [pd.Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT], + } + ) + + res = df.min(axis=1, skipna=False) + expected = pd.Series([df.loc[0, "a"], pd.NaT]) + assert expected.dtype == df["a"].dtype + + tm.assert_series_equal(res, expected) + + res = df.max(axis=1, skipna=False) + expected = pd.Series([df.loc[0, "b"], pd.NaT]) + assert expected.dtype == df["a"].dtype + + tm.assert_series_equal(res, expected) + def test_min_max_dt64_api_consistency_with_NaT(self): # Calling the following sum functions returned an error for dataframes but # returned NaT for series. These tests check that the API is consistent in From 0f6b64d07acb9321cbdc344eabe314817afe430d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 25 Oct 2020 12:41:31 -0700 Subject: [PATCH 2/8] checkpoint tests passing --- pandas/core/arrays/datetimelike.py | 41 ++++++++++++------------------ pandas/core/nanops.py | 10 ++++++-- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 476428e833b3e..a7524930d9436 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1266,18 +1266,15 @@ def min(self, axis=None, skipna=True, *args, **kwargs): nv.validate_min(args, kwargs) nv.validate_minmax_axis(axis, self.ndim) - # View as M8[ns] to get correct NaT/masking semantics for PeriodDtype - result = nanops.nanmin( - self._ndarray.view("M8[ns]"), skipna=skipna, mask=self.isna() - ) + if is_period_dtype(self.dtype): + result = self.to_timestamp("S").min(axis=axis, skipna=skipna) + if result is not NaT: + result = result.to_period(self.freq) + return result + + result = nanops.nanmin(self._ndarray, skipna=skipna, mask=self.isna()) if lib.is_scalar(result): - if isna(result): - # Period._from_ordinal does not handle NaT gracefully - return NaT - # nanops may unwantedly cast to Timestamp - result = getattr(result, "value", result) return self._box_func(result) - result = result.astype("i8", copy=False) return self._from_backing_data(result) def max(self, axis=None, skipna=True, *args, **kwargs): @@ -1296,18 +1293,15 @@ def max(self, axis=None, skipna=True, *args, **kwargs): nv.validate_max(args, kwargs) nv.validate_minmax_axis(axis, self.ndim) - # View as M8[ns] to get correct NaT/masking semantics for PeriodDtype - result = nanops.nanmax( - self._ndarray.view("M8[ns]"), skipna=skipna, mask=self.isna() - ) + if is_period_dtype(self.dtype): + result = self.to_timestamp("S").max(axis=axis, skipna=skipna) + if result is not NaT: + result = result.to_period(self.freq) + return result + + result = nanops.nanmax(self._ndarray, skipna=skipna, mask=self.isna()) if lib.is_scalar(result): - if isna(result): - # Period._from_ordinal does not handle NaT gracefully - return NaT - # nanops may unwantedly cast to Timestamp - result = getattr(result, "value", result) return self._box_func(result) - result = result.astype("i8", copy=False) return self._from_backing_data(result) def mean(self, skipna=True): @@ -1365,10 +1359,7 @@ def median(self, axis: Optional[int] = None, skipna: bool = True, *args, **kwarg if axis is not None and abs(axis) >= self.ndim: raise ValueError("abs(axis) must be less than ndim") - if not self.size and (self.ndim == 1 or axis is None): - return NaT - - if self.size == 0: # TODO: try to do this directly in nanmedian + if self.size == 0: if self.ndim == 1 or axis is None: return NaT shape = list(self.shape) @@ -1378,7 +1369,7 @@ def median(self, axis: Optional[int] = None, skipna: bool = True, *args, **kwarg result.fill(iNaT) return self._from_backing_data(result) - mask = self.isna() # TODO: can we use ndarray instead of asi8? + mask = self.isna() result = nanops.nanmedian(self.asi8, axis=axis, skipna=skipna, mask=mask) if axis is None or self.ndim == 1: return self._box_func(result) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 2b4cefd46f4b7..7f9c1b1cc5611 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -337,9 +337,15 @@ def _wrap_results(result, dtype: DtypeObj, fill_value=None): if not isinstance(result, np.ndarray): tz = getattr(dtype, "tz", None) assert not isna(fill_value), "Expected non-null fill_value" - if result == fill_value: + # assert tz is None # hit in nanmean + if tz is not None: + # we get here e.g. via nanmean when we call it on a DTA[tz] + result = Timestamp(result, tz=tz) + elif result == fill_value: result = np.nan - result = Timestamp(result, tz=tz) + result = np.datetime64("NaT", "ns") + else: + result = np.int64(result).view("datetime64[ns]") else: # If we have float dtype, taking a view will give the wrong result result = result.astype(dtype) From 30ca9d29d03b3a2d3d8d94638d7924d2818f82d1 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Oct 2020 08:50:19 -0700 Subject: [PATCH 3/8] follow-up clenaup --- pandas/core/nanops.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 6588906cf6e2f..1c6d762c28b17 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -337,7 +337,6 @@ def _wrap_results(result, dtype: DtypeObj, fill_value=None): if not isinstance(result, np.ndarray): tz = getattr(dtype, "tz", None) assert not isna(fill_value), "Expected non-null fill_value" - # assert tz is None # hit in nanmean if tz is not None: # we get here e.g. via nanmean when we call it on a DTA[tz] result = Timestamp(result, tz=tz) @@ -925,8 +924,6 @@ def reduction( ) datetimelike = orig_values.dtype.kind in ["m", "M"] - if datetimelike and mask is None: - mask = isna(orig_values) if (axis is not None and values.shape[axis] == 0) or values.size == 0: try: @@ -941,12 +938,7 @@ def reduction( result = _maybe_null_out(result, axis, mask, values.shape) if datetimelike and not skipna: - if axis is None or values.ndim == 1: - if mask.any(): - return orig_values.dtype.type("NaT") - else: - axis_mask = mask.any(axis=axis) - result[axis_mask] = orig_values.dtype.type("NaT") + result = _mask_datetimelike_result(result, axis, mask, orig_values) return result From 1eda3a86b9424c1ecc8398b6b99ca18e3cf57863 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Oct 2020 10:30:31 -0700 Subject: [PATCH 4/8] cln: no need to pass mask --- pandas/core/arrays/datetimelike.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a7524930d9436..9beeb66215b8b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1272,7 +1272,7 @@ def min(self, axis=None, skipna=True, *args, **kwargs): result = result.to_period(self.freq) return result - result = nanops.nanmin(self._ndarray, skipna=skipna, mask=self.isna()) + result = nanops.nanmin(self._ndarray, skipna=skipna) if lib.is_scalar(result): return self._box_func(result) return self._from_backing_data(result) @@ -1299,7 +1299,7 @@ def max(self, axis=None, skipna=True, *args, **kwargs): result = result.to_period(self.freq) return result - result = nanops.nanmax(self._ndarray, skipna=skipna, mask=self.isna()) + result = nanops.nanmax(self._ndarray, skipna=skipna) if lib.is_scalar(result): return self._box_func(result) return self._from_backing_data(result) From 2c46e67f83b3647bc111eef98a7bb0b7a9e4d77c Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Oct 2020 10:50:26 -0700 Subject: [PATCH 5/8] unnecessary mask arg --- pandas/core/arrays/datetimelike.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 9beeb66215b8b..cb36db2eb151c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1272,7 +1272,7 @@ def min(self, axis=None, skipna=True, *args, **kwargs): result = result.to_period(self.freq) return result - result = nanops.nanmin(self._ndarray, skipna=skipna) + result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) if lib.is_scalar(result): return self._box_func(result) return self._from_backing_data(result) @@ -1299,7 +1299,7 @@ def max(self, axis=None, skipna=True, *args, **kwargs): result = result.to_period(self.freq) return result - result = nanops.nanmax(self._ndarray, skipna=skipna) + result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) if lib.is_scalar(result): return self._box_func(result) return self._from_backing_data(result) From 471bf94071c22e2ebcc531ef7964eb4244bd9f54 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Oct 2020 11:16:01 -0700 Subject: [PATCH 6/8] lint fixup --- pandas/tests/frame/test_analytics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 8e49037ae8b25..f2847315f4959 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1172,24 +1172,24 @@ def test_min_max_dt64_with_NaT(self): def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture): # GH#36907 tz = tz_naive_fixture - df = pd.DataFrame( + df = DataFrame( { "a": [ - pd.Timestamp("2020-01-01 08:00:00", tz=tz), - pd.Timestamp("1920-02-01 09:00:00", tz=tz), + Timestamp("2020-01-01 08:00:00", tz=tz), + Timestamp("1920-02-01 09:00:00", tz=tz), ], - "b": [pd.Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT], + "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT], } ) res = df.min(axis=1, skipna=False) - expected = pd.Series([df.loc[0, "a"], pd.NaT]) + expected = Series([df.loc[0, "a"], pd.NaT]) assert expected.dtype == df["a"].dtype tm.assert_series_equal(res, expected) res = df.max(axis=1, skipna=False) - expected = pd.Series([df.loc[0, "b"], pd.NaT]) + expected = Series([df.loc[0, "b"], pd.NaT]) assert expected.dtype == df["a"].dtype tm.assert_series_equal(res, expected) From 8cc32d339ba1c435b365f15f365794b16f276460 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Oct 2020 08:17:32 -0700 Subject: [PATCH 7/8] whatsnew, non-copy for perioddtype --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/datetimelike.py | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f1f24ab7a101b..d6169d6a61038 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -404,6 +404,7 @@ Numeric - Bug in :class:`IntervalArray` comparisons with :class:`Series` not returning :class:`Series` (:issue:`36908`) - Bug in :class:`DataFrame` allowing arithmetic operations with list of array-likes with undefined results. Behavior changed to raising ``ValueError`` (:issue:`36702`) - Bug in :meth:`DataFrame.std`` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`) +- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a7956c8590035..f3d629f5e8f0a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1267,10 +1267,14 @@ def min(self, axis=None, skipna=True, *args, **kwargs): nv.validate_minmax_axis(axis, self.ndim) if is_period_dtype(self.dtype): - result = self.to_timestamp("S").min(axis=axis, skipna=skipna) - if result is not NaT: - result = result.to_period(self.freq) - return result + # pass datetime64 values to nanops to get correct NaT semantics + result = nanops.nanmin( + self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna + ) + result = result.view("i8") + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) if lib.is_scalar(result): @@ -1294,10 +1298,14 @@ def max(self, axis=None, skipna=True, *args, **kwargs): nv.validate_minmax_axis(axis, self.ndim) if is_period_dtype(self.dtype): - result = self.to_timestamp("S").max(axis=axis, skipna=skipna) - if result is not NaT: - result = result.to_period(self.freq) - return result + # pass datetime64 values to nanops to get correct NaT semantics + result = nanops.nanmax( + self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna + ) + result = result.view("i8") + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) if lib.is_scalar(result): From 5ade21bd2a67ec78ded475e466968116469bfe72 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Oct 2020 08:49:04 -0700 Subject: [PATCH 8/8] catch nat --- pandas/core/arrays/datetimelike.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f3d629f5e8f0a..a6c74294c4c75 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1271,6 +1271,8 @@ def min(self, axis=None, skipna=True, *args, **kwargs): result = nanops.nanmin( self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna ) + if result is NaT: + return NaT result = result.view("i8") if axis is None or self.ndim == 1: return self._box_func(result) @@ -1302,6 +1304,8 @@ def max(self, axis=None, skipna=True, *args, **kwargs): result = nanops.nanmax( self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna ) + if result is NaT: + return result result = result.view("i8") if axis is None or self.ndim == 1: return self._box_func(result)