From 854da9e80257ed48dc5dc26bd787a7036494fe78 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 15 Oct 2020 13:51:34 -0700 Subject: [PATCH 1/4] BUG: DataFrame[td64].sum(skipna=False) --- pandas/core/arrays/timedeltas.py | 8 ++-- pandas/core/nanops.py | 66 +++++++++++++++++++++----- pandas/tests/arrays/test_timedeltas.py | 24 ++++++++++ pandas/tests/frame/test_analytics.py | 26 ++++++++++ pandas/tests/test_nanops.py | 17 +++++++ 5 files changed, 125 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 82cd54182a33d..64e5f78d961d1 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -381,15 +381,15 @@ def sum( nv.validate_sum( (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) ) - if not len(self): - return NaT - if not skipna and self._hasnans: + if not self.size and (self.ndim == 1 or axis is None): return NaT result = nanops.nansum( self._data, axis=axis, skipna=skipna, min_count=min_count ) - return Timedelta(result) + if is_scalar(result): + return Timedelta(result) + return self._from_backing_data(result) def std( self, diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f2354f649b1e3..289005825c289 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -327,7 +327,10 @@ def _na_ok_dtype(dtype: DtypeObj) -> bool: def _wrap_results(result, dtype: DtypeObj, fill_value=None): """ wrap our results if needed """ - if is_datetime64_any_dtype(dtype): + if result is NaT: + pass + + elif is_datetime64_any_dtype(dtype): if fill_value is None: # GH#24293 fill_value = iNaT @@ -498,18 +501,40 @@ def nansum( >>> nanops.nansum(s) 3.0 """ + orig_values = values + values, mask, dtype, dtype_max, _ = _get_values( values, skipna, fill_value=0, mask=mask ) dtype_sum = dtype_max + datetimelike = False if is_float_dtype(dtype): dtype_sum = dtype elif is_timedelta64_dtype(dtype): + datetimelike = True dtype_sum = np.float64 + if mask is None and 
not skipna: + mask = isna(orig_values) + the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) - return _wrap_results(the_sum, dtype) + the_sum = _wrap_results(the_sum, dtype) + if datetimelike and not skipna: + the_sum = mask_datetimelike_result(the_sum, axis, mask, orig_values.dtype) + return the_sum + + +def mask_datetimelike_result(result, axis, mask, orig_dtype): + if isinstance(result, np.ndarray): + # we need to apply the mask + result = result.astype("i8").view(orig_dtype) + axis_mask = mask.any(axis=axis) + result[axis_mask] = iNaT + else: + if mask.any(): + result = NaT + return result @disallow(PeriodDtype) @@ -544,21 +569,27 @@ def nanmean( >>> nanops.nanmean(s) 1.5 """ + orig_values = values + values, mask, dtype, dtype_max, _ = _get_values( values, skipna, fill_value=0, mask=mask ) dtype_sum = dtype_max dtype_count = np.float64 + # not using needs_i8_conversion because that includes period - if ( - is_integer_dtype(dtype) - or is_datetime64_any_dtype(dtype) - or is_timedelta64_dtype(dtype) - ): + datetimelike = False + if dtype.kind in ["m", "M"]: + datetimelike = True + if mask is None and not skipna: + mask = isna(orig_values) + dtype_sum = np.float64 + elif is_integer_dtype(dtype): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype dtype_count = dtype + count = _get_counts(values.shape, mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) @@ -573,7 +604,10 @@ def nanmean( else: the_mean = the_sum / count if count > 0 else np.nan - return _wrap_results(the_mean, dtype) + the_mean = _wrap_results(the_mean, dtype) + if datetimelike and not skipna: + the_mean = mask_datetimelike_result(the_mean, axis, mask, orig_values.dtype) + return the_mean @bottleneck_switch() @@ -639,16 +673,24 @@ def get_median(x): # empty set so return nans of shape "everything but the passed axis" # since "axis" is where the reduction would occur if 
we had a nonempty # array - shp = np.array(values.shape) - dims = np.arange(values.ndim) - ret = np.empty(shp[dims != axis]) - ret.fill(np.nan) + ret = get_empty_reduction_result(values.shape, axis, np.float_, np.nan) return _wrap_results(ret, dtype) # otherwise return a scalar value return _wrap_results(get_median(values) if notempty else np.nan, dtype) +def get_empty_reduction_result(shape, axis: int, dtype, fill_value) -> np.ndarray: + """ + The result from a reduction on an empty ndarray. + """ + shp = np.array(shape) + dims = np.arange(len(shape)) + ret = np.empty(shp[dims != axis], dtype=dtype) + ret.fill(fill_value) + return ret + + def _get_counts_nanvar( value_counts: Tuple[int], mask: Optional[np.ndarray], diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index b3b8f4d55e4de..a09e85010318c 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -251,6 +251,30 @@ def test_npsum(self): assert isinstance(result, pd.Timedelta) assert result == expected + def test_sum_2d_skipna_false(self): + arr = np.arange(8).astype(np.int64).view("m8[s]").astype("m8[ns]").reshape(4, 2) + arr[-1, -1] = "Nat" + + tda = TimedeltaArray(arr) + + result = tda.sum(skipna=False) + assert result is pd.NaT + + result = tda.sum(axis=0, skipna=False) + expected = pd.TimedeltaIndex([pd.Timedelta(seconds=12), pd.NaT])._values + tm.assert_timedelta_array_equal(result, expected) + + result = tda.sum(axis=1, skipna=False) + expected = pd.TimedeltaIndex( + [ + pd.Timedelta(seconds=1), + pd.Timedelta(seconds=5), + pd.Timedelta(seconds=9), + pd.NaT, + ] + )._values + tm.assert_timedelta_array_equal(result, expected) + def test_std(self): tdi = pd.TimedeltaIndex(["0H", "4H", "NaT", "4H", "0H", "2H"]) arr = tdi.array diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index ee136533b0775..a1f45324a920f 100644 --- a/pandas/tests/frame/test_analytics.py +++ 
b/pandas/tests/frame/test_analytics.py @@ -1273,6 +1273,32 @@ def test_preserve_timezone(self, initial: str, method): tm.assert_series_equal(result, expected) +def test_sum_timedelta64_skipna_false(): + # GH#17235 + arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2) + arr[-1, -1] = "Nat" + + df = pd.DataFrame(arr) + + result = df.sum(skipna=False) + expected = pd.Series([pd.Timedelta(seconds=12), pd.NaT]) + tm.assert_series_equal(result, expected) + + result = df.sum(axis=0, skipna=False) + tm.assert_series_equal(result, expected) + + result = df.sum(axis=1, skipna=False) + expected = pd.Series( + [ + pd.Timedelta(seconds=1), + pd.Timedelta(seconds=5), + pd.Timedelta(seconds=9), + pd.NaT, + ] + ) + tm.assert_series_equal(result, expected) + + def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = pd.DataFrame([["a", 1]], columns=list("ab")) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index c45e4508c6153..da474f2c2978c 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1005,6 +1005,23 @@ def test_nanmean(self, tz): result = nanops.nanmean(obj) assert result == expected + @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) + def test_nanmean_skipna_false(self, dtype): + arr = np.arange(12).astype(np.int64).view(dtype).reshape(4, 3) + + arr[-1, -1] = "NaT" + + result = nanops.nanmean(arr, skipna=False) + assert result is pd.NaT + + result = nanops.nanmean(arr, axis=0, skipna=False) + expected = np.array([4, 5, "NaT"], dtype=arr.dtype) + tm.assert_numpy_array_equal(result, expected) + + result = nanops.nanmean(arr, axis=1, skipna=False) + expected = np.array([arr[0, 1], arr[1, 1], arr[2, 1], arr[-1, -1]]) + tm.assert_numpy_array_equal(result, expected) + def test_use_bottleneck(): From 58182aa41d88b1c6e696aeec3cce3a56659f661c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 16 Oct 2020 19:03:10 -0700 Subject: [PATCH 2/4] annotate, privatize --- 
pandas/core/nanops.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 289005825c289..fd26e5b678bef 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -521,11 +521,16 @@ def nansum( the_sum = _wrap_results(the_sum, dtype) if datetimelike and not skipna: - the_sum = mask_datetimelike_result(the_sum, axis, mask, orig_values.dtype) + the_sum = _mask_datetimelike_result(the_sum, axis, mask, orig_values.dtype) return the_sum -def mask_datetimelike_result(result, axis, mask, orig_dtype): +def _mask_datetimelike_result( + result: Union[np.ndarray, np.datetime64, np.timedelta64], + axis: Optional[int], + mask: np.ndarray, + orig_dtype: np.dtype, +): if isinstance(result, np.ndarray): # we need to apply the mask result = result.astype("i8").view(orig_dtype) @@ -606,7 +611,7 @@ def nanmean( the_mean = _wrap_results(the_mean, dtype) if datetimelike and not skipna: - the_mean = mask_datetimelike_result(the_mean, axis, mask, orig_values.dtype) + the_mean = _mask_datetimelike_result(the_mean, axis, mask, orig_values.dtype) return the_mean From 482ac89cd82330eaa273e3500eb9fcfd1e0d03c9 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 17 Oct 2020 17:05:06 -0700 Subject: [PATCH 3/4] annotate --- pandas/core/nanops.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index fd26e5b678bef..cee11f6fa5888 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -685,9 +685,22 @@ def get_median(x): return _wrap_results(get_median(values) if notempty else np.nan, dtype) -def get_empty_reduction_result(shape, axis: int, dtype, fill_value) -> np.ndarray: +def get_empty_reduction_result( + shape: Tuple[int, ...], axis: int, dtype: np.dtype, fill_value: Any +) -> np.ndarray: """ The result from a reduction on an empty ndarray. 
+ + Parameters + ---------- + shape : Tuple[int, ...] + axis : int + dtype : np.dtype + fill_value : Any + + Returns + ------- + np.ndarray """ shp = np.array(shape) dims = np.arange(len(shape)) From 3987063c84173a510ebfca6a4568ddeac57640af Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 18 Oct 2020 19:00:49 -0700 Subject: [PATCH 4/4] calculate mask in mask_datetimelike_result --- pandas/core/nanops.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index cee11f6fa5888..83399a87e5667 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -513,27 +513,27 @@ def nansum( elif is_timedelta64_dtype(dtype): datetimelike = True dtype_sum = np.float64 - if mask is None and not skipna: - mask = isna(orig_values) the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) the_sum = _wrap_results(the_sum, dtype) if datetimelike and not skipna: - the_sum = _mask_datetimelike_result(the_sum, axis, mask, orig_values.dtype) + the_sum = _mask_datetimelike_result(the_sum, axis, mask, orig_values) return the_sum def _mask_datetimelike_result( result: Union[np.ndarray, np.datetime64, np.timedelta64], axis: Optional[int], - mask: np.ndarray, - orig_dtype: np.dtype, + mask: Optional[np.ndarray], + orig_values: np.ndarray, ): + if mask is None: + mask = isna(orig_values) if isinstance(result, np.ndarray): # we need to apply the mask - result = result.astype("i8").view(orig_dtype) + result = result.astype("i8").view(orig_values.dtype) axis_mask = mask.any(axis=axis) result[axis_mask] = iNaT else: @@ -586,8 +586,6 @@ def nanmean( datetimelike = False if dtype.kind in ["m", "M"]: datetimelike = True - if mask is None and not skipna: - mask = isna(orig_values) dtype_sum = np.float64 elif is_integer_dtype(dtype): dtype_sum = np.float64 @@ -611,7 +609,7 @@ def nanmean( the_mean = _wrap_results(the_mean, dtype) if datetimelike and not skipna: 
- the_mean = _mask_datetimelike_result(the_mean, axis, mask, orig_values.dtype) + the_mean = _mask_datetimelike_result(the_mean, axis, mask, orig_values) return the_mean