BUG: DataFrame[td64].sum(skipna=False) (pandas-dev#37148)

jbrockmendel · JulianWgs · commit 5859c159d8eb · 2020-10-26T14:13:23.000+01:00
* BUG: DataFrame[td64].sum(skipna=False)

* annotate, privatize

* annotate

* calculate mask in _mask_datetimelike_result
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -381,15 +381,15 @@ def sum(
         nv.validate_sum(
             (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial)
         )
-        if not len(self):
-            return NaT
-        if not skipna and self._hasnans:
+        if not self.size and (self.ndim == 1 or axis is None):
             return NaT
 
         result = nanops.nansum(
             self._data, axis=axis, skipna=skipna, min_count=min_count
         )
-        return Timedelta(result)
+        if is_scalar(result):
+            return Timedelta(result)
+        return self._from_backing_data(result)
 
     def std(
         self,
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -327,7 +327,10 @@ def _na_ok_dtype(dtype: DtypeObj) -> bool:
 
 def _wrap_results(result, dtype: DtypeObj, fill_value=None):
     """ wrap our results if needed """
-    if is_datetime64_any_dtype(dtype):
+    if result is NaT:
+        pass
+
+    elif is_datetime64_any_dtype(dtype):
         if fill_value is None:
             # GH#24293
             fill_value = iNaT
@@ -498,18 +501,45 @@ def nansum(
     >>> nanops.nansum(s)
     3.0
     """
+    orig_values = values
+
     values, mask, dtype, dtype_max, _ = _get_values(
         values, skipna, fill_value=0, mask=mask
     )
     dtype_sum = dtype_max
+    datetimelike = False
     if is_float_dtype(dtype):
         dtype_sum = dtype
     elif is_timedelta64_dtype(dtype):
+        datetimelike = True
         dtype_sum = np.float64
+
     the_sum = values.sum(axis, dtype=dtype_sum)
     the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
 
-    return _wrap_results(the_sum, dtype)
+    the_sum = _wrap_results(the_sum, dtype)
+    if datetimelike and not skipna:
+        the_sum = _mask_datetimelike_result(the_sum, axis, mask, orig_values)
+    return the_sum
+
+
+def _mask_datetimelike_result(
+    result: Union[np.ndarray, np.datetime64, np.timedelta64],
+    axis: Optional[int],
+    mask: Optional[np.ndarray],
+    orig_values: np.ndarray,
+):
+    if mask is None:
+        mask = isna(orig_values)
+    if isinstance(result, np.ndarray):
+        # array result: set NaT at each position that reduced over an NA
+        result = result.astype("i8").view(orig_values.dtype)
+        axis_mask = mask.any(axis=axis)
+        result[axis_mask] = iNaT
+    else:
+        if mask.any():
+            result = NaT
+    return result
 
 
 @disallow(PeriodDtype)
@@ -544,21 +574,25 @@ def nanmean(
     >>> nanops.nanmean(s)
     1.5
     """
+    orig_values = values
+
     values, mask, dtype, dtype_max, _ = _get_values(
         values, skipna, fill_value=0, mask=mask
     )
     dtype_sum = dtype_max
     dtype_count = np.float64
+
     # not using needs_i8_conversion because that includes period
-    if (
-        is_integer_dtype(dtype)
-        or is_datetime64_any_dtype(dtype)
-        or is_timedelta64_dtype(dtype)
-    ):
+    datetimelike = False
+    if dtype.kind in ["m", "M"]:
+        datetimelike = True
+        dtype_sum = np.float64
+    elif is_integer_dtype(dtype):
         dtype_sum = np.float64
     elif is_float_dtype(dtype):
         dtype_sum = dtype
         dtype_count = dtype
+
     count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
     the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
 
@@ -573,7 +607,10 @@ def nanmean(
     else:
         the_mean = the_sum / count if count > 0 else np.nan
 
-    return _wrap_results(the_mean, dtype)
+    the_mean = _wrap_results(the_mean, dtype)
+    if datetimelike and not skipna:
+        the_mean = _mask_datetimelike_result(the_mean, axis, mask, orig_values)
+    return the_mean
 
 
 @bottleneck_switch()
@@ -639,16 +676,37 @@ def get_median(x):
         # empty set so return nans of shape "everything but the passed axis"
         # since "axis" is where the reduction would occur if we had a nonempty
         # array
-        shp = np.array(values.shape)
-        dims = np.arange(values.ndim)
-        ret = np.empty(shp[dims != axis])
-        ret.fill(np.nan)
+        ret = get_empty_reduction_result(values.shape, axis, np.float_, np.nan)
         return _wrap_results(ret, dtype)
 
     # otherwise return a scalar value
     return _wrap_results(get_median(values) if notempty else np.nan, dtype)
 
 
+def get_empty_reduction_result(
+    shape: Tuple[int, ...], axis: int, dtype: np.dtype, fill_value: Any
+) -> np.ndarray:
+    """
+    The result from a reduction on an empty ndarray.
+
+    Parameters
+    ----------
+    shape : Tuple[int, ...]
+    axis : int
+    dtype : np.dtype
+    fill_value : Any
+
+    Returns
+    -------
+    np.ndarray
+    """
+    shp = np.array(shape)
+    dims = np.arange(len(shape))
+    ret = np.empty(shp[dims != axis], dtype=dtype)
+    ret.fill(fill_value)
+    return ret
+
+
 def _get_counts_nanvar(
     value_counts: Tuple[int],
     mask: Optional[np.ndarray],
diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py
@@ -251,6 +251,30 @@ def test_npsum(self):
         assert isinstance(result, pd.Timedelta)
         assert result == expected
 
+    def test_sum_2d_skipna_false(self):
+        arr = np.arange(8).astype(np.int64).view("m8[s]").astype("m8[ns]").reshape(4, 2)
+        arr[-1, -1] = "Nat"
+
+        tda = TimedeltaArray(arr)
+
+        result = tda.sum(skipna=False)
+        assert result is pd.NaT
+
+        result = tda.sum(axis=0, skipna=False)
+        expected = pd.TimedeltaIndex([pd.Timedelta(seconds=12), pd.NaT])._values
+        tm.assert_timedelta_array_equal(result, expected)
+
+        result = tda.sum(axis=1, skipna=False)
+        expected = pd.TimedeltaIndex(
+            [
+                pd.Timedelta(seconds=1),
+                pd.Timedelta(seconds=5),
+                pd.Timedelta(seconds=9),
+                pd.NaT,
+            ]
+        )._values
+        tm.assert_timedelta_array_equal(result, expected)
+
     def test_std(self):
         tdi = pd.TimedeltaIndex(["0H", "4H", "NaT", "4H", "0H", "2H"])
         arr = tdi.array
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -1228,6 +1228,32 @@ def test_frame_any_with_timedelta(self):
         tm.assert_series_equal(result, expected)
 
 
+def test_sum_timedelta64_skipna_false():
+    # GH#17235
+    arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
+    arr[-1, -1] = "Nat"
+
+    df = pd.DataFrame(arr)
+
+    result = df.sum(skipna=False)
+    expected = pd.Series([pd.Timedelta(seconds=12), pd.NaT])
+    tm.assert_series_equal(result, expected)
+
+    result = df.sum(axis=0, skipna=False)
+    tm.assert_series_equal(result, expected)
+
+    result = df.sum(axis=1, skipna=False)
+    expected = pd.Series(
+        [
+            pd.Timedelta(seconds=1),
+            pd.Timedelta(seconds=5),
+            pd.Timedelta(seconds=9),
+            pd.NaT,
+        ]
+    )
+    tm.assert_series_equal(result, expected)
+
+
 def test_mixed_frame_with_integer_sum():
     # https://github.com/pandas-dev/pandas/issues/34520
     df = DataFrame([["a", 1]], columns=list("ab"))
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -1005,6 +1005,23 @@ def test_nanmean(self, tz):
             result = nanops.nanmean(obj)
             assert result == expected
 
+    @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
+    def test_nanmean_skipna_false(self, dtype):
+        arr = np.arange(12).astype(np.int64).view(dtype).reshape(4, 3)
+
+        arr[-1, -1] = "NaT"
+
+        result = nanops.nanmean(arr, skipna=False)
+        assert result is pd.NaT
+
+        result = nanops.nanmean(arr, axis=0, skipna=False)
+        expected = np.array([4, 5, "NaT"], dtype=arr.dtype)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = nanops.nanmean(arr, axis=1, skipna=False)
+        expected = np.array([arr[0, 1], arr[1, 1], arr[2, 1], arr[-1, -1]])
+        tm.assert_numpy_array_equal(result, expected)
+
 
 def test_use_bottleneck():