ENH: support datetime64, datetime64tz in nanops.mean, nanops.median (pandas-dev#29941)

jbrockmendel · web-flow · commit a4d743ed9d52 · 2020-02-12T08:19:12.000-08:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -74,6 +74,7 @@ Backwards incompatible API changes
 Deprecations
 ~~~~~~~~~~~~
 - Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version.  Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`)
+- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`)
 -
 -
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7957,6 +7957,19 @@ def _count_level(self, level, axis=0, numeric_only=False):
     def _reduce(
         self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
     ):
+
+        dtype_is_dt = self.dtypes.apply(lambda x: x.kind == "M")
+        if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any():
+            warnings.warn(
+                "DataFrame.mean and DataFrame.median with numeric_only=None "
+                "will include datetime64 and datetime64tz columns in a "
+                "future version.",
+                FutureWarning,
+                stacklevel=3,
+            )
+            cols = self.columns[~dtype_is_dt]
+            self = self[cols]
+
         if axis is None and filter_type == "bool":
             labels = None
             constructor = None
@@ -7996,9 +8009,15 @@ def _get_data(axis_matters):
 
             out_dtype = "bool" if filter_type == "bool" else None
 
+            def blk_func(values):
+                if values.ndim == 1 and not isinstance(values, np.ndarray):
+                    # we can't pass axis=1
+                    return op(values, axis=0, skipna=skipna, **kwds)
+                return op(values, axis=1, skipna=skipna, **kwds)
+
             # After possibly _get_data and transposing, we are now in the
             #  simple case where we can use BlockManager._reduce
-            res = df._data.reduce(op, axis=1, skipna=skipna, **kwds)
+            res = df._data.reduce(blk_func)
             assert isinstance(res, dict)
             if len(res):
                 assert len(res) == max(list(res.keys())) + 1, res.keys()
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -30,7 +30,6 @@
     is_timedelta64_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna
 
 bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn")
@@ -516,7 +515,6 @@ def nansum(
     return _wrap_results(the_sum, dtype)
 
 
-@disallow("M8", DatetimeTZDtype)
 @bottleneck_switch()
 def nanmean(values, axis=None, skipna=True, mask=None):
     """
@@ -574,7 +572,6 @@ def nanmean(values, axis=None, skipna=True, mask=None):
     return _wrap_results(the_mean, dtype)
 
 
-@disallow("M8")
 @bottleneck_switch()
 def nanmedian(values, axis=None, skipna=True, mask=None):
     """
@@ -607,8 +604,12 @@ def get_median(x):
         return np.nanmedian(x[mask])
 
     values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask)
-    if not is_float_dtype(values):
-        values = values.astype("f8")
+    if not is_float_dtype(values.dtype):
+        try:
+            values = values.astype("f8")
+        except ValueError:
+            # e.g. "could not convert string to float: 'a'"
+            raise TypeError
         if mask is not None:
             values[mask] = np.nan
 
@@ -1355,7 +1356,11 @@ def _ensure_numeric(x):
             try:
                 x = x.astype(np.complex128)
             except (TypeError, ValueError):
-                x = x.astype(np.float64)
+                try:
+                    x = x.astype(np.float64)
+                except ValueError:
+                    # GH#29941 we get here with object arrays containing strs
+                    raise TypeError(f"Could not convert {x} to numeric")
             else:
                 if not np.any(np.imag(x)):
                     x = x.real
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -63,12 +63,15 @@ def assert_stat_op_calc(
     f = getattr(frame, opname)
 
     if check_dates:
+        expected_warning = FutureWarning if opname in ["mean", "median"] else None
         df = DataFrame({"b": date_range("1/1/2001", periods=2)})
-        result = getattr(df, opname)()
+        with tm.assert_produces_warning(expected_warning):
+            result = getattr(df, opname)()
         assert isinstance(result, Series)
 
         df["a"] = range(len(df))
-        result = getattr(df, opname)()
+        with tm.assert_produces_warning(expected_warning):
+            result = getattr(df, opname)()
         assert isinstance(result, Series)
         assert len(result)
 
@@ -457,7 +460,8 @@ def test_nunique(self):
     def test_mean_mixed_datetime_numeric(self, tz):
         # https://github.com/pandas-dev/pandas/issues/24752
         df = pd.DataFrame({"A": [1, 1], "B": [pd.Timestamp("2000", tz=tz)] * 2})
-        result = df.mean()
+        with tm.assert_produces_warning(FutureWarning):
+            result = df.mean()
         expected = pd.Series([1.0], index=["A"])
         tm.assert_series_equal(result, expected)
 
@@ -467,7 +471,9 @@ def test_mean_excludes_datetimes(self, tz):
         # Our long-term desired behavior is unclear, but the behavior in
         # 0.24.0rc1 was buggy.
         df = pd.DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2})
-        result = df.mean()
+        with tm.assert_produces_warning(FutureWarning):
+            result = df.mean()
+
         expected = pd.Series(dtype=np.float64)
         tm.assert_series_equal(result, expected)
 
@@ -863,7 +869,9 @@ def test_mean_datetimelike(self):
         expected = pd.Series({"A": 1.0})
         tm.assert_series_equal(result, expected)
 
-        result = df.mean()
+        with tm.assert_produces_warning(FutureWarning):
+            # in the future datetime columns will be included
+            result = df.mean()
         expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]})
         tm.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -750,8 +750,8 @@ def test_ndarray(self):
 
         # Test non-convertible string ndarray
         s_values = np.array(["foo", "bar", "baz"], dtype=object)
-        msg = r"could not convert string to float: '(foo|baz)'"
-        with pytest.raises(ValueError, match=msg):
+        msg = r"Could not convert .* to numeric"
+        with pytest.raises(TypeError, match=msg):
             nanops._ensure_numeric(s_values)
 
     def test_convertable_values(self):
@@ -993,7 +993,6 @@ def prng(self):
 
 class TestDatetime64NaNOps:
     @pytest.mark.parametrize("tz", [None, "UTC"])
-    @pytest.mark.xfail(reason="disabled")
     # Enabling mean changes the behavior of DataFrame.mean
     # See https://github.com/pandas-dev/pandas/issues/24752
     def test_nanmean(self, tz):

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -74,6 +74,7 @@ Backwards incompatible API changes` |
| 74 | 74 | `Deprecations` |
| 75 | 75 | `~~~~~~~~~~~~` |
| 76 | 76 | - Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`) |
|  | 77 | +- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`) |
| 77 | 78 | `-` |
| 78 | 79 | `-` |
| 79 | 80 |  |