From 441502e637b538fdecf38948236e0dfd4941395d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 30 Nov 2019 15:13:48 -0800 Subject: [PATCH 1/5] ENH: support datetime64, datetime64tz in nanops.mean, nanops.median --- pandas/core/frame.py | 17 ++++++++++++++++- pandas/core/nanops.py | 11 ++++++----- pandas/tests/frame/test_analytics.py | 17 ++++++++++++----- pandas/tests/test_nanops.py | 1 - 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5dfa7002abfca..4cb31df8ce670 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7575,6 +7575,19 @@ def _count_level(self, level, axis=0, numeric_only=False): def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds ): + + dtype_is_dt = self.dtypes.apply(lambda x: x.kind == "M") + if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): + warnings.warn( + "DataFrame.mean and DataFrame.median with numeric_only=None " + "will include datetime64 and datetime64tz columns in a " + "future version.", + FutureWarning, + stacklevel=3, + ) + cols = self.columns[~dtype_is_dt] + self = self[cols] + if axis is None and filter_type == "bool": labels = None constructor = None @@ -7614,8 +7627,10 @@ def _get_data(axis_matters): # TODO: combine with hasattr(result, 'dtype') further down # hard since we don't have `values` down there. result = np.bool_(result) - except TypeError: + except (TypeError, ValueError): # e.g. in nanops trying to convert strs to float + # TODO: the ValueError is raised in trying to convert str + # to float, should we make that a TypError? # try by-column first if filter_type is None and axis == 0: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a2a40bbf93604..0b16c387e2462 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -29,7 +29,6 @@ is_timedelta64_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn") @@ -494,7 +493,6 @@ def nansum(values, axis=None, skipna=True, min_count=0, mask=None): return _wrap_results(the_sum, dtype) -@disallow("M8", DatetimeTZDtype) @bottleneck_switch() def nanmean(values, axis=None, skipna=True, mask=None): """ @@ -552,7 +550,6 @@ def nanmean(values, axis=None, skipna=True, mask=None): return _wrap_results(the_mean, dtype) -@disallow("M8") @bottleneck_switch() def nanmedian(values, axis=None, skipna=True, mask=None): """ @@ -585,8 +582,12 @@ def get_median(x): return np.nanmedian(x[mask]) values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask) - if not is_float_dtype(values): - values = values.astype("f8") + if not is_float_dtype(values.dtype): + try: + values = values.astype("f8") + except ValueError: + # e.g. "could not convert string to float: 'a'" + raise TypeError if mask is not None: values[mask] = np.nan diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 005ca8d95182e..16dfa4f144134 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -66,12 +66,15 @@ def assert_stat_op_calc( f = getattr(frame, opname) if check_dates: + expected_warning = FutureWarning if opname in ["mean", "median"] else None df = DataFrame({"b": date_range("1/1/2001", periods=2)}) - result = getattr(df, opname)() + with tm.assert_produces_warning(expected_warning): + result = getattr(df, opname)() assert isinstance(result, Series) df["a"] = range(len(df)) - result = getattr(df, opname)() + with tm.assert_produces_warning(expected_warning): + result = getattr(df, opname)() assert isinstance(result, Series) assert len(result) @@ -1062,7 +1065,8 @@ def test_nunique(self): def test_mean_mixed_datetime_numeric(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 df = pd.DataFrame({"A": [1, 1], "B": [pd.Timestamp("2000", tz=tz)] * 2}) - result = df.mean() + with tm.assert_produces_warning(FutureWarning): + result = df.mean() expected = pd.Series([1.0], index=["A"]) tm.assert_series_equal(result, expected) @@ -1072,7 +1076,8 @@ def test_mean_excludeds_datetimes(self, tz): # Our long-term desired behavior is unclear, but the behavior in # 0.24.0rc1 was buggy. df = pd.DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2}) - result = df.mean() + with tm.assert_produces_warning(FutureWarning): + result = df.mean() expected = pd.Series() tm.assert_series_equal(result, expected) @@ -1458,7 +1463,9 @@ def test_mean_datetimelike(self): expected = pd.Series({"A": 1.0}) tm.assert_series_equal(result, expected) - result = df.mean() + with tm.assert_produces_warning(FutureWarning): + # in the future datetime columns will be included + result = df.mean() expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index e5d963a307502..dd225949fdc57 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -985,7 +985,6 @@ def prng(self): class TestDatetime64NaNOps: @pytest.mark.parametrize("tz", [None, "UTC"]) - @pytest.mark.xfail(reason="disabled") # Enabling mean changes the behavior of DataFrame.mean # See https://github.com/pandas-dev/pandas/issues/24752 def test_nanmean(self, tz): From cb77a0d8f9e55d31a98d890e9d30f8b385d3c484 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Dec 2019 16:46:35 -0800 Subject: [PATCH 2/5] re-raise as typeError --- pandas/core/frame.py | 4 +--- pandas/core/nanops.py | 6 +++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a667901111fc6..0c1536185d76a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7609,10 +7609,8 @@ def _get_data(axis_matters): # TODO: combine with hasattr(result, 'dtype') further down # hard since we don't have `values` down there. result = np.bool_(result) - except (TypeError, ValueError): + except TypeError: # e.g. in nanops trying to convert strs to float - # TODO: the ValueError is raised in trying to convert str - # to float, should we make that a TypError? # try by-column first if filter_type is None and axis == 0: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 593ede8daaf0a..facf8734ae924 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1289,7 +1289,11 @@ def _ensure_numeric(x): try: x = x.astype(np.complex128) except (TypeError, ValueError): - x = x.astype(np.float64) + try: + x = x.astype(np.float64) + except ValueError: + # GH#29941 we get here with object arrays containing strs + raise TypeError(f"Could not convert {x} to numeric") else: if not np.any(np.imag(x)): x = x.real From 406de1ba93ebb2c8f97893754230dc590bcc93fe Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Dec 2019 17:30:45 -0800 Subject: [PATCH 3/5] update test --- pandas/tests/test_nanops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 324c1a8aed2c8..575c351026b4a 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -742,8 +742,8 @@ def test_ndarray(self): # Test non-convertible string ndarray s_values = np.array(["foo", "bar", "baz"], dtype=object) - msg = r"could not convert string to float: '(foo|baz)'" - with pytest.raises(ValueError, match=msg): + msg = r"Could not convert .* to numeric" + with pytest.raises(TypeError, match=msg): nanops._ensure_numeric(s_values) def test_convertable_values(self): From 1eafb5ebea7fc4d2517cc81004767eceb7664f91 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 26 Jan 2020 10:57:56 -0800 Subject: [PATCH 4/5] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 920919755dc23..be8228fe90b06 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -68,7 +68,7 @@ Backwards incompatible API changes Deprecations ~~~~~~~~~~~~ - +- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`) - - From 385ae34699174fc7b54ebe7545d381876cc53573 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 27 Jan 2020 15:49:48 -0800 Subject: [PATCH 5/5] compat for CI fails --- pandas/core/frame.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 834bff6d2f314..3df646206821a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7973,9 +7973,15 @@ def _get_data(axis_matters): out_dtype = "bool" if filter_type == "bool" else None + def blk_func(values): + if values.ndim == 1 and not isinstance(values, np.ndarray): + # we can't pass axis=1 + return op(values, axis=0, skipna=skipna, **kwds) + return op(values, axis=1, skipna=skipna, **kwds) + # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager._reduce - res = df._data.reduce(op, axis=1, skipna=skipna, **kwds) + res = df._data.reduce(blk_func) assert isinstance(res, dict) if len(res): assert len(res) == max(list(res.keys())) + 1, res.keys()