From bf23e99c36f37bcd0cabb923a3632f1737f217d6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 20 Mar 2020 10:51:21 -0700 Subject: [PATCH 1/6] REF: DF._reduce do frame_apply early --- pandas/core/frame.py | 70 ++++++++++++++++------------ pandas/tests/frame/test_analytics.py | 2 +- pandas/tests/frame/test_missing.py | 2 +- 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cd5d81bc70dd9..b8a2842c80af6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7856,54 +7856,66 @@ def blk_func(values): out.index = df.columns return out + if not self._is_homogeneous_type: + # try to avoid self.values call + + # try by-column first + if filter_type is None and axis == 0 and len(self) > 0: + # numeric_only must be None here, as other cases caught above + # require len(self) > 0 bc frame_apply messes up empty prod/sum + + # this can end up with a non-reduction + # but not always. if the types are mixed + # with datelike then need to make sure a series + + # we only end up here if we have not specified + # numeric_only and yet we have tried a + # column-by-column reduction, where we have mixed type. + # So let's just do what we can + from pandas.core.apply import frame_apply + + opa = frame_apply( + self, func=f, result_type="expand", ignore_failures=True + ) + result = opa.get_result() + if result.ndim == self.ndim: + result = result.iloc[0].rename(None) + return result + + data = self if numeric_only is None: + values = self.values try: result = f(values) - if filter_type == "bool" and is_object_dtype(values) and axis is None: - # work around https://github.com/numpy/numpy/issues/10489 - # TODO: combine with hasattr(result, 'dtype') further down - # hard since we don't have `values` down there. - result = np.bool_(result) except TypeError: # e.g. in nanops trying to convert strs to float - # try by-column first - if filter_type is None and axis == 0: - # this can end up with a non-reduction - # but not always. if the types are mixed - # with datelike then need to make sure a series - - # we only end up here if we have not specified - # numeric_only and yet we have tried a - # column-by-column reduction, where we have mixed type. - # So let's just do what we can - from pandas.core.apply import frame_apply - - opa = frame_apply( - self, func=f, result_type="expand", ignore_failures=True - ) - result = opa.get_result() - if result.ndim == self.ndim: - result = result.iloc[0] - return result - # TODO: why doesnt axis matter here? data = _get_data(axis_matters=False) - with np.errstate(all="ignore"): - result = f(data.values) labels = data._get_agg_axis(axis) + + values = data.values + with np.errstate(all="ignore"): + result = f(values) + else: if numeric_only: data = _get_data(axis_matters=True) + labels = data._get_agg_axis(axis) values = data.values - labels = data._get_agg_axis(axis) else: values = self.values result = f(values) + if filter_type == "bool" and is_object_dtype(values.dtype) and axis is None: + # work around https://github.com/numpy/numpy/issues/10489 + # TODO: combine with hasattr(result, 'dtype') further down + # hard since we don't have `values` down there. + result = np.bool_(result) + if hasattr(result, "dtype") and is_object_dtype(result.dtype): try: if filter_type is None or filter_type == "numeric": @@ -7914,7 +7926,7 @@ def blk_func(values): # try to coerce to the original dtypes item by item if we can if axis == 0: - result = coerce_to_dtypes(result, self.dtypes) + result = coerce_to_dtypes(result, data.dtypes) if constructor is not None: result = self._constructor_sliced(result, index=labels) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 07e30d41c216d..3a0f1b2a2ec17 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -344,7 +344,7 @@ def kurt(x): "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum ) assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True) - assert_stat_op_calc("product", np.prod, float_frame_with_na) + assert_stat_op_calc("product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod) assert_stat_op_calc("mad", mad, float_frame_with_na) assert_stat_op_calc("var", var, float_frame_with_na) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 196df8ba00476..6cd9c0a7fde88 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -372,7 +372,7 @@ def test_fillna_categorical_nan(self): cat = Categorical([np.nan, 2, np.nan]) val = Categorical([np.nan, np.nan, np.nan]) df = DataFrame({"cats": cat, "vals": val}) - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(None): res = df.fillna(df.median()) v_exp = [np.nan, np.nan, np.nan] df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") From d85805fdb9c06e266dd29d92a22b6a940345e792 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 20 Mar 2020 10:51:30 -0700 Subject: [PATCH 2/6] REF: DF._reduce do frame_apply early --- pandas/tests/frame/test_analytics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 3a0f1b2a2ec17..c83e2dda20d0a 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -344,7 +344,9 @@ def kurt(x): "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum ) assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True) - assert_stat_op_calc("product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod) + assert_stat_op_calc( + "product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod + ) assert_stat_op_calc("mad", mad, float_frame_with_na) assert_stat_op_calc("var", var, float_frame_with_na) From f9235fca115772e2ace24cd621532e4a1a815a84 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 24 Mar 2020 13:34:08 -0700 Subject: [PATCH 3/6] move fillna call outside of context --- pandas/tests/frame/test_missing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 749f4080582d3..10e0b52fb6221 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -373,7 +373,8 @@ def test_fillna_categorical_nan(self): val = Categorical([np.nan, np.nan, np.nan]) df = DataFrame({"cats": cat, "vals": val}) with tm.assert_produces_warning(None): - res = df.fillna(df.median()) + median = df.median() + res = df.fillna(median) v_exp = [np.nan, np.nan, np.nan] df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") tm.assert_frame_equal(res, df_exp) From c22dd07097cacfa7a2f4217e1a2796dc27a7c825 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 30 Mar 2020 11:08:39 -0700 Subject: [PATCH 4/6] debugging assertion --- pandas/tests/frame/test_missing.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 10e0b52fb6221..4e4ab25031afc 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -374,6 +374,11 @@ def test_fillna_categorical_nan(self): df = DataFrame({"cats": cat, "vals": val}) with tm.assert_produces_warning(None): median = df.median() + + # GH#32950 check that we got the right expected median + exmed = Series({"cats": 2.0, "vals": np.nan}) + tm.assert_series_equal(median, exmed) + res = df.fillna(median) v_exp = [np.nan, np.nan, np.nan] df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") From 2f8fc79db508c816cc6bfdf19f5b3cfdc838880a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 30 Mar 2020 12:09:18 -0700 Subject: [PATCH 5/6] Troubleshoot CI --- pandas/core/frame.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7c143dcf4b222..cf974a2fedab4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -77,6 +77,7 @@ ensure_platform_int, infer_dtype_from_object, is_bool_dtype, + is_categorical_dtype, is_dataclass, is_datetime64_any_dtype, is_dict_like, @@ -7998,8 +7999,15 @@ def blk_func(values): if not self._is_homogeneous_type: # try to avoid self.values call - # try by-column first - if filter_type is None and axis == 0 and len(self) > 0: + if self.dtypes.apply(is_categorical_dtype).any(): + # GH#32950 Fall through to operating on self.values, since + # operating column-wise will fail on Categorical.median + # (TODO: only on some builds, not clear why) + pass + + elif filter_type is None and axis == 0 and len(self) > 0: + # operate column-wise + # numeric_only must be None here, as other cases caught above # require len(self) > 0 bc frame_apply messes up empty prod/sum From 28fd5607ecf29b8317bc9d899364d4eb76e36173 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 30 Mar 2020 13:11:32 -0700 Subject: [PATCH 6/6] update test --- pandas/core/frame.py | 9 +-------- pandas/tests/frame/test_missing.py | 8 +++----- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cf974a2fedab4..58494d2fcaa5f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -77,7 +77,6 @@ ensure_platform_int, infer_dtype_from_object, is_bool_dtype, - is_categorical_dtype, is_dataclass, is_datetime64_any_dtype, is_dict_like, @@ -7999,13 +7998,7 @@ def blk_func(values): if not self._is_homogeneous_type: # try to avoid self.values call - if self.dtypes.apply(is_categorical_dtype).any(): - # GH#32950 Fall through to operating on self.values, since - # operating column-wise will fail on Categorical.median - # (TODO: only on some builds, not clear why) - pass - - elif filter_type is None and axis == 0 and len(self) > 0: + if filter_type is None and axis == 0 and len(self) > 0: # operate column-wise # numeric_only must be None here, as other cases caught above diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 4e4ab25031afc..7cb7115276f71 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -372,12 +372,10 @@ def test_fillna_categorical_nan(self): cat = Categorical([np.nan, 2, np.nan]) val = Categorical([np.nan, np.nan, np.nan]) df = DataFrame({"cats": cat, "vals": val}) - with tm.assert_produces_warning(None): - median = df.median() - # GH#32950 check that we got the right expected median - exmed = Series({"cats": 2.0, "vals": np.nan}) - tm.assert_series_equal(median, exmed) + # GH#32950 df.median() is poorly behaved because there is no + # Categorical.median + median = Series({"cats": 2.0, "vals": np.nan}) res = df.fillna(median) v_exp = [np.nan, np.nan, np.nan]