From 935855242bd61b5175d0a07fab0c2ef668f2290d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 12 Nov 2020 18:33:00 -0800 Subject: [PATCH 1/9] BUG: DataFrame reductions consistent with Series --- pandas/core/frame.py | 8 +++++++- pandas/core/internals/blocks.py | 29 +++++++++++++++++++++------ pandas/tests/frame/test_reductions.py | 11 ++++++---- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9ce5ef2fc3cfe..0f543dc397da8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8752,6 +8752,8 @@ def func(values): def blk_func(values): if isinstance(values, ExtensionArray): + if values.ndim == 2: + return values._reduce(name, skipna=skipna, axis=1, **kwds) return values._reduce(name, skipna=skipna, **kwds) else: return op(values, axis=1, skipna=skipna, **kwds) @@ -8765,7 +8767,7 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - if numeric_only is not None: + if numeric_only is not None or axis == 0: # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object @@ -8790,6 +8792,10 @@ def _get_data() -> DataFrame: # GH#35865 careful to cast explicitly to object nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)]) out[:] = np.array(nvs, dtype=object) + if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: + # Even if we are object dtype, follow numpy and return + # float64, see test_apply_funcs_over_empty + out = out.astype(np.float64) return out assert numeric_only is None diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 13e428c0419ec..ac8b2a51a6f08 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -375,7 +375,11 @@ def reduce(self, func, ignore_failures: bool = False) -> List["Block"]: assert self.ndim == 2 try: - result = func(self.values) + if self.is_datetimetz: + # FIXME: kludge + result = func(self.values.reshape(self.shape)) + else: + result = func(self.values) except (TypeError, NotImplementedError): if ignore_failures: return [] @@ -384,6 +388,8 @@ def reduce(self, func, ignore_failures: bool = False) -> List["Block"]: if np.ndim(result) == 0: # TODO(EA2D): special case not needed with 2D EAs res_values = np.array([[result]]) + elif self.is_datetimetz: + res_values = result # FIXME: kludge else: res_values = result.reshape(-1, 1) @@ -450,7 +456,9 @@ def f(mask, val, idx): return self.split_and_operate(None, f, inplace) - def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: + def split_and_operate( + self, mask, f, inplace: bool, ignore_failures: bool = False + ) -> List["Block"]: """ split the block per-column, and apply the callable f per-column, return a new block for each. Handle @@ -460,7 +468,8 @@ def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: ---------- mask : 2-d boolean mask f : callable accepting (1d-mask, 1d values, indexer) - inplace : boolean + inplace : bool + ignore_failures : bool, default False Returns ------- @@ -499,8 +508,14 @@ def make_a_block(nv, ref_loc): v = new_values[i] # need a new block - if m.any(): - nv = f(m, v, i) + if m.any() or m.size == 0: + try: + nv = f(m, v, i) + except TypeError: + if ignore_failures: + continue + else: + raise else: nv = v if inplace else v.copy() @@ -2461,7 +2476,9 @@ def mask_func(mask, values, inplace): values = values.reshape(1, -1) return func(values) - return self.split_and_operate(None, mask_func, False) + return self.split_and_operate( + None, mask_func, False, ignore_failures=ignore_failures + ) try: res = func(values) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 374d185f45844..a018da429687f 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -12,6 +12,7 @@ from pandas import ( Categorical, DataFrame, + Index, MultiIndex, Series, Timestamp, @@ -1083,10 +1084,12 @@ def test_any_all_bool_only(self): pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True), pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True), pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True), - (np.all, {"A": Series([0, 1], dtype="category")}, False), - (np.any, {"A": Series([0, 1], dtype="category")}, True), + # np.all on Categorical raises, so the reduction drops the + # column, so all is being done on an empty Series, so is True + (np.all, {"A": Series([0, 1], dtype="category")}, True), + (np.any, {"A": Series([0, 1], dtype="category")}, False), (np.all, {"A": Series([1, 2], dtype="category")}, True), - (np.any, {"A": Series([1, 2], dtype="category")}, True), + (np.any, {"A": Series([1, 2], dtype="category")}, False), # Mix GH#21484 pytest.param( np.all, @@ -1321,6 +1324,6 @@ def test_minmax_extensionarray(method, numeric_only): df = DataFrame({"Int64": ser}) result = getattr(df, method)(numeric_only=numeric_only) expected = Series( - [getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object") + [getattr(int64_info, method)], index=Index(["Int64"], dtype="object") ) tm.assert_series_equal(result, expected) From 4a34b4af2972ef3836f01caa2a9c7918e5d9b1d9 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 13 Nov 2020 14:39:02 -0800 Subject: [PATCH 2/9] Tests --- pandas/core/frame.py | 2 - pandas/core/internals/blocks.py | 8 +-- pandas/tests/frame/test_reductions.py | 88 +++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d8f63d4a1d58d..202bc885a6c62 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8752,8 +8752,6 @@ def func(values): def blk_func(values): if isinstance(values, ExtensionArray): - if values.ndim == 2: - return values._reduce(name, skipna=skipna, axis=1, **kwds) return values._reduce(name, skipna=skipna, **kwds) else: return op(values, axis=1, skipna=skipna, **kwds) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d816aaaed8184..c539ce63d444e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -375,11 +375,7 @@ def reduce(self, func, ignore_failures: bool = False) -> List["Block"]: assert self.ndim == 2 try: - if self.is_datetimetz: - # FIXME: kludge - result = func(self.values.reshape(self.shape)) - else: - result = func(self.values) + result = func(self.values) except (TypeError, NotImplementedError): if ignore_failures: return [] @@ -388,8 +384,6 @@ def reduce(self, func, ignore_failures: bool = False) -> List["Block"]: if np.ndim(result) == 0: # TODO(EA2D): special case not needed with 2D EAs res_values = np.array([[result]]) - elif self.is_datetimetz: - res_values = result # FIXME: kludge else: res_values = result.reshape(-1, 1) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index a018da429687f..e83d88570d4e5 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1279,6 +1279,94 @@ def test_frame_any_with_timedelta(self): expected = Series(data=[False, True]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("method", ["any", "all"]) + def test_any_all_categorical_dtype_nuisance_column(self, method): + # GH#36076 DataFrame should match Series behavior + ser = Series([0, 1], dtype="category", name="A") + df = ser.to_frame() + + # Double-check the Series behavior is to raise + with pytest.raises(TypeError, match="does not implement reduction"): + getattr(ser, method)() + with pytest.raises(TypeError, match="does not implement reduction"): + getattr(np, method)(ser) + + with pytest.raises(TypeError, match="does not implement reduction"): + getattr(df, method)(bool_only=False) + + # With bool_only=None, operating on this column raises and is ignored, + # so we expect an empty result. + result = getattr(df, method)(bool_only=None) + expected = Series([], index=Index([]), dtype=bool) + tm.assert_series_equal(result, expected) + + result = getattr(np, method)(df, axis=0) + tm.assert_series_equal(result, expected) + + def test_median_categorical_dtype_nuisance_column(self): + # GH#21020 DataFrame.median should match Series.median + df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])}) + ser = df["A"] + + # Double-check the Series behavior is to raise + with pytest.raises(TypeError, match="does not implement reduction"): + ser.median() + + with pytest.raises(TypeError, match="does not implement reduction"): + df.median(numeric_only=False) + + result = df.median() + expected = Series([], index=Index([]), dtype=np.float64) + tm.assert_series_equal(result, expected) + + # same thing, but with an additional non-categorical column + df["B"] = df["A"].astype(int) + + with pytest.raises(TypeError, match="does not implement reduction"): + df.median(numeric_only=False) + + result = df.median() + expected = Series([2.0], index=["B"]) + tm.assert_series_equal(result, expected) + + # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead + # of expected.values + + @pytest.mark.parametrize("method", ["min", "max"]) + def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): + # GH#28948 DataFrame.min should behave like Series.min + cat = Categorical(["a", "b", "c", "b"], ordered=False) + ser = Series(cat) + df = ser.to_frame("A") + + # Double-check the Series behavior + with pytest.raises(TypeError, match="is not ordered for operation"): + getattr(ser, method)() + with pytest.raises(TypeError, match="is not ordered for operation"): + getattr(np, method)(ser) + + with pytest.raises(TypeError, match="is not ordered for operation"): + getattr(df, method)(numeric_only=False) + + result = getattr(df, method)() + expected = Series([], index=Index([]), dtype=np.float64) + tm.assert_series_equal(result, expected) + + result = getattr(np, method)(df) + tm.assert_series_equal(result, expected) + + # same thing, but with an additional non-categorical column + df["B"] = df["A"].astype(object) + result = getattr(df, method)() + if method == "min": + expected = Series(["a"], index=["B"]) + else: + expected = Series(["c"], index=["B"]) + tm.assert_series_equal(result, expected) + + result = getattr(np, method)(df) + tm.assert_series_equal(result, expected) + def test_sum_timedelta64_skipna_false(): # GH#17235 From 2174ef634a07cd9ad00e4e93c153f1bc3d2e13d3 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 13 Nov 2020 15:00:56 -0800 Subject: [PATCH 3/9] CLN: remove no-longer-reachable --- pandas/core/frame.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 202bc885a6c62..4310d23e4d7f3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8798,32 +8798,6 @@ def _get_data() -> DataFrame: assert numeric_only is None - if not self._is_homogeneous_type or self._mgr.any_extension_types: - # try to avoid self.values call - - if filter_type is None and axis == 0: - # operate column-wise - - # numeric_only must be None here, as other cases caught above - - # this can end up with a non-reduction - # but not always. if the types are mixed - # with datelike then need to make sure a series - - # we only end up here if we have not specified - # numeric_only and yet we have tried a - # column-by-column reduction, where we have mixed type. - # So let's just do what we can - from pandas.core.apply import frame_apply - - opa = frame_apply( - self, func=func, result_type="expand", ignore_failures=True - ) - result = opa.get_result() - if result.ndim == self.ndim: - result = result.iloc[0].rename(None) - return result - data = self values = data.values From c8cd631ad37b00300953577d4112ce6935640638 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 13 Nov 2020 16:21:58 -0800 Subject: [PATCH 4/9] TST: splitting of object-dtype blocks --- pandas/tests/frame/test_reductions.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e83d88570d4e5..142833493f6b5 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1279,6 +1279,8 @@ def test_frame_any_with_timedelta(self): expected = Series(data=[False, True]) tm.assert_series_equal(result, expected) + +class TestNuisanceColumns: @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_categorical_dtype_nuisance_column(self, method): # GH#36076 DataFrame should match Series behavior @@ -1367,6 +1369,21 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): result = getattr(np, method)(df) tm.assert_series_equal(result, expected) + def test_reduction_object_block_splits_nuisance_columns(self): + df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) + + # We should only exclude "B", not "A" + result = df.mean() + expected = Series([1.0], index=["A"]) + tm.assert_series_equal(result, expected) + + # Same behavior but heterogeneous dtype + df["C"] = df["A"].astype(int) + 4 + + result = df.mean() + expected = Series([1.0, 5.0], index=["A", "C"]) + tm.assert_series_equal(result, expected) + def test_sum_timedelta64_skipna_false(): # GH#17235 From 034a1249534c50346309eb0794bd6179f17fd113 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 13 Nov 2020 18:54:31 -0800 Subject: [PATCH 5/9] whatsnew --- doc/source/whatsnew/v1.2.0.rst | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e623b76fed0a9..dafbaa1e576b6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -284,6 +284,58 @@ of columns could result in a larger :class:`Series` result. See (:issue:`37799`) In [6]: df[["B", "C"]].all(bool_only=True) +Other :class:`DataFrame` reductions with ``numeric_only=None`` will also avoid +this pathological behavior: + +.. ipython:: python + + df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) + + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.mean() + Out[3]: Series([], dtype: float64) + + In [4]: df[["A"]].mean() + Out[4]: + A 1.0 + dtype: float64 + +*New behavior*: + + In [3]: df.mean() + + In [4]: df[["A"]].mean() + +Moreover, :class:`DataFrame` reductions with ``numeric_only=None`` will now be +consistent with their :class:`Series` counterparts. In particular, for +reductions where the :class:`Series` method raises ``TypeError``, the +:class:`DataFrame` reduction will now consider that column non-numeric +instead of casting to NumPy which may have different semantics. + +.. ipython:: python + + ser = pd.Series([0, 1], dtype="category", name="A") + df = ser.to_frame() + + +*Previous behavior*: + +.. code-block:: ipython + + In [5]: df.any() + Out[5]: + A True + dtype: bool + +*New behavior*: + + In [5]: df.any() + + .. _whatsnew_120.api_breaking.python: Increased minimum version for Python From 61fd8d99aa06dc86fe953424413b6162eaa24a21 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 13 Nov 2020 18:56:13 -0800 Subject: [PATCH 6/9] GH refs --- doc/source/whatsnew/v1.2.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index dafbaa1e576b6..f9eef74df216e 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -285,7 +285,7 @@ of columns could result in a larger :class:`Series` result. See (:issue:`37799`) Other :class:`DataFrame` reductions with ``numeric_only=None`` will also avoid -this pathological behavior: +this pathological behavior (:issue:`??`): .. ipython:: python @@ -314,7 +314,7 @@ Moreover, :class:`DataFrame` reductions with ``numeric_only=None`` will now be consistent with their :class:`Series` counterparts. In particular, for reductions where the :class:`Series` method raises ``TypeError``, the :class:`DataFrame` reduction will now consider that column non-numeric -instead of casting to NumPy which may have different semantics. +instead of casting to NumPy which may have different semantics (:issue:`36076`). .. ipython:: python From 2132ea090229160c9f50e184692dc217e1729230 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 13 Nov 2020 18:58:55 -0800 Subject: [PATCH 7/9] update GH refs --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/tests/frame/test_reductions.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f9eef74df216e..2a95f66697dc5 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -285,7 +285,7 @@ of columns could result in a larger :class:`Series` result. See (:issue:`37799`) Other :class:`DataFrame` reductions with ``numeric_only=None`` will also avoid -this pathological behavior (:issue:`??`): +this pathological behavior (:issue:`37827`): .. ipython:: python diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e74f80ef7649a..7d398c6a0906b 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1401,6 +1401,7 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): tm.assert_series_equal(result, expected) def test_reduction_object_block_splits_nuisance_columns(self): + # GH#37827 df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) # We should only exclude "B", not "A" From 470f7d2edd4963fa3348120fbc6af2e80bd6d176 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 14 Nov 2020 10:17:31 -0800 Subject: [PATCH 8/9] fix GH ref, comments --- doc/source/whatsnew/v1.2.0.rst | 13 +++++++++---- pandas/core/internals/blocks.py | 2 ++ pandas/tests/frame/test_reductions.py | 4 +++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2a95f66697dc5..4f55e84a7da67 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -306,15 +306,18 @@ this pathological behavior (:issue:`37827`): *New behavior*: - In [3]: df.mean() +.. code-block:: ipython - In [4]: df[["A"]].mean() + df.mean() + + df[["A"]].mean() Moreover, :class:`DataFrame` reductions with ``numeric_only=None`` will now be consistent with their :class:`Series` counterparts. In particular, for reductions where the :class:`Series` method raises ``TypeError``, the :class:`DataFrame` reduction will now consider that column non-numeric -instead of casting to NumPy which may have different semantics (:issue:`36076`). +instead of casting to NumPy which may have different semantics (:issue:`36076`, +:issue:`28949`, :issue:`21020`). .. ipython:: python @@ -333,7 +336,9 @@ instead of casting to NumPy which may have different semantics (:issue:`36076`). *New behavior*: - In [5]: df.any() +.. code-block:: ipython + + df.any() .. _whatsnew_120.api_breaking.python: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 166c74e543a30..967e218078a28 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -517,6 +517,8 @@ def make_a_block(nv, ref_loc): # need a new block if m.any() or m.size == 0: + # Apply our function; we may ignore_failures if this is a + # reduction that is dropping nuisance columns GH#37827 try: nv = f(m, v, i) except TypeError: diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 7d398c6a0906b..299f00e818105 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1321,6 +1321,7 @@ def test_any_all_categorical_dtype_nuisance_column(self, method): # Double-check the Series behavior is to raise with pytest.raises(TypeError, match="does not implement reduction"): getattr(ser, method)() + with pytest.raises(TypeError, match="does not implement reduction"): getattr(np, method)(ser) @@ -1367,7 +1368,7 @@ def test_median_categorical_dtype_nuisance_column(self): @pytest.mark.parametrize("method", ["min", "max"]) def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): - # GH#28948 DataFrame.min should behave like Series.min + # GH#28949 DataFrame.min should behave like Series.min cat = Categorical(["a", "b", "c", "b"], ordered=False) ser = Series(cat) df = ser.to_frame("A") @@ -1375,6 +1376,7 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): # Double-check the Series behavior with pytest.raises(TypeError, match="is not ordered for operation"): getattr(ser, method)() + with pytest.raises(TypeError, match="is not ordered for operation"): getattr(np, method)(ser) From 15381ce9e195c57f031e4ecb9e604f60f218bc88 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 14 Nov 2020 10:32:41 -0800 Subject: [PATCH 9/9] code block better --- doc/source/whatsnew/v1.2.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8c9a29e2644d1..d848413d7193c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -306,7 +306,7 @@ this pathological behavior (:issue:`37827`): *New behavior*: -.. code-block:: ipython +.. ipython:: python df.mean() @@ -336,7 +336,7 @@ instead of casting to NumPy which may have different semantics (:issue:`36076`, *New behavior*: -.. code-block:: ipython +.. ipython:: python df.any()