From d1d07ffe81b1a007f04f5b2fa2ce245b8d9f5994 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 24 Nov 2019 18:38:15 -0800 Subject: [PATCH 1/5] REF: implement DataFrame reductions blockwise --- pandas/core/frame.py | 65 +++++++++++++++++++--------- pandas/core/internals/managers.py | 44 +++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 2 +- 3 files changed, 89 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 46b213b25df49..ad3cda6d96e5c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7606,6 +7606,46 @@ def _reduce( def f(x): return op(x, axis=axis, skipna=skipna, **kwds) + def _get_data(axis_matters): + if filter_type is None or filter_type == "numeric": + data = self._get_numeric_data() + elif filter_type == "bool": + if axis_matters: + # GH#25101, GH#24434 + data = self._get_bool_data() if axis == 0 else self + else: + data = self._get_bool_data() + else: # pragma: no cover + msg = ( + "Generating numeric_only data with filter_type {f}" + "not supported.".format(f=filter_type) + ) + raise NotImplementedError(msg) + return data + + if self.size == 0: + pass + + elif numeric_only is False: + res = self._data.reduce(op) + assert isinstance(res, dict) + assert len(res) == max(list(res.keys())) + 1, res.keys() + out = self._constructor_sliced(res, index=range(len(res))) + out.index = self.columns + return out + + elif numeric_only is True and axis == 0: + data = _get_data(axis_matters=True) + return data._reduce( + op, + name, + axis=axis, + skipna=skipna, + numeric_only=False, + filter_type=filter_type, + **kwds, + ) + if numeric_only is None: values = self.values try: @@ -7616,7 +7656,7 @@ def f(x): # TODO: combine with hasattr(result, 'dtype') further down # hard since we don't have `values` down there. result = np.bool_(result) - except TypeError as err: + except TypeError: # e.g. 
in nanops trying to convert strs to float # try by-column first @@ -7639,31 +7679,14 @@ def f(x): result = result.iloc[0] return result - if filter_type is None or filter_type == "numeric": - data = self._get_numeric_data() - elif filter_type == "bool": - data = self._get_bool_data() - else: # pragma: no cover - raise NotImplementedError( - "Handling exception with filter_type {f} not" - "implemented.".format(f=filter_type) - ) from err + # TODO: why doesn't axis matter here? + data = _get_data(axis_matters=False) with np.errstate(all="ignore"): result = f(data.values) labels = data._get_agg_axis(axis) else: if numeric_only: - if filter_type is None or filter_type == "numeric": - data = self._get_numeric_data() - elif filter_type == "bool": - # GH 25101, # GH 24434 - data = self._get_bool_data() if axis == 0 else self - else: # pragma: no cover - msg = ( - "Generating numeric_only data with filter_type {f}" - "not supported.".format(f=filter_type) - ) - raise NotImplementedError(msg) + data = _get_data(axis_matters=True) values = data.values labels = data._get_agg_axis(axis) else: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5e60440f1577e..e94f040d93697 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -343,6 +343,50 @@ def _verify_integrity(self): "tot_items: {1}".format(len(self.items), tot_items) ) + def reduce(self, func, *args, **kwargs): + # If 2D, we assume that we're operating column-wise + if self.ndim == 1: + # we'll be returning a scalar + blk = self.blocks[0] + return func(blk.values, *args, **kwargs) + + res = {} + for blk in self.blocks: + bres = func(blk.values, *args, **kwargs) + if np.ndim(bres) == 0 and blk.shape[0] != 1: + # i.e. 
we reduced over all axes and not just one; re-do column-wise + new_res = { + blk.mgr_locs.as_array[i]: func(blk.values[i], *args, **kwargs) + for i in range(len(blk.values)) + } + elif np.ndim(bres) == 0: + # EA + assert blk.shape[0] == 1, ( + blk.shape, + blk.values.dtype, + bres, + func, + args, + kwargs, + ) + new_res = zip(blk.mgr_locs.as_array, [bres]) + else: + assert bres.ndim == 1, bres.shape + assert blk.shape[0] == len(bres), ( + blk.shape, + bres.shape, + func, + args, + kwargs, + ) + new_res = zip(blk.mgr_locs.as_array, bres) + + nr = dict(new_res) + assert not any(key in res for key in nr) + res.update(nr) + + return res + def apply( self, f, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b848e9caad9be..badebc1c63f11 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -772,7 +772,7 @@ def test_omit_nuisance(df): # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = r"unsupported operand type\(s\) for \+: 'Timestamp'" + msg = "reduction operation 'sum' not allowed for this dtype" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) From 9469400e546bf447588fe0ac738eb5ace0996fa8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 25 Nov 2019 08:35:47 -0800 Subject: [PATCH 2/5] handle axis==1 with numeric_only --- pandas/core/frame.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ad3cda6d96e5c..9e3dd68cc8142 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7645,6 +7645,17 @@ def _get_data(axis_matters): filter_type=filter_type, **kwds, ) + elif numeric_only is True and axis == 1: + data = _get_data(axis_matters=True) + return data.T._reduce( + op, + name, + axis=0, + skipna=skipna, + numeric_only=False, + filter_type=filter_type, + **kwds, + ) if numeric_only is None: values = self.values From 
237253abc3b4eafa23e9d103d28c9738d72ca1be Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 25 Nov 2019 16:55:48 -0800 Subject: [PATCH 3/5] clean up assertions --- pandas/core/internals/managers.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 275681b8e1d58..ce40bd99d2b86 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -363,24 +363,11 @@ def reduce(self, func, *args, **kwargs): } elif np.ndim(bres) == 0: # EA - assert blk.shape[0] == 1, ( - blk.shape, - blk.values.dtype, - bres, - func, - args, - kwargs, - ) + assert blk.shape[0] == 1 new_res = zip(blk.mgr_locs.as_array, [bres]) else: assert bres.ndim == 1, bres.shape - assert blk.shape[0] == len(bres), ( - blk.shape, - bres.shape, - func, - args, - kwargs, - ) + assert blk.shape[0] == len(bres) new_res = zip(blk.mgr_locs.as_array, bres) nr = dict(new_res) From ebb33c1bc93e88b62fc3d88ef5107c493b67cedc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 22 Dec 2019 12:39:36 -0800 Subject: [PATCH 4/5] consolidate+simplify --- pandas/core/frame.py | 37 ++++++++++++--------------- pandas/core/internals/managers.py | 11 +++----- pandas/core/nanops.py | 2 +- pandas/tests/frame/test_reductions.py | 12 +++++++++ 4 files changed, 32 insertions(+), 30 deletions(-) create mode 100644 pandas/tests/frame/test_reductions.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2f2f5a95b9222..f176b1f888605 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7593,31 +7593,26 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data - if self.size == 0: - pass + if numeric_only is not None and axis in [0, 1]: + df = self + if numeric_only is True: + df = _get_data(axis_matters=True) + if axis == 1: + df = df.T + axis = 0 + + out_dtype = "bool" if filter_type == "bool" else None - elif numeric_only is False: - res = 
self._data.reduce(op) + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager._reduce + res = df._data.reduce(op, axis=1, skipna=skipna, **kwds) assert isinstance(res, dict) - assert len(res) == max(list(res.keys())) + 1, res.keys() - out = self._constructor_sliced(res, index=range(len(res))) - out.index = self.columns + if len(res): + assert len(res) == max(list(res.keys())) + 1, res.keys() + out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) + out.index = df.columns return out - elif numeric_only is True and axis in [0, 1]: - data = _get_data(axis_matters=True) - if axis == 1: - data = data.T - return data._reduce( - op, - name, - axis=0, - skipna=skipna, - numeric_only=False, - filter_type=filter_type, - **kwds, - ) - if numeric_only is None: values = self.values try: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b4e88ac8e49f7..4d0235e1e6773 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -350,19 +350,14 @@ def reduce(self, func, *args, **kwargs): res = {} for blk in self.blocks: bres = func(blk.values, *args, **kwargs) - if np.ndim(bres) == 0 and blk.shape[0] != 1: - # i.e. 
we reduced over all axes and not just one; re-do column-wise - new_res = { - blk.mgr_locs.as_array[i]: func(blk.values[i], *args, **kwargs) - for i in range(len(blk.values)) - } - elif np.ndim(bres) == 0: + + if np.ndim(bres) == 0: # EA assert blk.shape[0] == 1 new_res = zip(blk.mgr_locs.as_array, [bres]) else: assert bres.ndim == 1, bres.shape - assert blk.shape[0] == len(bres) + assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs) new_res = zip(blk.mgr_locs.as_array, bres) nr = dict(new_res) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f27e3d4527921..504c432c4e163 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -831,7 +831,7 @@ def reduction(values, axis=None, skipna=True, mask=None): try: result = getattr(values, meth)(axis, dtype=dtype_max) result.fill(np.nan) - except (AttributeError, TypeError, ValueError, np.core._internal.AxisError): + except (AttributeError, TypeError, ValueError): result = np.nan else: result = getattr(values, meth)(axis) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py new file mode 100644 index 0000000000000..d508c2786a49a --- /dev/null +++ b/pandas/tests/frame/test_reductions.py @@ -0,0 +1,12 @@ +""" +Tests for DataFrame reductions that are DataFrame-specific, i.e. cannot +be shared in tests.reductions. 
+""" +import pandas as pd +import numpy as np + + +if True:#def test_blockwise_reduction(): + arr = np.arange(10) + tdarr = arr.astype("m8[s]") + df = pd.DataFrame({"A": arr, "B": tdarr}) From 4a16663c1482e08592d134587258cd2d4b4e5b66 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 22 Dec 2019 12:41:27 -0800 Subject: [PATCH 5/5] revert file not intended --- pandas/tests/frame/test_reductions.py | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 pandas/tests/frame/test_reductions.py diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py deleted file mode 100644 index d508c2786a49a..0000000000000 --- a/pandas/tests/frame/test_reductions.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -Tests for DataFrame reductions that are DataFrame-specific, i.e. cannot -be shared in tests.reductions. -""" -import pandas as pd -import numpy as np - - -if True:#def test_blockwise_reduction(): - arr = np.arange(10) - tdarr = arr.astype("m8[s]") - df = pd.DataFrame({"A": arr, "B": tdarr})