From 90fb20e1fc3153954d7b2438ea189b31e907e5ae Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Jul 2018 09:39:41 -0500 Subject: [PATCH 01/23] ENH: add groupby & reduce support to EA closes #21789 closes #22346 --- doc/source/whatsnew/v0.24.0.txt | 9 ++++- pandas/conftest.py | 23 ++++++++++++ pandas/core/arrays/base.py | 29 +++++++++++++++ pandas/core/arrays/integer.py | 49 +++++++++++++++++++++++++ pandas/tests/arrays/test_integer.py | 9 +++-- pandas/tests/extension/base/__init__.py | 1 + pandas/tests/extension/base/groupby.py | 8 ++-- pandas/tests/extension/base/reduce.py | 38 +++++++++++++++++++ pandas/tests/extension/test_integer.py | 48 +++++++++++++++++------- 9 files changed, 193 insertions(+), 21 deletions(-) create mode 100644 pandas/tests/extension/base/reduce.py diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 3d82dd042da20..b241883dc0d2a 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -48,7 +48,7 @@ Pandas has gained the ability to hold integer dtypes with missing values. This l Here is an example of the usage. We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value -marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`) +marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) .. ipython:: python @@ -91,6 +91,13 @@ These dtypes can be merged & reshaped & casted. pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes df['A'].astype(float) +Reduction and groupby operations such as 'sum' work. + +.. ipython:: python + + df.sum() + df.groupby('B').A.sum() + .. warning:: The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. diff --git a/pandas/conftest.py b/pandas/conftest.py index 621de3ffd4b12..d24400d09809e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -131,6 +131,29 @@ def all_arithmetic_operators(request): return request.param +_all_numeric_reductions = ['sum', 'max', 'min', + 'mean', 'prod', 'std', 'var', 'median'] + + +@pytest.fixture(params=_all_numeric_reductions) +def all_numeric_reductions(request): + """ + Fixture for numeric reduction names + """ + return request.param + + +_all_boolean_reductions = ['all', 'any'] + + +@pytest.fixture(params=_all_boolean_reductions) +def all_boolean_reductions(request): + """ + Fixture for boolean reduction names + """ + return request.param + + _cython_table = pd.core.base.SelectionMixin._cython_table.items() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 627afd1b6f860..fe07b302eb02b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -59,6 +59,10 @@ class ExtensionArray(object): * factorize / _values_for_factorize * argsort / _values_for_argsort + One can implement methods to handle array reductions. + + * _reduce + The remaining methods implemented on this class should be performant, as they only compose abstract methods. Still, a more efficient implementation may be available, and these methods can be overridden. @@ -713,6 +717,31 @@ def _add_comparison_ops(cls): cls.__le__ = cls._create_comparison_method(operator.le) cls.__ge__ = cls._create_comparison_method(operator.ge) + def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, + filter_type=None, **kwargs): + """Return a scalar result of performing the op + + Parameters + ---------- + op : callable + function to apply to the array + name : str + name of the function + axis : int, default 0 + axis over which to apply, defined as 0 currently + skipna : bool, default True + if True, skip NaN values + numeric_only : bool, optional + if True, only perform numeric ops + filter_type : str, optional + kwargs : dict + + Returns + ------- + scalar + """ + raise AbstractMethodError(self) + class ExtensionScalarOpsMixin(ExtensionOpsMixin): """A mixin for defining the arithmetic and logical operations on diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e58109a25e1a5..7e1e7315232aa 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -529,6 +529,55 @@ def cmp_method(self, other): name = '__{name}__'.format(name=op.__name__) return set_function_name(cmp_method, name, cls) + def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, + filter_type=None, **kwds): + """Return a scalar result of performing the op + + Parameters + ---------- + op : callable + function to apply to the array + name : str + name of the function + axis : int, default 0 + axis over which to apply, defined as 0 currently + skipna : bool, default True + if True, skip NaN values + numeric_only : bool, optional + if True, only perform numeric ops + filter_type : str, optional + kwds : dict + + Returns + ------- + scalar + """ + + data = self._data + mask = self._mask + + # coerce to a nan-aware float if needed + if mask.any(): + data = self._data.astype('float64') + data[mask] = self._na_value + + result = op(data, axis=axis, skipna=skipna) + + # if we have a boolean op, provide coercion back to a bool + # type if possible + if name in ['any', 'all']: + if is_integer(result) or is_float(result): + result = bool(int(result)) + + # if we have a numeric op, provide coercion back to an integer + # type if possible + elif not isna(result): + int_result = int(result) + if int_result == result: + result = int_result + + return result + def _maybe_mask_result(self, result, mask, other, op_name): """ Parameters diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 349a6aee5701e..3f3653f375149 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -587,15 +587,18 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) -def test_groupby_mean_included(): +@pytest.mark.parametrize('op', ['sum', 'min', 'max']) +def test_preserve_groupby_dtypes(op): + # TODO(#22346): preserve Int64 dtype + # for ops that enable (mean would actually work here + # but generally it is a float return value) df = pd.DataFrame({ "A": ['a', 'b', 'b'], "B": [1, None, 3], "C": integer_array([1, None, 3], dtype='Int64'), }) - result = df.groupby("A").sum() - # TODO(#22346): preserve Int64 dtype + result = getattr(df.groupby("A"), op)() expected = pd.DataFrame({ "B": np.array([1.0, 3.0]), "C": np.array([1, 3], dtype="int64") diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index b6b81bb941a59..b5d4fd676245e 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -48,6 +48,7 @@ class TestMyDtype(BaseDtypeTests): from .interface import BaseInterfaceTests # noqa from .methods import BaseMethodsTests # noqa from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa +from .reduce import BaseNumericReduceTests, BaseBooleanReduceTests # noqa from .missing import BaseMissingTests # noqa from .reshaping import BaseReshapingTests # noqa from .setitem import BaseSetitemTests # noqa diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 174997c7d51e1..52c635d286df6 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -25,8 +25,8 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): "B": data_for_grouping}) result = df.groupby("B", as_index=as_index).A.mean() _, index = pd.factorize(data_for_grouping, sort=True) - # TODO(ExtensionIndex): remove astype - index = pd.Index(index.astype(object), name="B") + + index = pd.Index(index, name="B") expected = pd.Series([3, 1, 4], index=index, name="A") if as_index: self.assert_series_equal(result, expected) @@ -39,8 +39,8 @@ def test_groupby_extension_no_sort(self, data_for_grouping): "B": data_for_grouping}) result = df.groupby("B", sort=False).A.mean() _, index = pd.factorize(data_for_grouping, sort=False) - # TODO(ExtensionIndex): remove astype - index = pd.Index(index.astype(object), name="B") + + index = pd.Index(index, name="B") expected = pd.Series([1, 3, 4], index=index, name="A") self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py new file mode 100644 index 0000000000000..ce53ba91ff2fc --- /dev/null +++ b/pandas/tests/extension/base/reduce.py @@ -0,0 +1,38 @@ +import warnings +import pytest +import pandas.util.testing as tm +import pandas as pd +from .base import BaseExtensionTests + + +class BaseReduceTests(BaseExtensionTests): + """ + Reduction specific tests. Generally these only + make sense for numeric/boolean operations. + """ + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype('float64'), op_name)(skipna=skipna) + tm.assert_almost_equal(result, expected) + + +class BaseNumericReduceTests(BaseReduceTests): + + @pytest.mark.parametrize('skipna', [True, False]) + def test_reduce_series(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + s = pd.Series(data) + + # min/max with empty produce numpy warnings + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + self.check_reduce(s, op_name, skipna) + + +class BaseBooleanReduceTests(BaseReduceTests): + + @pytest.mark.parametrize('skipna', [True, False]) + def test_reduce_series(self, data, all_boolean_reductions, skipna): + op_name = all_boolean_reductions + s = pd.Series(data) + self.check_reduce(s, op_name, skipna) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index fa5c89d85e548..163f2ad6fc72b 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -208,17 +208,39 @@ class TestCasting(base.BaseCastingTests): class TestGroupby(base.BaseGroupbyTests): - @pytest.mark.xfail(reason="groupby not working", strict=True) - def test_groupby_extension_no_sort(self, data_for_grouping): - super(TestGroupby, self).test_groupby_extension_no_sort( - data_for_grouping) - - @pytest.mark.parametrize('as_index', [ - pytest.param(True, - marks=pytest.mark.xfail(reason="groupby not working", - strict=True)), - False - ]) + @pytest.mark.parametrize('as_index', [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): - super(TestGroupby, self).test_groupby_extension_agg( - as_index, data_for_grouping) + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + result = df.groupby("B", as_index=as_index).A.mean() + _, index = pd.factorize(data_for_grouping, sort=True) + + # TODO(ExtensionIndex): remove coercion to object + # we don't have an easy way to represent an EA as an Index object + index = pd.Index(index, name="B", dtype=object) + expected = pd.Series([3, 1, 4], index=index, name="A") + if as_index: + self.assert_series_equal(result, expected) + else: + expected = expected.reset_index() + self.assert_frame_equal(result, expected) + + def test_groupby_extension_no_sort(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + result = df.groupby("B", sort=False).A.mean() + _, index = pd.factorize(data_for_grouping, sort=False) + + # TODO(ExtensionIndex): remove coercion to object + # we don't have an easy way to represent an EA as an Index object + index = pd.Index(index, name="B", dtype=object) + expected = pd.Series([1, 3, 4], index=index, name="A") + self.assert_series_equal(result, expected) + + +class TestNumericReduce(base.BaseNumericReduceTests): + pass + + +class TestBooleanReduce(base.BaseBooleanReduceTests): + pass From 63899aadae90ba05431eaa111d18ef6f6e3c764f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 19 Sep 2018 10:32:27 -0400 Subject: [PATCH 02/23] don't need record=True --- pandas/tests/extension/base/reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index ce53ba91ff2fc..5f3db6957bc07 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -24,7 +24,7 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna): s = pd.Series(data) # min/max with empty produce numpy warnings - with warnings.catch_warnings(record=True): + with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) self.check_reduce(s, op_name, skipna) From 05f27a137b49f5a75b7caf3912ee6b7c9b652660 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 19 Sep 2018 11:31:42 -0400 Subject: [PATCH 03/23] dispatch to Index types on _reduce Decimal is a numeric type, support in groupby --- pandas/core/arrays/base.py | 4 +++- pandas/tests/extension/decimal/array.py | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index fe07b302eb02b..5f2722a6cbc98 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -740,7 +740,9 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, ------- scalar """ - raise AbstractMethodError(self) + + # we dispatchh to the nanops operations + return op(self, axis=axis, skipna=skipna) class ExtensionScalarOpsMixin(ExtensionOpsMixin): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index a1ee3a4fefef2..3186ed43f5e41 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -134,6 +134,31 @@ def _na_value(self): def _concat_same_type(cls, to_concat): return cls(np.concatenate([x._data for x in to_concat])) + def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, + filter_type=None, **kwds): + """Return a scalar result of performing the op + + Parameters + ---------- + op : callable + function to apply to the array + name : str + name of the function + axis : int, default 0 + axis over which to apply, defined as 0 currently + skipna : bool, default True + if True, skip NaN values + numeric_only : bool, optional + if True, only perform numeric ops + filter_type : str, optional + kwds : dict + + Returns + ------- + scalar + """ + return op(self.data, axis=axis, skipna=skipna) + def to_decimal(values, context=None): return DecimalArray([decimal.Decimal(x) for x in values], context=context) From 9dbbe51762fdb8504bbb56f305ede1c784e203f6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 19 Sep 2018 12:03:42 -0400 Subject: [PATCH 04/23] handle min/max correctly for datetimelikes --- pandas/core/arrays/base.py | 4 +--- pandas/core/arrays/datetimelike.py | 36 ++++++++++++++++++++++++++++++ pandas/core/dtypes/common.py | 3 ++- pandas/tests/dtypes/test_common.py | 2 ++ 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5f2722a6cbc98..fe07b302eb02b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -740,9 +740,7 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, ------- scalar """ - - # we dispatchh to the nanops operations - return op(self, axis=axis, skipna=skipna) + raise AbstractMethodError(self) class ExtensionScalarOpsMixin(ExtensionOpsMixin): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e4ace2bfe1509..cc762f5e0aa1f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -342,6 +342,42 @@ def _validate_frequency(cls, index, freq, **kwargs): 'does not conform to passed frequency {passed}' .format(infer=inferred, passed=freq.freqstr)) + # ------------------------------------------------------------------ + # Reduction Methods + + def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, + filter_type=None, **kwargs): + """Return a scalar result of performing the op + + Parameters + ---------- + op : callable + function to apply to the array + name : str + name of the function + axis : int, default 0 + axis over which to apply, defined as 0 currently + skipna : bool, default True + if True, skip NaN values + numeric_only : bool, optional + if True, only perform numeric ops + filter_type : str, optional + kwargs : dict + + Returns + ------- + scalar + """ + # if we have a reduction op already defined, use it + # this is important for min/max where tz's must be preserved + # and nanops is not geared towards this + # TODO(jreback): we are ignoring skipna + if hasattr(self, name): + return getattr(self, name)() + + # we dispatch to the nanops operations + return op(self, axis=axis, skipna=skipna) + # ------------------------------------------------------------------ # Arithmetic Methods diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a9fc9d13d4ab3..69c47de2c519c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1237,7 +1237,8 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype): if arr_or_dtype is None: return False tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, (np.datetime64, np.timedelta64)) + return (issubclass(tipo, (np.datetime64, np.timedelta64)) or + is_datetime64tz_dtype(arr_or_dtype)) def _is_unorderable_exception(e): diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index f87c51a4ee16b..6c23dbb6c257e 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -388,6 +388,8 @@ def test_is_datetime_or_timedelta_dtype(): assert not com.is_datetime_or_timedelta_dtype(np.array(['a', 'b'])) assert com.is_datetime_or_timedelta_dtype(np.datetime64) + assert com.is_datetime_or_timedelta_dtype( + DatetimeTZDtype("ns", "US/Eastern")) assert com.is_datetime_or_timedelta_dtype(np.timedelta64) assert com.is_datetime_or_timedelta_dtype( np.array([], dtype=np.timedelta64)) From d4c0a3e260608c709db60556e4c287618593c72e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 19 Sep 2018 19:13:51 -0400 Subject: [PATCH 05/23] share doc-string with base class --- pandas/core/arrays/datetimelike.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index cc762f5e0aa1f..8645a135d7c92 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -347,27 +347,6 @@ def _validate_frequency(cls, index, freq, **kwargs): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwargs): - """Return a scalar result of performing the op - - Parameters - ---------- - op : callable - function to apply to the array - name : str - name of the function - axis : int, default 0 - axis over which to apply, defined as 0 currently - skipna : bool, default True - if True, skip NaN values - numeric_only : bool, optional - if True, only perform numeric ops - filter_type : str, optional - kwargs : dict - - Returns - ------- - scalar - """ # if we have a reduction op already defined, use it # this is important for min/max where tz's must be preserved # and nanops is not geared towards this From bb3f37c7ae6ce7604cfdc721054c98406c958509 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 19 Sep 2018 19:17:13 -0400 Subject: [PATCH 06/23] remove more uneeded doc-strings --- pandas/core/arrays/integer.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 7e1e7315232aa..0b62865fe3776 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -531,28 +531,6 @@ def cmp_method(self, other): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): - """Return a scalar result of performing the op - - Parameters - ---------- - op : callable - function to apply to the array - name : str - name of the function - axis : int, default 0 - axis over which to apply, defined as 0 currently - skipna : bool, default True - if True, skip NaN values - numeric_only : bool, optional - if True, only perform numeric ops - filter_type : str, optional - kwds : dict - - Returns - ------- - scalar - """ - data = self._data mask = self._mask From e7fbe0f8a15a62c04f9ef2199a2618f220d805dd Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 23 Sep 2018 12:40:17 -0400 Subject: [PATCH 07/23] simplify EA._reduce --- pandas/core/arrays/base.py | 8 +------ pandas/core/arrays/integer.py | 5 ++-- pandas/core/series.py | 14 +++++++---- pandas/tests/extension/decimal/array.py | 32 +++++++------------------ 4 files changed, 23 insertions(+), 36 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index fe07b302eb02b..b7ee5f7150029 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -717,23 +717,17 @@ def _add_comparison_ops(cls): cls.__le__ = cls._create_comparison_method(operator.le) cls.__ge__ = cls._create_comparison_method(operator.ge) - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwargs): + def _reduce(self, name, skipna=True, **kwargs): """Return a scalar result of performing the op Parameters ---------- - op : callable - function to apply to the array name : str name of the function axis : int, default 0 axis over which to apply, defined as 0 currently skipna : bool, default True if True, skip NaN values - numeric_only : bool, optional - if True, only perform numeric ops - filter_type : str, optional kwargs : dict Returns diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0b62865fe3776..0e0c43473c168 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -8,6 +8,7 @@ from pandas.compat import u, range, string_types from pandas.compat import set_function_name +from pandas.core import nanops from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.core.dtypes.common import ( @@ -529,8 +530,7 @@ def cmp_method(self, other): name = '__{name}__'.format(name=op.__name__) return set_function_name(cmp_method, name, cls) - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce(self, name, axis=0, skipna=True, **kwargs): data = self._data mask = self._mask @@ -539,6 +539,7 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, data = self._data.astype('float64') data[mask] = self._na_value + op = getattr(nanops, 'nan' + name) result = op(data, axis=axis, skipna=skipna) # if we have a boolean op, provide coercion back to a bool diff --git a/pandas/core/series.py b/pandas/core/series.py index a613b22ea9046..b7d827a370299 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3392,10 +3392,16 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, """ delegate = self._values - if isinstance(delegate, np.ndarray): - # Validate that 'axis' is consistent with Series's single axis. - if axis is not None: - self._get_axis_number(axis) + + if axis is not None: + self._get_axis_number(axis) + + # dispatch to ExtensionArray interface + if isinstance(delegate, ExtensionArray): + return delegate._reduce(name, skipna=skipna, **kwds) + + # dispatch to numpy arrays + elif isinstance(delegate, np.ndarray): if numeric_only: raise NotImplementedError('Series.{0} does not implement ' 'numeric_only.'.format(name)) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 3186ed43f5e41..de3df0403851f 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -134,30 +134,16 @@ def _na_value(self): def _concat_same_type(cls, to_concat): return cls(np.concatenate([x._data for x in to_concat])) - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): - """Return a scalar result of performing the op - - Parameters - ---------- - op : callable - function to apply to the array - name : str - name of the function - axis : int, default 0 - axis over which to apply, defined as 0 currently - skipna : bool, default True - if True, skip NaN values - numeric_only : bool, optional - if True, only perform numeric ops - filter_type : str, optional - kwds : dict + def _reduce(self, name, axis=0, skipna=True, **kwargs): - Returns - ------- - scalar - """ - return op(self.data, axis=axis, skipna=skipna) + # select the nan* ops + if skipna: + op = getattr(self.data, 'nan' + name) + + # don't skip nans + else: + op = getattr(self.data, name) + return op(axis=axis) def to_decimal(values, context=None): From fddf93821c64b39faa0c344efd4940ce66378dd7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 23 Sep 2018 15:52:35 -0400 Subject: [PATCH 08/23] fix categorical --- pandas/core/arrays/categorical.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 216bccf7d6309..79070bbbfd11a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2069,14 +2069,12 @@ def _reverse_indexer(self): return result # reduction ops # - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): - """ perform the reduction type operation """ + def _reduce(self, name, axis=0, skipna=True, **kwargs): func = getattr(self, name, None) if func is None: msg = 'Categorical cannot perform the operation {op}' raise TypeError(msg.format(op=name)) - return func(numeric_only=numeric_only, **kwds) + return func(**kwargs) def min(self, numeric_only=None, **kwargs): """ The minimum value of the object. From 49efedfa3dfc19468a389f2ca6a04b49e1a9e420 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 1 Oct 2018 08:44:41 -0400 Subject: [PATCH 09/23] make _reduce an always defined method on Base; raise TypeError if invoked and not overriden, test the same --- pandas/core/arrays/base.py | 42 ++++++++++--------- pandas/tests/extension/arrow/test_bool.py | 4 ++ pandas/tests/extension/base/__init__.py | 2 +- pandas/tests/extension/base/reduce.py | 20 +++++++++ pandas/tests/extension/decimal/array.py | 9 ++-- .../tests/extension/decimal/test_decimal.py | 22 ++++++++++ pandas/tests/extension/json/test_json.py | 4 ++ pandas/tests/extension/test_categorical.py | 4 ++ pandas/tests/extension/test_interval.py | 4 ++ 9 files changed, 87 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b7ee5f7150029..01a580b52ab3f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -679,6 +679,29 @@ def _ndarray_values(self): """ return np.array(self) + def _reduce(self, name, skipna=True, **kwargs): + """Return a scalar result of performing the op + + Parameters + ---------- + name : str + name of the function + axis : int, default 0 + axis over which to apply, defined as 0 currently + skipna : bool, default True + if True, skip NaN values + kwargs : dict + + Returns + ------- + scalar + + Raises + ------ + TypeError : subclass does not define reductions + """ + raise TypeError + class ExtensionOpsMixin(object): """ @@ -717,25 +740,6 @@ def _add_comparison_ops(cls): cls.__le__ = cls._create_comparison_method(operator.le) cls.__ge__ = cls._create_comparison_method(operator.ge) - def _reduce(self, name, skipna=True, **kwargs): - """Return a scalar result of performing the op - - Parameters - ---------- - name : str - name of the function - axis : int, default 0 - axis over which to apply, defined as 0 currently - skipna : bool, default True - if True, skip NaN values - kwargs : dict - - Returns - ------- - scalar - """ - raise AbstractMethodError(self) - class ExtensionScalarOpsMixin(ExtensionOpsMixin): """A mixin for defining the arithmetic and logical operations on diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index e1afedcade3ff..12c37d1fdf895 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -39,6 +39,10 @@ def test_from_dtype(self, data): pytest.skip("GH-22666") +class TestReduce(base.BaseNoReduceTests): + pass + + def test_is_bool_dtype(data): assert pd.api.types.is_bool_dtype(data) assert pd.core.common.is_bool_indexer(data) diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index b5d4fd676245e..d11bb8b6beb77 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -48,7 +48,7 @@ class TestMyDtype(BaseDtypeTests): from .interface import BaseInterfaceTests # noqa from .methods import BaseMethodsTests # noqa from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa -from .reduce import BaseNumericReduceTests, BaseBooleanReduceTests # noqa +from .reduce import BaseNoReduceTests, BaseNumericReduceTests, BaseBooleanReduceTests # noqa from .missing import BaseMissingTests # noqa from .reshaping import BaseReshapingTests # noqa from .setitem import BaseSetitemTests # noqa diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 5f3db6957bc07..4f6c7988314c0 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -16,6 +16,26 @@ def check_reduce(self, s, op_name, skipna): tm.assert_almost_equal(result, expected) +class BaseNoReduceTests(BaseReduceTests): + """ we don't define any reductions """ + + @pytest.mark.parametrize('skipna', [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + s = pd.Series(data) + + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) + + @pytest.mark.parametrize('skipna', [True, False]) + def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): + op_name = all_boolean_reductions + s = pd.Series(data) + + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) + + class BaseNumericReduceTests(BaseReduceTests): @pytest.mark.parametrize('skipna', [True, False]) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index de3df0403851f..d72756a254110 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -136,13 +136,14 @@ def _concat_same_type(cls, to_concat): def _reduce(self, name, axis=0, skipna=True, **kwargs): - # select the nan* ops if skipna: - op = getattr(self.data, 'nan' + name) + raise NotImplementedError("decimal does not support skipna=True") - # don't skip nans - else: + try: op = getattr(self.data, name) + except AttributeError: + raise NotImplementedError("decimal does not support " + "the {} operation".format(name)) return op(axis=axis) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 6724e183a0606..e3c44f5755cc7 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -131,6 +131,28 @@ class TestMissing(BaseDecimal, base.BaseMissingTests): pass +class Reduce: + + def check_reduce(self, s, op_name, skipna): + + if skipna or op_name in ['median']: + with pytest.raises(NotImplementedError): + getattr(s, op_name)(skipna=skipna) + + else: + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(np.asarray(s), op_name)() + tm.assert_almost_equal(result, expected) + + +class TestNumericReduce(Reduce, base.BaseNumericReduceTests): + pass + + +class TestBooleanReduce(Reduce, base.BaseBooleanReduceTests): + pass + + class TestMethods(BaseDecimal, base.BaseMethodsTests): @pytest.mark.parametrize('dropna', [True, False]) @pytest.mark.xfail(reason="value_counts not implemented yet.") diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 6c8b12ed865fc..15d99f6c5d2fc 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -160,6 +160,10 @@ def test_fillna_frame(self): reason="Dictionary order unstable") +class TestReduce(base.BaseNoReduceTests): + pass + + class TestMethods(BaseJSON, base.BaseMethodsTests): @unhashable def test_value_counts(self, all_data, dropna): diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index f118279c4b915..a4518798aa400 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -164,6 +164,10 @@ def test_fillna_limit_backfill(self): pass +class TestReduce(base.BaseNoReduceTests): + pass + + class TestMethods(base.BaseMethodsTests): pass diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 7302c5757d144..183ebea927b10 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -98,6 +98,10 @@ class TestInterface(BaseInterval, base.BaseInterfaceTests): pass +class TestReduce(base.BaseNoReduceTests): + pass + + class TestMethods(BaseInterval, base.BaseMethodsTests): @pytest.mark.skip(reason='addition is not defined for intervals') From 371ab5410eae666fc405f67f909b810b4d63925b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 3 Oct 2018 07:28:50 -0400 Subject: [PATCH 10/23] pass mask to nanops --- pandas/core/arrays/integer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0e0c43473c168..76bc315d71484 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -540,7 +540,7 @@ def _reduce(self, name, axis=0, skipna=True, **kwargs): data[mask] = self._na_value op = getattr(nanops, 'nan' + name) - result = op(data, axis=axis, skipna=skipna) + result = op(data, axis=axis, skipna=skipna, mask=mask) # if we have a boolean op, provide coercion back to a bool # type if possible From d94b85e1155be23ea9768fdcf8398c858cd47479 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 3 Oct 2018 08:09:49 -0400 Subject: [PATCH 11/23] review comments --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/conftest.py | 3 ++- pandas/core/arrays/base.py | 10 +++++++--- pandas/core/arrays/integer.py | 12 +++++------- pandas/core/series.py | 3 +++ pandas/tests/extension/decimal/array.py | 4 ++-- pandas/tests/extension/decimal/test_decimal.py | 2 +- 7 files changed, 21 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index b241883dc0d2a..29b766e616b3b 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -557,6 +557,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) - Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) +- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) .. _whatsnew_0240.api.incompatibilities: diff --git a/pandas/conftest.py b/pandas/conftest.py index d24400d09809e..e84657a79b51a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -132,7 +132,8 @@ def all_arithmetic_operators(request): _all_numeric_reductions = ['sum', 'max', 'min', - 'mean', 'prod', 'std', 'var', 'median'] + 'mean', 'prod', 'std', 'var', 'median', + 'kurt', 'skew'] @pytest.fixture(params=_all_numeric_reductions) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 01a580b52ab3f..ad46cfc2054ef 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -680,17 +680,20 @@ def _ndarray_values(self): return np.array(self) def _reduce(self, name, skipna=True, **kwargs): - """Return a scalar result of performing the op + """Return a scalar result of performing the reduction operation. Parameters ---------- name : str - name of the function + name of the function, support values are: + {any, all, min, max, sum, mean, median, prod, + std, var, sem, kurt, skew} axis : int, default 0 axis over which to apply, defined as 0 currently skipna : bool, default True if True, skip NaN values kwargs : dict + ddof is the only supported kwarg Returns ------- @@ -700,7 +703,8 @@ def _reduce(self, name, skipna=True, **kwargs): ------ TypeError : subclass does not define reductions """ - raise TypeError + raise TypeError("cannot perform {name} with type {dtype}".format( + name=name, dtype=self.dtype)) class ExtensionOpsMixin(object): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 76bc315d71484..e31a7571f62d6 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -530,7 +530,7 @@ def cmp_method(self, other): name = '__{name}__'.format(name=op.__name__) return set_function_name(cmp_method, name, cls) - def _reduce(self, name, axis=0, skipna=True, **kwargs): + def _reduce(self, name, skipna=True, **kwargs): data = self._data mask = self._mask @@ -540,17 +540,15 @@ def _reduce(self, name, axis=0, skipna=True, **kwargs): data[mask] = self._na_value op = getattr(nanops, 'nan' + name) - result = op(data, axis=axis, skipna=skipna, mask=mask) + result = op(data, axis=0, skipna=skipna, mask=mask) - # if we have a boolean op, provide coercion back to a bool - # type if possible + # if we have a boolean op, don't coerce if name in ['any', 'all']: - if is_integer(result) or is_float(result): - result = bool(int(result)) + pass # if we have a numeric op, provide coercion back to an integer # type if possible - elif not isna(result): + elif notna(result): int_result = int(result) if int_result == result: result = int_result diff --git a/pandas/core/series.py b/pandas/core/series.py index b7d827a370299..bff0f9fe25532 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3408,6 +3408,9 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, with np.errstate(all='ignore'): return op(delegate, skipna=skipna, **kwds) + # TODO(EA) dispatch to Index + # remove once all internals extension types are + # moved to ExtensionArrays return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only, filter_type=filter_type, **kwds) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index d72756a254110..53a598559393c 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -134,7 +134,7 @@ def _na_value(self): def _concat_same_type(cls, to_concat): return cls(np.concatenate([x._data for x in to_concat])) - def _reduce(self, name, axis=0, skipna=True, **kwargs): + def _reduce(self, name, skipna=True, **kwargs): if skipna: raise NotImplementedError("decimal does not support skipna=True") @@ -144,7 +144,7 @@ def _reduce(self, name, axis=0, skipna=True, **kwargs): except AttributeError: raise NotImplementedError("decimal does not support " "the {} operation".format(name)) - return op(axis=axis) + return op(axis=0) def to_decimal(values, context=None): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index e3c44f5755cc7..9f8ac3b51067c 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -135,7 +135,7 @@ class Reduce: def check_reduce(self, s, op_name, skipna): - if skipna or op_name in ['median']: + if skipna or op_name in ['median', 'skew', 'kurt']: with pytest.raises(NotImplementedError): getattr(s, op_name)(skipna=skipna) From 67747911035998446d741e3d7f034b9aa2611f8a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 5 Oct 2018 07:54:21 -0400 Subject: [PATCH 12/23] cleanup --- pandas/core/arrays/datetimelike.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8645a135d7c92..e4ace2bfe1509 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -342,21 +342,6 @@ def _validate_frequency(cls, index, freq, **kwargs): 'does not conform to passed frequency {passed}' .format(infer=inferred, passed=freq.freqstr)) - # ------------------------------------------------------------------ - # Reduction Methods - - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwargs): - # if we have a reduction op already defined, use it - # this is important for min/max where tz's must be preserved - # and nanops is not geared towards this - # TODO(jreback): we are ignoring skipna - if hasattr(self, name): - return getattr(self, name)() - - # we dispatch to the nanops operations - return op(self, axis=axis, skipna=skipna) - # ------------------------------------------------------------------ # Arithmetic Methods From 6699080fa41f611b79a11ad67ca8a79ae9cfd001 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 5 Oct 2018 07:56:32 -0400 Subject: [PATCH 13/23] doc --- pandas/core/arrays/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ad46cfc2054ef..14df4d9bf8670 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -688,8 +688,6 @@ def _reduce(self, name, skipna=True, **kwargs): name of the function, support values are: {any, all, min, max, sum, mean, median, prod, std, var, sem, kurt, skew} - axis : int, default 0 - axis over which to apply, defined as 0 currently skipna : bool, default True if True, skip NaN values kwargs : dict From 29c3cf7788334be62b22ddee19cc8de7b04329e2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 5 Oct 2018 08:38:40 -0400 Subject: [PATCH 14/23] lint issue --- pandas/tests/extension/decimal/test_decimal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 9f8ac3b51067c..f84d24295b049 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -131,7 +131,7 @@ class TestMissing(BaseDecimal, base.BaseMissingTests): pass -class Reduce: +class Reduce(object): def check_reduce(self, s, op_name, skipna): From 0fdaf683bf4478ec553468df1499fb6470669101 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Oct 2018 10:07:44 -0400 Subject: [PATCH 15/23] comments --- pandas/core/arrays/base.py | 10 +++++----- pandas/core/arrays/integer.py | 6 +++--- pandas/core/dtypes/common.py | 3 +-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 14df4d9bf8670..de07f2318eacd 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -59,14 +59,14 @@ class ExtensionArray(object): * factorize / _values_for_factorize * argsort / _values_for_argsort - One can implement methods to handle array reductions. - - * _reduce - The remaining methods implemented on this class should be performant, as they only compose abstract methods. Still, a more efficient implementation may be available, and these methods can be overridden. + One can implement methods to handle array reductions. + + * _reduce + This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is @@ -691,7 +691,7 @@ def _reduce(self, name, skipna=True, **kwargs): skipna : bool, default True if True, skip NaN values kwargs : dict - ddof is the only supported kwarg + Additional keyword arguments passed to the reduction function Returns ------- diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e31a7571f62d6..6f37526bbf49e 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -546,9 +546,9 @@ def _reduce(self, name, skipna=True, **kwargs): if name in ['any', 'all']: pass - # if we have a numeric op, provide coercion back to an integer - # type if possible - elif notna(result): + # if we have a preservable numeric op, + # provide coercion back to an integer type if possible + elif name in ['sum', 'min', 'max'] and notna(result): int_result = int(result) if int_result == result: result = int_result diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 69c47de2c519c..a9fc9d13d4ab3 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1237,8 +1237,7 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype): if arr_or_dtype is None: return False tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, (np.datetime64, np.timedelta64)) or - is_datetime64tz_dtype(arr_or_dtype)) + return issubclass(tipo, (np.datetime64, np.timedelta64)) def _is_unorderable_exception(e): From 08f5830bbc13d1093e700865d2fca09caa29d364 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 Oct 2018 10:30:37 -0400 Subject: [PATCH 16/23] fix test --- pandas/tests/dtypes/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 6c23dbb6c257e..882b2c156478a 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -386,10 +386,10 @@ def test_is_datetime_or_timedelta_dtype(): assert not com.is_datetime_or_timedelta_dtype(str) assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2])) assert not com.is_datetime_or_timedelta_dtype(np.array(['a', 'b'])) + assert not com.is_datetime_or_timedelta_dtype( + DatetimeTZDtype("ns", "US/Eastern")) assert com.is_datetime_or_timedelta_dtype(np.datetime64) - assert com.is_datetime_or_timedelta_dtype( - DatetimeTZDtype("ns", "US/Eastern")) assert com.is_datetime_or_timedelta_dtype(np.timedelta64) assert com.is_datetime_or_timedelta_dtype( np.array([], dtype=np.timedelta64)) From 3e763c40797d006732e2ac2e31a1bb31c75a226a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 10 Oct 2018 20:27:24 -0400 Subject: [PATCH 17/23] comments --- pandas/core/arrays/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index de07f2318eacd..7e2e0da199a01 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -692,6 +692,7 @@ def _reduce(self, name, skipna=True, **kwargs): if True, skip NaN values kwargs : dict Additional keyword arguments passed to the reduction function + ddof is the only supported kwarg Returns ------- From b543386929344fef4565f3852e5ab68bbdf7ce00 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 11 Oct 2018 17:04:39 -0400 Subject: [PATCH 18/23] add op tests for reduction result dtype --- pandas/tests/arrays/test_integer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 3f3653f375149..905bca38ac228 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -588,7 +588,7 @@ def test_cross_type_arithmetic(): @pytest.mark.parametrize('op', ['sum', 'min', 'max']) -def test_preserve_groupby_dtypes(op): +def test_preserve_dtypes(op): # TODO(#22346): preserve Int64 dtype # for ops that enable (mean would actually work here # but generally it is a float return value) @@ -598,6 +598,11 @@ def test_preserve_groupby_dtypes(op): "C": integer_array([1, None, 3], dtype='Int64'), }) + # op + result = getattr(df.C, op)() + assert isinstance(result, int) + + # groupby result = getattr(df.groupby("A"), op)() expected = pd.DataFrame({ "B": np.array([1.0, 3.0]), From b808778569ab23fc333afd7ccc17936714207bfa Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 11 Oct 2018 17:09:29 -0400 Subject: [PATCH 19/23] remove xfails on frame arithmetic ops comparisons --- pandas/tests/arrays/test_integer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 905bca38ac228..fdf4bc91f9f77 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -114,6 +114,13 @@ def _check_op(self, s, op_name, other, exc=None): # compute expected mask = s.isna() + # if s is a DataFrame, squeeze to a Series + # for comparison + if isinstance(s, pd.DataFrame): + result = result.squeeze() + s = s.squeeze() + mask = mask.squeeze() + # other array is an Integer if isinstance(other, IntegerArray): omask = getattr(other, 'mask', None) @@ -215,7 +222,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): s = pd.Series(data) self._check_op(s, op, 1, exc=TypeError) - @pytest.mark.xfail(run=False, reason="_reduce needs implementation") def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators From 2b3d96f474ce425f49ab6d1caba88caca4729d9d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 11 Oct 2018 17:37:27 -0400 Subject: [PATCH 20/23] remove groupby overrides --- pandas/tests/extension/test_integer.py | 30 +------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 163f2ad6fc72b..89c36bbe7b325 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -207,35 +207,7 @@ class TestCasting(base.BaseCastingTests): class TestGroupby(base.BaseGroupbyTests): - - @pytest.mark.parametrize('as_index', [True, False]) - def test_groupby_extension_agg(self, as_index, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping}) - result = df.groupby("B", as_index=as_index).A.mean() - _, index = pd.factorize(data_for_grouping, sort=True) - - # TODO(ExtensionIndex): remove coercion to object - # we don't have an easy way to represent an EA as an Index object - index = pd.Index(index, name="B", dtype=object) - expected = pd.Series([3, 1, 4], index=index, name="A") - if as_index: - self.assert_series_equal(result, expected) - else: - expected = expected.reset_index() - self.assert_frame_equal(result, expected) - - def test_groupby_extension_no_sort(self, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping}) - result = df.groupby("B", sort=False).A.mean() - _, index = pd.factorize(data_for_grouping, sort=False) - - # TODO(ExtensionIndex): remove coercion to object - # we don't have an easy way to represent an EA as an Index object - index = pd.Index(index, name="B", dtype=object) - expected = pd.Series([1, 3, 4], index=index, name="A") - self.assert_series_equal(result, expected) + pass class TestNumericReduce(base.BaseNumericReduceTests): From 3037ccd7f6ae82a1fe9308759d1ec505f45c8940 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Oct 2018 11:48:55 +0200 Subject: [PATCH 21/23] format docstring --- pandas/core/arrays/base.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7e2e0da199a01..ef7e25033f24e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -680,19 +680,20 @@ def _ndarray_values(self): return np.array(self) def _reduce(self, name, skipna=True, **kwargs): - """Return a scalar result of performing the reduction operation. + """ + Return a scalar result of performing the reduction operation. Parameters ---------- name : str - name of the function, support values are: - {any, all, min, max, sum, mean, median, prod, - std, var, sem, kurt, skew} + Name of the function, supported values are: + { any, all, min, max, sum, mean, median, prod, + std, var, sem, kurt, skew }. skipna : bool, default True - if True, skip NaN values - kwargs : dict - Additional keyword arguments passed to the reduction function - ddof is the only supported kwarg + If True, skip NaN values. + **kwargs + Additional keyword arguments passed to the reduction function. + Currently, `ddof` is the only supported kwarg. Returns ------- From b8726078a1a8968921c482d1d7e9f5ccd6578945 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Oct 2018 11:53:19 +0200 Subject: [PATCH 22/23] add test ensuring mean always gives float --- pandas/tests/arrays/test_integer.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index fdf4bc91f9f77..9102a5210d54b 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -617,6 +617,29 @@ def test_preserve_dtypes(op): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize('op', ['mean']) +def test_reduce_to_float(op): + # some reduce ops always return float, even if the result + # is a rounded number + df = pd.DataFrame({ + "A": ['a', 'b', 'b'], + "B": [1, None, 3], + "C": integer_array([1, None, 3], dtype='Int64'), + }) + + # op + result = getattr(df.C, op)() + assert isinstance(result, float) + + # groupby + result = getattr(df.groupby("A"), op)() + expected = pd.DataFrame({ + "B": np.array([1.0, 3.0]), + "C": np.array([1, 3], dtype="float64") + }, index=pd.Index(['a', 'b'], name='A')) + tm.assert_frame_equal(result, expected) + + def test_astype_nansafe(): # https://github.com/pandas-dev/pandas/pull/22343 arr = integer_array([np.nan, 1, 2], dtype="Int8") From aeaf5f32c0939d6bedc99240b47dd54252dda96a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Oct 2018 13:40:11 +0200 Subject: [PATCH 23/23] add prod to list of type preserving reductions --- pandas/core/arrays/integer.py | 2 +- pandas/tests/arrays/test_integer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 6f37526bbf49e..9917045f2f7d2 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -548,7 +548,7 @@ def _reduce(self, name, skipna=True, **kwargs): # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ['sum', 'min', 'max'] and notna(result): + elif name in ['sum', 'min', 'max', 'prod'] and notna(result): int_result = int(result) if int_result == result: result = int_result diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 9102a5210d54b..23ee8d217bd59 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -593,7 +593,7 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('op', ['sum', 'min', 'max']) +@pytest.mark.parametrize('op', ['sum', 'min', 'max', 'prod']) def test_preserve_dtypes(op): # TODO(#22346): preserve Int64 dtype # for ops that enable (mean would actually work here