From 506421798b0a73b36a246ad7bf9fa8c9564bfb66 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Aug 2018 10:08:00 -0500 Subject: [PATCH 1/6] API: ExtensionDtype._is_numeric --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/arrays/integer.py | 4 ++++ pandas/core/dtypes/base.py | 17 +++++++++++++++++ pandas/core/internals/blocks.py | 8 +++++++- pandas/tests/extension/base/groupby.py | 13 +++++++++++++ pandas/tests/extension/base/interface.py | 4 ++++ pandas/tests/extension/decimal/array.py | 4 ++++ pandas/tests/extension/integer/test_integer.py | 15 +++++++++++++++ 8 files changed, 65 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index cf12759c051fc..c1765b773b6a1 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -442,6 +442,7 @@ ExtensionType Changes - ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) +- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). - The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) - Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) - :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index c126117060c3d..b818a860f9aa7 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -45,6 +45,10 @@ def is_signed_integer(self): def is_unsigned_integer(self): return self.kind == 'u' + @property + def _is_numeric(self): + return True + @cache_readonly def numpy_dtype(self): """ Return an instance of our numpy dtype """ diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 5f405e0d10657..2c90f0f7882a6 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -94,6 +94,18 @@ def is_dtype(cls, dtype): except TypeError: return False + @property + def _is_numeric(self): + # type: () -> bool + """ + Whether columns with this dtype should be considered numeric. + + By default ExtensionDtypes are assumed to be non-numeric. + They'll be excluded from operations that exclude non-numeric + columns, like groupby reductions. + """ + return False + class ExtensionDtype(_DtypeOpsMixin): """A custom data type, to be paired with an ExtensionArray. @@ -109,6 +121,11 @@ class ExtensionDtype(_DtypeOpsMixin): * name * construct_from_string + The following attributes influence the behavior of the dtype in + pandas operations + + * _is_numeric + Optionally one can override construct_array_type for construction with the name of this dtype via the Registry diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f0635014b166b..b8f9ab6ee2f60 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -669,7 +669,9 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, newb = self.copy() if copy else self if newb.is_numeric and self.is_numeric: - if newb.shape != self.shape: + # use values.shape, rather than newb.shape, as newb.shape + # may be incorrect for ExtensionBlocks. + if values.shape != self.shape: raise TypeError( "cannot set astype for copy = [{copy}] for dtype " "({dtype} [{itemsize}]) with smaller itemsize than " @@ -1947,6 +1949,10 @@ def is_view(self): """Extension arrays are never treated as views.""" return False + @property + def is_numeric(self): + return self.values.dtype._is_numeric + def setitem(self, indexer, value, mgr=None): """Set the value inplace, returning a same-typed block. diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index a29ef2a509a63..174997c7d51e1 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -67,3 +67,16 @@ def test_groupby_extension_apply(self, data_for_grouping, op): df.groupby("B").A.apply(op) df.groupby("A").apply(op) df.groupby("A").B.apply(op) + + def test_in_numeric_groupby(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping, + "C": [1, 1, 1, 1, 1, 1, 1, 1]}) + result = df.groupby("A").sum().columns + + if data_for_grouping.dtype._is_numeric: + expected = pd.Index(['B', 'C']) + else: + expected = pd.Index(['C']) + + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 69de0e1900831..99c3b92541cbd 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -67,3 +67,7 @@ def test_no_values_attribute(self, data): # code, disallowing this for now until solved assert not hasattr(data, 'values') assert not hasattr(data, '_values') + + def test_is_numeric_honored(self, data): + result = pd.Series(data) + assert result._data.blocks[0].is_numeric is data.dtype._is_numeric diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 108b8874b3ac5..3d28ab9978f38 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -33,6 +33,10 @@ def construct_from_string(cls, string): raise TypeError("Cannot construct a '{}' from " "'{}'".format(cls, string)) + @property + def _is_numeric(self): + return True + class DecimalArray(ExtensionArray, ExtensionScalarOpsMixin): dtype = DecimalDtype() diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 5e0f5bf0a5dcf..efc690a487d22 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -697,6 +697,21 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) +def test_groupby_mean_included(): + df = pd.DataFrame({ + "A": ['a', 'b', 'b'], + "B": [1, None, 3], + "C": IntegerArray([1, None, 3], dtype='Int64'), + }) + + result = df.groupby("A").sum() + expected = pd.DataFrame({ + "B": np.array([1.0, 3.0]), + "C": IntegerArray([1, 3], dtype="Int64") + }) + tm.assert_frame_equal(result, expected) + + # TODO(jreback) - these need testing / are broken # shift From 50de326a37873d8c6667fd3f33e36cddaa8af9b4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Aug 2018 11:49:51 -0500 Subject: [PATCH 2/6] fixed test --- pandas/tests/extension/integer/test_integer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index efc690a487d22..7b374d8331cae 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -705,10 +705,11 @@ def test_groupby_mean_included(): }) result = df.groupby("A").sum() + # TODO(#22346): preserve Int64 dtype expected = pd.DataFrame({ "B": np.array([1.0, 3.0]), - "C": IntegerArray([1, 3], dtype="Int64") - }) + "C": np.array([1, 3], dtype="int64") + }, index=pd.Index(['a', 'b'], name='A')) tm.assert_frame_equal(result, expected) From 1d96d22681abe15fe9666a3f3f7f99824c9de9df Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Tue, 14 Aug 2018 23:43:12 -0600 Subject: [PATCH 3/6] added test for DataFrame._get_numeric_data --- pandas/tests/frame/test_block_internals.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 8e012922d25f1..d096daaa0b664 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -11,7 +11,8 @@ import numpy as np from pandas import (DataFrame, Series, Timestamp, date_range, compat, - option_context) + option_context, Categorical) +from pandas.core.arrays import IntegerArray, IntervalArray from pandas.compat import StringIO import pandas as pd @@ -436,6 +437,17 @@ def test_get_numeric_data(self): expected = df assert_frame_equal(result, expected) + def test_get_numeric_data_extension_dtype(self): + # GH 22290 + df = DataFrame({ + 'A': IntegerArray([-10, np.nan, 0, 10, 20, 30], dtype='Int64'), + 'B': Categorical(list('abcabc')), + 'C': IntegerArray([0, 1, 2, 3, np.nan, 5], dtype='UInt8'), + 'D': IntervalArray.from_breaks(range(7))}) + result = df._get_numeric_data() + expected = df.loc[:, ['A', 'C']] + assert_frame_equal(result, expected) + def test_convert_objects(self): oops = self.mixed_frame.T.T From db9af360aa7ea8ed9ad6911a92de839d269e41be Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 Aug 2018 07:37:37 -0500 Subject: [PATCH 4/6] Pass ndim --- pandas/core/internals/blocks.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b8f9ab6ee2f60..b568d234b8558 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -662,16 +662,14 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, pass newb = make_block(values, placement=self.mgr_locs, - klass=klass) + klass=klass, ndim=self.ndim) except: if errors == 'raise': raise newb = self.copy() if copy else self if newb.is_numeric and self.is_numeric: - # use values.shape, rather than newb.shape, as newb.shape - # may be incorrect for ExtensionBlocks. - if values.shape != self.shape: + if newb.shape != self.shape: raise TypeError( "cannot set astype for copy = [{copy}] for dtype " "({dtype} [{itemsize}]) with smaller itemsize than " From a3fdc2ae9baafa7feceeb0cb67066b1a4ed52951 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 Aug 2018 07:40:42 -0500 Subject: [PATCH 5/6] Note plotting --- pandas/core/dtypes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 2c90f0f7882a6..c6bdb8656c3c6 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -102,7 +102,7 @@ def _is_numeric(self): By default ExtensionDtypes are assumed to be non-numeric. They'll be excluded from operations that exclude non-numeric - columns, like groupby reductions. + columns, like groupby reductions, plotting, etc. """ return False From 2779419b7d502d6279aabbd5332e6aabc1d73448 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 20 Aug 2018 13:16:31 +0200 Subject: [PATCH 6/6] small edit --- pandas/core/dtypes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index c6bdb8656c3c6..1ecb6234ad2d9 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -102,7 +102,7 @@ def _is_numeric(self): By default ExtensionDtypes are assumed to be non-numeric. They'll be excluded from operations that exclude non-numeric - columns, like groupby reductions, plotting, etc. + columns, like (groupby) reductions, plotting, etc. """ return False