From 7267544e3c2a504b60b5511c52653e5a8d23c2fd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Feb 2018 14:36:22 -0600 Subject: [PATCH 1/9] ENH: ExtensionArray.unique --- pandas/core/algorithms.py | 6 +++--- pandas/core/arrays/base.py | 12 ++++++++++++ pandas/tests/extension/base/methods.py | 11 +++++++++++ pandas/tests/extension/json/array.py | 7 +++++++ 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d616e3f92aa4d..1b464e6b0b333 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -12,6 +12,7 @@ ABCSeries, ABCIndex, ABCIndexClass, ABCCategorical) from pandas.core.dtypes.common import ( + is_array_like, is_unsigned_integer_dtype, is_signed_integer_dtype, is_integer_dtype, is_complex_dtype, is_object_dtype, @@ -168,8 +169,7 @@ def _ensure_arraylike(values): """ ensure that we are arraylike if not already """ - if not isinstance(values, (np.ndarray, ABCCategorical, - ABCIndexClass, ABCSeries)): + if not is_array_like(values): inferred = lib.infer_dtype(values) if inferred in ['mixed', 'string', 'unicode']: if isinstance(values, tuple): @@ -356,7 +356,7 @@ def unique(values): # categorical is a fast-path # this will coerce Categorical, CategoricalIndex, # and category dtypes Series to same return of Category - if is_categorical_dtype(values): + if is_extension_array_dtype(values): values = getattr(values, '.values', values) return values.unique() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cec881394a021..0ddae332abc04 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -216,6 +216,18 @@ def isna(self): """ raise AbstractMethodError(self) + def unique(self): + """Compute the ExtensionArray of unique values. + + Returns + ------- + uniques : ExtensionArray + """ + from pandas import unique + + uniques = unique(self.astype(object)) + return type(self)(uniques) + # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c77811ca63926..f7038375c7a93 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -30,3 +30,14 @@ def test_count(self, data_missing): def test_apply_simple_series(self, data): result = pd.Series(data).apply(id) assert isinstance(result, pd.Series) + + @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) + @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) + def test_unique(self, data, box, method): + duplicated = box(type(data)([data[0], data[0]])) + + result = method(duplicated) + + assert len(result) == 1 + assert isinstance(result, type(data)) + assert result[0] == duplicated[0] diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 90aac93c68f64..1731f037a530f 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -82,6 +82,13 @@ def take(self, indexer, allow_fill=True, fill_value=None): def copy(self, deep=False): return type(self)(self.data[:]) + def unique(self): + # Parent method doesn't work since np.array will try to infer + # a 2-dim object. + return type(self)([ + dict(x) for x in list(set(tuple(d.items()) for d in self.data)) + ]) + @property def _na_value(self): return {} From 07148dbf9d54c2e489dc3604c092071ed0a62dfb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Feb 2018 12:34:57 -0600 Subject: [PATCH 2/9] Linting --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1b464e6b0b333..0d15fc030b821 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -10,7 +10,7 @@ maybe_promote, construct_1d_object_array_from_listlike) from pandas.core.dtypes.generic import ( ABCSeries, ABCIndex, - ABCIndexClass, ABCCategorical) + ABCIndexClass) from pandas.core.dtypes.common import ( is_array_like, is_unsigned_integer_dtype, is_signed_integer_dtype, From c8b5852412f174fe3dd9b210dc1fec991770a55d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Feb 2018 14:49:39 -0600 Subject: [PATCH 3/9] Update comment, remove buggy line --- pandas/core/algorithms.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0d15fc030b821..9101fca58d5fa 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -353,11 +353,8 @@ def unique(values): values = _ensure_arraylike(values) - # categorical is a fast-path - # this will coerce Categorical, CategoricalIndex, - # and category dtypes Series to same return of Category if is_extension_array_dtype(values): - values = getattr(values, '.values', values) + # Dispatch to extension dtype's unique. return values.unique() original = values From 509957399cd3554178a18786f7a7a936e9c09a50 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Feb 2018 09:30:19 -0600 Subject: [PATCH 4/9] Fixed 32-bit test failures --- pandas/tests/extension/base/getitem.py | 3 ++- pandas/tests/extension/decimal/array.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index f43971e928cac..72d6ff817bb0b 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -48,7 +48,8 @@ def test_loc_series(self, data): tm.assert_series_equal(result, expected) def test_loc_frame(self, data): - df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + df = pd.DataFrame({"A": data, 'B': np.arange(len(data), + dtype='int64')}) expected = pd.DataFrame({"A": data[:4]}) # slice -> frame diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index f526ac5996a10..918272177d418 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -68,7 +68,7 @@ def isna(self): def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 - out = self.values.take(indexer) + out = self.values.take(indexer.astype(np.intp)) out[mask] = self._na_value return type(self)(out) From a5d6b67c5bd35faf67883e3e79d8e0532447d20f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Feb 2018 09:39:57 -0600 Subject: [PATCH 5/9] Revert "Fixed 32-bit test failures" This reverts commit 509957399cd3554178a18786f7a7a936e9c09a50. --- pandas/tests/extension/base/getitem.py | 3 +-- pandas/tests/extension/decimal/array.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 72d6ff817bb0b..f43971e928cac 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -48,8 +48,7 @@ def test_loc_series(self, data): tm.assert_series_equal(result, expected) def test_loc_frame(self, data): - df = pd.DataFrame({"A": data, 'B': np.arange(len(data), - dtype='int64')}) + df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) expected = pd.DataFrame({"A": data[:4]}) # slice -> frame diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 918272177d418..f526ac5996a10 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -68,7 +68,7 @@ def isna(self): def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 - out = self.values.take(indexer.astype(np.intp)) + out = self.values.take(indexer) out[mask] = self._na_value return type(self)(out) From 011d02ed3c4c72699673216849aa7381dc965986 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Feb 2018 08:08:08 -0600 Subject: [PATCH 6/9] Added _constructor --- pandas/core/arrays/base.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0ddae332abc04..2e14be341671e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -27,11 +27,11 @@ class ExtensionArray(object): * copy * _concat_same_type - Some additional methods are available to satisfy pandas' internal, private - block API. + Some additional methods are available to satisfy pandas' internals. * _can_hold_na * _formatting_values + * _constructor This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise @@ -49,13 +49,32 @@ class ExtensionArray(object): assumptions on how the data are stored, just that it can be converted to a NumPy array. - Extension arrays should be able to be constructed with instances of - the class, i.e. ``ExtensionArray(extension_array)`` should return - an instance, not error. + There are a few restrictions on how ExtensionArrays are created. + + * An ExtensionArray should be valid, i.e. + ``ExtensionArray(extension_array)`` should return an instance + * A sequence of the scalar type should be valid, i.e. + ``ExtensionArray(Sequence[ExtensionDtype.type]])`` should return + an instance, not error. """ # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. # Don't override this. _typ = 'extension' + + @classmethod + def _constructor(cls, data): + """Construct a new instance of of the extension array. + + Parameters + ---------- + data : Sequence + + Returns + ------- + ExtensionArray + """ + return cls(data) + # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ @@ -226,7 +245,7 @@ def unique(self): from pandas import unique uniques = unique(self.astype(object)) - return type(self)(uniques) + return self._constructor(uniques) # ------------------------------------------------------------------------ # Indexing methods From b8711d30d195c733d5e60fab0ef90f9cb4db7780 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Feb 2018 08:36:11 -0600 Subject: [PATCH 7/9] Revert "Added _constructor" This reverts commit 011d02ed3c4c72699673216849aa7381dc965986. --- pandas/core/arrays/base.py | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2e14be341671e..0ddae332abc04 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -27,11 +27,11 @@ class ExtensionArray(object): * copy * _concat_same_type - Some additional methods are available to satisfy pandas' internals. + Some additional methods are available to satisfy pandas' internal, private + block API. * _can_hold_na * _formatting_values - * _constructor This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise @@ -49,32 +49,13 @@ class ExtensionArray(object): assumptions on how the data are stored, just that it can be converted to a NumPy array. - There are a few restrictions on how ExtensionArrays are created. - - * An ExtensionArray should be valid, i.e. - ``ExtensionArray(extension_array)`` should return an instance - * A sequence of the scalar type should be valid, i.e. - ``ExtensionArray(Sequence[ExtensionDtype.type]])`` should return - an instance, not error. + Extension arrays should be able to be constructed with instances of + the class, i.e. ``ExtensionArray(extension_array)`` should return + an instance, not error. """ # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. # Don't override this. _typ = 'extension' - - @classmethod - def _constructor(cls, data): - """Construct a new instance of of the extension array. - - Parameters - ---------- - data : Sequence - - Returns - ------- - ExtensionArray - """ - return cls(data) - # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ @@ -245,7 +226,7 @@ def unique(self): from pandas import unique uniques = unique(self.astype(object)) - return self._constructor(uniques) + return type(self)(uniques) # ------------------------------------------------------------------------ # Indexing methods From a260d3590a948bddec90c1cb7255aed506a3b1b7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Mar 2018 15:35:32 -0800 Subject: [PATCH 8/9] Updated --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 348e2336bc2ca..1f33081a5f610 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -246,7 +246,7 @@ def unique(self): from pandas import unique uniques = unique(self.astype(object)) - return type(self)(uniques) + return self._constructor_from_sequence(uniques) # ------------------------------------------------------------------------ # Indexing methods From fc04612e756fd8ca0dd1a6a60ec1320a179f7db5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Mar 2018 09:17:32 -0500 Subject: [PATCH 9/9] Use from_sequence --- pandas/tests/extension/base/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index ae430a4cbe4fe..7ce80e25d8cf6 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -35,7 +35,7 @@ def test_apply_simple_series(self, data): @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): - duplicated = box(type(data)([data[0], data[0]])) + duplicated = box(data._constructor_from_sequence([data[0], data[0]])) result = method(duplicated)