From 4d7cc21d06853556a89c47dd70ecf0c9d0306fa6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Apr 2018 14:44:28 +0200 Subject: [PATCH 1/7] BUG: fix take() on empty arrays --- pandas/core/arrays/categorical.py | 11 +++++++++++ pandas/tests/extension/base/getitem.py | 9 +++++++++ pandas/tests/extension/decimal/array.py | 12 +++++++++++- pandas/tests/extension/json/array.py | 7 +++++-- 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b5a4785fd98a6..dcdcdc90a9869 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1740,6 +1740,17 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None): # but is passed thru internally assert isna(fill_value) + indexer = np.asarray(indexer) + + # take on empty array + if not len(self): + # only valid if result is an all-missing array + if (indexer == -1).all(): + codes = [-1] * len(indexer) + else: + raise IndexError( + "cannot do a non-empty take from an empty array.") + codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) result = self._constructor(codes, categories=self.categories, ordered=self.ordered, fastpath=True) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 566ba1721d13c..32086511b0b44 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd +import pandas.util.testing as tm from .base import BaseExtensionTests @@ -120,3 +121,11 @@ def test_take_sequence(self, data): assert result.iloc[0] == data[0] assert result.iloc[1] == data[1] assert result.iloc[2] == data[3] + + def test_take_empty_missing(self, data, na_value, na_cmp): + empty = data[:0] + result = empty.take([-1]) + na_cmp(result[0], na_value) + + with tm.assert_raises_regex(IndexError, "cannot do a non-empty take"): + empty.take([0, 1]) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index b66a14c77a059..9ab3a5a553be7 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -62,7 +62,7 @@ def __len__(self): return len(self.values) def __repr__(self): - return repr(self.values) + return "DecimalArray: " + repr(self.values) @property def nbytes(self): @@ -75,9 +75,19 @@ def isna(self): return np.array([x.is_nan() for x in self.values]) def take(self, indexer, allow_fill=True, fill_value=None): + indexer = np.asarray(indexer) mask = indexer == -1 + # take on empty array + if not len(self): + # only valid if result is an all-missing array + if mask.all(): + return type(self)([self._na_value] * len(indexer)) + else: + raise IndexError( + "cannot do a non-empty take from an empty array.") + indexer = _ensure_platform_int(indexer) out = self.values.take(indexer) out[mask] = self._na_value diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index d9ae49d87804a..c9a4cac282110 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -89,8 +89,11 @@ def isna(self): return np.array([x == self._na_value for x in self.data]) def take(self, indexer, allow_fill=True, fill_value=None): - output = [self.data[loc] if loc != -1 else self._na_value - for loc in indexer] + try: + output = [self.data[loc] if loc != -1 else self._na_value + for loc in indexer] + except IndexError: + raise IndexError("cannot do a non-empty take from an empty array.") return self._constructor_from_sequence(output) def copy(self, deep=False): From aba95d5c55390c2a9911be600b3143aa5177413e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Apr 2018 17:02:00 +0200 Subject: [PATCH 2/7] simplify check in categorical --- pandas/core/arrays/categorical.py | 11 +++-------- pandas/tests/extension/decimal/array.py | 1 - 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index dcdcdc90a9869..e37b2ab1e4b78 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1742,14 +1742,9 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None): indexer = np.asarray(indexer) - # take on empty array - if not len(self): - # only valid if result is an all-missing array - if (indexer == -1).all(): - codes = [-1] * len(indexer) - else: - raise IndexError( - "cannot do a non-empty take from an empty array.") + # take on empty array only valid if result is an all-missing array + if not len(self) and not (indexer == -1).all(): + raise IndexError("cannot do a non-empty take from an empty array.") codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) result = self._constructor(codes, categories=self.categories, diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 9ab3a5a553be7..306f56a40f568 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -75,7 +75,6 @@ def isna(self): return np.array([x.is_nan() for x in self.values]) def take(self, indexer, allow_fill=True, fill_value=None): - indexer = np.asarray(indexer) mask = indexer == -1 From 3d6fd2eba79129f5c8472c5c2d065a81bb145754 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Apr 2018 17:15:51 +0200 Subject: [PATCH 3/7] add docs --- pandas/core/arrays/base.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c281bd80cb274..9ee212032ced8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -458,6 +458,15 @@ def take(self, indexer, allow_fill=True, fill_value=None): Fill value to replace -1 values with. If applicable, this should use the sentinel missing value for this type. + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indexer is out of bounds for the array. + Notes ----- This should follow pandas' semantics where -1 indicates missing values. @@ -477,6 +486,16 @@ def take(self, indexer, allow_fill=True, fill_value=None): def take(self, indexer, allow_fill=True, fill_value=None): indexer = np.asarray(indexer) mask = indexer == -1 + + # take on empty array not handled as desired by numpy + if not len(self): + # only valid if result is an all-missing array + if mask.all(): + return type(self)([self._na_value] * len(indexer)) + else: + raise IndexError( + "cannot do a non-empty take from an empty array.") + result = self.data.take(indexer) result[mask] = np.nan # NA for this type return type(self)(result) From eacb3d5e02ea9f2b3f5096b883896f54ca69733d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Apr 2018 09:15:23 +0200 Subject: [PATCH 4/7] simplify check --- pandas/core/arrays/base.py | 13 ++++++------- pandas/tests/extension/decimal/array.py | 11 +++-------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9ee212032ced8..d0e366ccca5f0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -472,6 +472,9 @@ def take(self, indexer, allow_fill=True, fill_value=None): This should follow pandas' semantics where -1 indicates missing values. Positions where indexer is ``-1`` should be filled with the missing value for this type. + This gives rise to the special case of a take on an empty + ExtensionArray that does not raises an IndexError straight away + when the `indexer` is all ``-1``. This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the indexer is a sequence of values. @@ -488,13 +491,9 @@ def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 # take on empty array not handled as desired by numpy - if not len(self): - # only valid if result is an all-missing array - if mask.all(): - return type(self)([self._na_value] * len(indexer)) - else: - raise IndexError( - "cannot do a non-empty take from an empty array.") + # in case of -1 + if not len(self) and mask.all(): + return type(self)([np.nan] * len(indexer)) result = self.data.take(indexer) result[mask] = np.nan # NA for this type diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 306f56a40f568..eedcbd4a3cc92 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -78,14 +78,9 @@ def take(self, indexer, allow_fill=True, fill_value=None): indexer = np.asarray(indexer) mask = indexer == -1 - # take on empty array - if not len(self): - # only valid if result is an all-missing array - if mask.all(): - return type(self)([self._na_value] * len(indexer)) - else: - raise IndexError( - "cannot do a non-empty take from an empty array.") + # take on empty array not handled as desired by numpy in case of -1 + if not len(self) and mask.all(): + return type(self)([self._na_value] * len(indexer)) indexer = _ensure_platform_int(indexer) out = self.values.take(indexer) From c4faf8e12ed1009188466f3775d1d6bdf0910273 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Apr 2018 11:41:44 +0200 Subject: [PATCH 5/7] expand take test + undo changes in categorical --- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/categorical.py | 6 ------ pandas/tests/extension/base/getitem.py | 11 ++++++++++- pandas/tests/extension/category/test_categorical.py | 9 +++++++++ pandas/tests/extension/json/array.py | 3 ++- 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d0e366ccca5f0..d49a0d799526a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -491,7 +491,7 @@ def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 # take on empty array not handled as desired by numpy - # in case of -1 + # in case of -1 (all missing take) if not len(self) and mask.all(): return type(self)([np.nan] * len(indexer)) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e37b2ab1e4b78..b5a4785fd98a6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1740,12 +1740,6 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None): # but is passed thru internally assert isna(fill_value) - indexer = np.asarray(indexer) - - # take on empty array only valid if result is an all-missing array - if not len(self) and not (indexer == -1).all(): - raise IndexError("cannot do a non-empty take from an empty array.") - codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) result = self._constructor(codes, categories=self.categories, ordered=self.ordered, fastpath=True) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 32086511b0b44..e9d732257fdf7 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -122,7 +122,16 @@ def test_take_sequence(self, data): assert result.iloc[1] == data[1] assert result.iloc[2] == data[3] - def test_take_empty_missing(self, data, na_value, na_cmp): + def test_take(self, data, na_value, na_cmp): + result = data.take([0, -1]) + assert result.dtype == data.dtype + assert result[0] == data[0] + na_cmp(result[1], na_value) + + with tm.assert_raises_regex(IndexError, "out of bounds"): + data.take([len(data) + 1]) + + def test_take_empty(self, data, na_value, na_cmp): empty = data[:0] result = empty.take([-1]) na_cmp(result[0], na_value) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 7528299578326..616ec6a48cfdb 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -84,6 +84,15 @@ def test_getitem_scalar(self): # to break things by changing. pass + @pytest.mark.xfail(reason="Categorical.take buggy") + def test_take(self): + # TODO remove this once Categorical.take is fixed + pass + + @pytest.mark.xfail(reason="Categorical.take buggy") + def test_take_empty(self): + pass + class TestMissing(base.BaseMissingTests): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index c9a4cac282110..33843492cb706 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -93,7 +93,8 @@ def take(self, indexer, allow_fill=True, fill_value=None): output = [self.data[loc] if loc != -1 else self._na_value for loc in indexer] except IndexError: - raise IndexError("cannot do a non-empty take from an empty array.") + raise IndexError("Index is out of bounds or cannot do a " + "non-empty take from an empty array.") return self._constructor_from_sequence(output) def copy(self, deep=False): From 92572039d3c0e2151cb80e5e77416d551026de01 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Apr 2018 09:19:03 +0200 Subject: [PATCH 6/7] add reindex test --- pandas/tests/extension/base/getitem.py | 19 +++++++++++++++++++ .../extension/category/test_categorical.py | 4 ++++ 2 files changed, 23 insertions(+) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index e9d732257fdf7..c329fd86d4e9e 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -138,3 +138,22 @@ def test_take_empty(self, data, na_value, na_cmp): with tm.assert_raises_regex(IndexError, "cannot do a non-empty take"): empty.take([0, 1]) + + def test_reindex(self, data, na_value): + s = pd.Series(data) + result = s.reindex([0, 1, 3]) + expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3]) + self.assert_series_equal(result, expected) + + n = len(data) + result = s.reindex([-1, 0, n]) + expected = pd.Series( + data._constructor_from_sequence([na_value, data[0], na_value]), + index=[-1, 0, n]) + self.assert_series_equal(result, expected) + + result = s.reindex([n, n + 1]) + expected = pd.Series( + data._constructor_from_sequence([na_value, na_value]), + index=[n, n + 1]) + self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 616ec6a48cfdb..561f03c84ed5f 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -93,6 +93,10 @@ def test_take(self): def test_take_empty(self): pass + @pytest.mark.xfail(reason="test not written correctly for categorical") + def test_reindex(self): + pass + class TestMissing(base.BaseMissingTests): From a3f88ee2bdccc4c33a3b2f759274a0db056f4c0c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Apr 2018 09:32:41 +0200 Subject: [PATCH 7/7] add failing test for Series.take --- pandas/tests/extension/base/getitem.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index c329fd86d4e9e..4e2a65eba06dc 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -1,3 +1,4 @@ +import pytest import numpy as np import pandas as pd @@ -139,6 +140,15 @@ def test_take_empty(self, data, na_value, na_cmp): with tm.assert_raises_regex(IndexError, "cannot do a non-empty take"): empty.take([0, 1]) + @pytest.mark.xfail(reason="Series.take with extension array buggy for -1") + def test_take_series(self, data): + s = pd.Series(data) + result = s.take([0, -1]) + expected = pd.Series( + data._constructor_from_sequence([data[0], data[len(data) - 1]]), + index=[0, len(data) - 1]) + self.assert_series_equal(result, expected) + def test_reindex(self, data, na_value): s = pd.Series(data) result = s.reindex([0, 1, 3])