From a62dbda207689000f0ae31b095a81e00bd9b543a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 6 Feb 2020 11:19:54 +0100 Subject: [PATCH 1/7] TST: expand tests for ExtensionArray setitem with nullable arrays --- pandas/tests/extension/base/setitem.py | 97 ++++++++++++++++++++++---- pandas/tests/extension/test_numpy.py | 37 ++++++++++ 2 files changed, 122 insertions(+), 12 deletions(-) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index e0ca603aaa0ed..33a94106057a9 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -4,7 +4,6 @@ import pytest import pandas as pd -from pandas.core.arrays.numpy_ import PandasDtype from .base import BaseExtensionTests @@ -93,6 +92,91 @@ def test_setitem_iloc_scalar_multiple_homogoneous(self, data): df.iloc[10, 1] = data[1] assert df.loc[10, "B"] == data[1] + def test_setitem_mask(self, data, box_in_series): + # numpy bool mask + arr = data[:5].copy() + expected = arr.take([0, 0, 0, 3, 4]) + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + mask = np.array([True, True, True, False, False]) + arr[mask] = data[0] + self.assert_equal(expected, arr) + + def test_setitem_mask_boolean_array(self, data, box_in_series): + # GH 31446 - nullable boolean mask + arr = data[:5].copy() + expected = arr.take([0, 0, 0, 3, 4]) + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + mask = pd.array([True, True, True, False, False], dtype="boolean") + arr[mask] = data[0] + self.assert_equal(expected, arr) + + def test_setitem_mask_raises(self, data, box_in_series): + # wrong length + mask = np.array([True, False]) + + if box_in_series: + data = pd.Series(data) + + with pytest.raises(IndexError): + data[mask] = data[0] + + mask = pd.array(mask, dtype="boolean") + with pytest.raises(IndexError): + data[mask] = data[0] + + def test_setitem_mask_boolean_array_raises(self, data, box_in_series): + # missing values in mask + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + mask[:2] = pd.NA + + if box_in_series: + data = pd.Series(data) + + msg = ( + "Cannot mask with a boolean indexer containing NA values|" + "cannot mask with array containing NA / NaN values" + ) + with pytest.raises(ValueError, match=msg): + data[mask] = data[0] + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_setitem_integer_array(self, data, idx, box_in_series): + arr = data[:5].copy() + expected = data.take([0, 0, 0, 3, 4]) + + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + + arr[idx] = arr[0] + self.assert_equal(arr, expected) + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")], + ids=["list", "integer-array"], + ) + def test_setitem_integer_with_missing_raises(self, data, idx): + arr = data.copy() + + msg = "Cannot index with an integer indexer containing NA values" + with pytest.raises(ValueError, match=msg): + arr[idx] = arr[0] + + # TODO this raises KeyError about labels not found (it tries label-based) + # import pandas._testing as tm + # s = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))]) + # with pytest.raises(ValueError, match=msg): + # s[idx] = s[0] + @pytest.mark.parametrize("as_callable", [True, False]) @pytest.mark.parametrize("setter", ["loc", None]) def test_setitem_mask_aligned(self, data, as_callable, setter): @@ -196,14 +280,3 @@ def test_setitem_preserves_views(self, data): data[0] = data[1] assert view1[0] == data[1] assert view2[0] == data[1] - - def test_setitem_nullable_mask(self, data): - # GH 31446 - # TODO: there is some issue with PandasArray, therefore, - # TODO: skip the setitem test for now, and fix it later - if data.dtype != PandasDtype("object"): - arr = data[:5] - expected = data.take([0, 0, 0, 3, 4]) - mask = pd.array([True, True, True, False, False]) - arr[mask] = data[0] - self.assert_extension_array_equal(expected, arr) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 8a820c8746857..a9de59185a44c 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -396,6 +396,43 @@ def test_setitem_scalar_key_sequence_raise(self, data): # Failed: DID NOT RAISE super().test_setitem_scalar_key_sequence_raise(data) + # TODO: there is some issue with PandasArray, therefore, + # skip the setitem test for now, and fix it later (GH 31446) + + @skip_nested + def test_setitem_mask(self, data, box_in_series): + super().test_setitem_mask(data, box_in_series) + + @skip_nested + def test_setitem_mask_boolean_array(self, data, box_in_series): + super().test_setitem_mask_boolean_array(data, box_in_series) + + @skip_nested + def test_setitem_mask_raises(self, data, box_in_series): + super().test_setitem_mask_raises(data, box_in_series) + + @skip_nested + def test_setitem_mask_boolean_array_raises(self, data, box_in_series): + super().test_setitem_mask_boolean_array_raises(data, box_in_series) + + @skip_nested + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_setitem_integer_array(self, data, idx, box_in_series): + super().test_setitem_integer_array(data, idx, box_in_series) + + @skip_nested + @pytest.mark.parametrize( + "idx", + [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")], + ids=["list", "integer-array"], + ) + def test_setitem_integer_with_missing_raises(self, data, idx): + super().test_setitem_integer_with_missing_raises(data, idx) + @skip_nested class TestParsing(BaseNumPyTests, base.BaseParsingTests): From 3d713ef05e8a53cfd51011a8e4295ececa07a981 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 6 Feb 2020 20:48:10 +0100 Subject: [PATCH 2/7] combine tests --- pandas/tests/extension/base/setitem.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 33a94106057a9..343be975db748 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -92,25 +92,20 @@ def test_setitem_iloc_scalar_multiple_homogoneous(self, data): df.iloc[10, 1] = data[1] assert df.loc[10, "B"] == data[1] - def test_setitem_mask(self, data, box_in_series): - # numpy bool mask - arr = data[:5].copy() - expected = arr.take([0, 0, 0, 3, 4]) - if box_in_series: - arr = pd.Series(arr) - expected = pd.Series(expected) - mask = np.array([True, True, True, False, False]) - arr[mask] = data[0] - self.assert_equal(expected, arr) - - def test_setitem_mask_boolean_array(self, data, box_in_series): - # GH 31446 - nullable boolean mask + @pytest.mark.parametrize( + "mask", + [ + np.array([True, True, True, False, False]), + pd.array([True, True, True, False, False], dtype="boolean"), + ], + ids=["numpy-array", "boolean-array"], + ) + def test_setitem_mask(self, data, mask, box_in_series): arr = data[:5].copy() expected = arr.take([0, 0, 0, 3, 4]) if box_in_series: arr = pd.Series(arr) expected = pd.Series(expected) - mask = pd.array([True, True, True, False, False], dtype="boolean") arr[mask] = data[0] self.assert_equal(expected, arr) From d2e1af5fddd4d1d5d8ce88015295f81db2c89595 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 7 Feb 2020 08:32:02 +0100 Subject: [PATCH 3/7] fix numpy test --- pandas/tests/extension/test_numpy.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index a9de59185a44c..041c536603db6 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -400,12 +400,16 @@ def test_setitem_scalar_key_sequence_raise(self, data): # skip the setitem test for now, and fix it later (GH 31446) @skip_nested - def test_setitem_mask(self, data, box_in_series): - super().test_setitem_mask(data, box_in_series) - - @skip_nested - def test_setitem_mask_boolean_array(self, data, box_in_series): - super().test_setitem_mask_boolean_array(data, box_in_series) + @pytest.mark.parametrize( + "mask", + [ + np.array([True, True, True, False, False]), + pd.array([True, True, True, False, False], dtype="boolean"), + ], + ids=["numpy-array", "boolean-array"], + ) + def test_setitem_mask(self, data, mask, box_in_series): + super().test_setitem_mask(data, mask, box_in_series) @skip_nested def test_setitem_mask_raises(self, data, box_in_series): From af11ed0223082048e554bfa62db67fc2277d980d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 7 Feb 2020 08:40:31 +0100 Subject: [PATCH 4/7] test msg --- pandas/tests/extension/base/setitem.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 343be975db748..094398269d072 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -116,11 +116,11 @@ def test_setitem_mask_raises(self, data, box_in_series): if box_in_series: data = pd.Series(data) - with pytest.raises(IndexError): + with pytest.raises(IndexError, match="wrong length"): data[mask] = data[0] mask = pd.array(mask, dtype="boolean") - with pytest.raises(IndexError): + with pytest.raises(IndexError, match="wrong length"): data[mask] = data[0] def test_setitem_mask_boolean_array_raises(self, data, box_in_series): From 8796c6e994657893db61b797c166ba30e50202df Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 10 Feb 2020 11:04:11 +0100 Subject: [PATCH 5/7] use xfail --- pandas/tests/extension/base/setitem.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 094398269d072..f441f9e91424e 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -4,6 +4,7 @@ import pytest import pandas as pd +import pandas._testing as tm from .base import BaseExtensionTests @@ -155,23 +156,27 @@ def test_setitem_integer_array(self, data, idx, box_in_series): self.assert_equal(arr, expected) @pytest.mark.parametrize( - "idx", - [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")], - ids=["list", "integer-array"], + "idx, box_in_series", + [ + ([0, 1, 2, pd.NA], False), + pytest.param([0, 1, 2, pd.NA], True, marks=pytest.mark.xfail), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + ], + ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], ) - def test_setitem_integer_with_missing_raises(self, data, idx): + def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): arr = data.copy() + # TODO(xfail) this raises KeyError about labels not found (it tries label-based) + # for list of labels with Series + if box_in_series: + arr = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))]) + msg = "Cannot index with an integer indexer containing NA values" with pytest.raises(ValueError, match=msg): arr[idx] = arr[0] - # TODO this raises KeyError about labels not found (it tries label-based) - # import pandas._testing as tm - # s = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))]) - # with pytest.raises(ValueError, match=msg): - # s[idx] = s[0] - @pytest.mark.parametrize("as_callable", [True, False]) @pytest.mark.parametrize("setter", ["loc", None]) def test_setitem_mask_aligned(self, data, as_callable, setter): From 0be92b76925e9fcda66edfe347b730fc4881865c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 10 Feb 2020 11:42:53 +0100 Subject: [PATCH 6/7] fix numpy test --- pandas/tests/extension/test_numpy.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 041c536603db6..388f0e879eb93 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -430,12 +430,17 @@ def test_setitem_integer_array(self, data, idx, box_in_series): @skip_nested @pytest.mark.parametrize( - "idx", - [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")], - ids=["list", "integer-array"], + "idx, box_in_series", + [ + ([0, 1, 2, pd.NA], False), + pytest.param([0, 1, 2, pd.NA], True, marks=pytest.mark.xfail), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + ], + ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], ) - def test_setitem_integer_with_missing_raises(self, data, idx): - super().test_setitem_integer_with_missing_raises(data, idx) + def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): + super().test_setitem_integer_with_missing_raises(data, idx, box_in_series) @skip_nested From db73fff495b27be60e50f8fe96e6565133705218 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 13 Feb 2020 09:48:14 +0100 Subject: [PATCH 7/7] add xfail reason --- pandas/tests/extension/base/setitem.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index be979d8205810..af70799c0236e 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -159,7 +159,9 @@ def test_setitem_integer_array(self, data, idx, box_in_series): "idx, box_in_series", [ ([0, 1, 2, pd.NA], False), - pytest.param([0, 1, 2, pd.NA], True, marks=pytest.mark.xfail), + pytest.param( + [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948") + ), (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), ],