diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index e45dbb393a8de..36af5d32ae461 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -14,13 +14,90 @@ class TestSeriesAccessor: - # TODO: collect other Series accessor tests def test_to_dense(self): - s = pd.Series([0, 1, 0, 10], dtype="Sparse[int64]") - result = s.sparse.to_dense() + ser = pd.Series([0, 1, 0, 10], dtype="Sparse[int64]") + result = ser.sparse.to_dense() expected = pd.Series([0, 1, 0, 10]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("attr", ["npoints", "density", "fill_value", "sp_values"]) + def test_get_attributes(self, attr): + arr = SparseArray([0, 1]) + ser = pd.Series(arr) + + result = getattr(ser.sparse, attr) + expected = getattr(arr, attr) + assert result == expected + + @td.skip_if_no_scipy + def test_from_coo(self): + import scipy.sparse + + row = [0, 3, 1, 0] + col = [0, 3, 1, 2] + data = [4, 5, 7, 9] + # TODO(scipy#13585): Remove dtype when scipy is fixed + # https://github.com/scipy/scipy/issues/13585 + sp_array = scipy.sparse.coo_matrix((data, (row, col)), dtype="int") + result = pd.Series.sparse.from_coo(sp_array) + + index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]]) + expected = pd.Series([4, 9, 7, 5], index=index, dtype="Sparse[int]") + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + @pytest.mark.parametrize( + "sort_labels, expected_rows, expected_cols, expected_values_pos", + [ + ( + False, + [("b", 2), ("a", 2), ("b", 1), ("a", 1)], + [("z", 1), ("z", 2), ("x", 2), ("z", 0)], + {1: (1, 0), 3: (3, 3)}, + ), + ( + True, + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], + [("x", 2), ("z", 0), ("z", 1), ("z", 2)], + {1: (1, 2), 3: (0, 1)}, + ), + ], + ) + def test_to_coo( + self, sort_labels, expected_rows, expected_cols, expected_values_pos + ): + import scipy.sparse + + values = SparseArray([0, np.nan, 1, 0, None, 3], fill_value=0) + index = pd.MultiIndex.from_tuples( + [ + ("b", 2, "z", 1), + ("a", 2, "z", 2), + ("a", 2, "z", 1), + ("a", 2, "x", 2), + ("b", 1, "z", 1), + ("a", 1, "z", 0), + ] + ) + ss = pd.Series(values, index=index) + + expected_A = np.zeros((4, 4)) + for value, (row, col) in expected_values_pos.items(): + expected_A[row, col] = value + + A, rows, cols = ss.sparse.to_coo( + row_levels=(0, 1), column_levels=(2, 3), sort_labels=sort_labels + ) + assert isinstance(A, scipy.sparse.coo_matrix) + tm.assert_numpy_array_equal(A.toarray(), expected_A) + assert rows == expected_rows + assert cols == expected_cols + + def test_non_sparse_raises(self): + ser = pd.Series([1, 2, 3]) + with pytest.raises(AttributeError, match=".sparse"): + ser.sparse.density + class TestFrameAccessor: def test_accessor_raises(self): diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 3db1ee9faad78..8f975e942db93 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -27,10 +27,6 @@ def mix(request): class TestSparseArrayArithmetics: - - _base = np.array - _klass = SparseArray - def _assert(self, a, b): # We have to use tm.assert_sp_array_equal. See GH #45126 tm.assert_numpy_array_equal(a, b) @@ -54,7 +50,7 @@ def _check_numeric_ops(self, a, b, a_dense, b_dense, mix: bool, op): self._assert(result, expected) def _check_bool_result(self, res): - assert isinstance(res, self._klass) + assert isinstance(res, SparseArray) assert isinstance(res.dtype, SparseDtype) assert res.dtype.subtype == np.bool_ assert isinstance(res.fill_value, bool) @@ -133,25 +129,25 @@ def test_float_scalar( mark = pytest.mark.xfail(raises=AssertionError, reason="GH#38172") request.node.add_marker(mark) - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - a = self._klass(values, kind=kind, fill_value=fill_value) + a = SparseArray(values, kind=kind, fill_value=fill_value) self._check_numeric_ops(a, scalar, values, scalar, mix, op) def test_float_scalar_comparison(self, kind): - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - a = self._klass(values, kind=kind) + a = SparseArray(values, kind=kind) self._check_comparison_ops(a, 1, values, 1) self._check_comparison_ops(a, 0, values, 0) self._check_comparison_ops(a, 3, values, 3) - a = self._klass(values, kind=kind, fill_value=0) + a = SparseArray(values, kind=kind, fill_value=0) self._check_comparison_ops(a, 1, values, 1) self._check_comparison_ops(a, 0, values, 0) self._check_comparison_ops(a, 3, values, 3) - a = self._klass(values, kind=kind, fill_value=2) + a = SparseArray(values, kind=kind, fill_value=2) self._check_comparison_ops(a, 1, values, 1) self._check_comparison_ops(a, 0, values, 0) self._check_comparison_ops(a, 3, values, 3) @@ -160,11 +156,11 @@ def test_float_same_index_without_nans(self, kind, mix, all_arithmetic_functions # when sp_index are the same op = all_arithmetic_functions - values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) - rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) + values = np.array([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = np.array([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) self._check_numeric_ops(a, b, values, rvalues, mix, op) def test_float_same_index_with_nans( @@ -180,94 +176,94 @@ def test_float_same_index_with_nans( ): mark = pytest.mark.xfail(raises=AssertionError, reason="GH#38172") request.node.add_marker(mark) - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) self._check_numeric_ops(a, b, values, rvalues, mix, op) def test_float_same_index_comparison(self, kind): # when sp_index are the same - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) self._check_comparison_ops(a, b, values, rvalues) - values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) - rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) + values = np.array([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = np.array([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) self._check_comparison_ops(a, b, values, rvalues) def test_float_array(self, kind, mix, all_arithmetic_functions): op = all_arithmetic_functions - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) self._check_numeric_ops(a, b, values, rvalues, mix, op) self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind) + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind) self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, kind=kind, fill_value=1) - b = self._klass(rvalues, kind=kind, fill_value=2) + a = SparseArray(values, kind=kind, fill_value=1) + b = SparseArray(rvalues, kind=kind, fill_value=2) self._check_numeric_ops(a, b, values, rvalues, mix, op) def test_float_array_different_kind(self, mix, all_arithmetic_functions): op = all_arithmetic_functions - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - a = self._klass(values, kind="integer") - b = self._klass(rvalues, kind="block") + a = SparseArray(values, kind="integer") + b = SparseArray(rvalues, kind="block") self._check_numeric_ops(a, b, values, rvalues, mix, op) self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) - a = self._klass(values, kind="integer", fill_value=0) - b = self._klass(rvalues, kind="block") + a = SparseArray(values, kind="integer", fill_value=0) + b = SparseArray(rvalues, kind="block") self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, kind="integer", fill_value=0) - b = self._klass(rvalues, kind="block", fill_value=0) + a = SparseArray(values, kind="integer", fill_value=0) + b = SparseArray(rvalues, kind="block", fill_value=0) self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, kind="integer", fill_value=1) - b = self._klass(rvalues, kind="block", fill_value=2) + a = SparseArray(values, kind="integer", fill_value=1) + b = SparseArray(rvalues, kind="block", fill_value=2) self._check_numeric_ops(a, b, values, rvalues, mix, op) def test_float_array_comparison(self, kind): - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) self._check_comparison_ops(a, b, values, rvalues) self._check_comparison_ops(a, b * 0, values, rvalues * 0) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind) + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind) self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, kind=kind, fill_value=1) - b = self._klass(rvalues, kind=kind, fill_value=2) + a = SparseArray(values, kind=kind, fill_value=1) + b = SparseArray(rvalues, kind=kind, fill_value=2) self._check_comparison_ops(a, b, values, rvalues) def test_int_array(self, kind, mix, all_arithmetic_functions): @@ -276,33 +272,33 @@ def test_int_array(self, kind, mix, all_arithmetic_functions): # have to specify dtype explicitly until fixing GH 667 dtype = np.int64 - values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) - rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) + values = np.array([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) + rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) - a = self._klass(values, dtype=dtype, kind=kind) + a = SparseArray(values, dtype=dtype, kind=kind) assert a.dtype == SparseDtype(dtype) - b = self._klass(rvalues, dtype=dtype, kind=kind) + b = SparseArray(rvalues, dtype=dtype, kind=kind) assert b.dtype == SparseDtype(dtype) self._check_numeric_ops(a, b, values, rvalues, mix, op) self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) - a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) + a = SparseArray(values, fill_value=0, dtype=dtype, kind=kind) assert a.dtype == SparseDtype(dtype) - b = self._klass(rvalues, dtype=dtype, kind=kind) + b = SparseArray(rvalues, dtype=dtype, kind=kind) assert b.dtype == SparseDtype(dtype) self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) + a = SparseArray(values, fill_value=0, dtype=dtype, kind=kind) assert a.dtype == SparseDtype(dtype) - b = self._klass(rvalues, fill_value=0, dtype=dtype, kind=kind) + b = SparseArray(rvalues, fill_value=0, dtype=dtype, kind=kind) assert b.dtype == SparseDtype(dtype) self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, fill_value=1, dtype=dtype, kind=kind) + a = SparseArray(values, fill_value=1, dtype=dtype, kind=kind) assert a.dtype == SparseDtype(dtype, fill_value=1) - b = self._klass(rvalues, fill_value=2, dtype=dtype, kind=kind) + b = SparseArray(rvalues, fill_value=2, dtype=dtype, kind=kind) assert b.dtype == SparseDtype(dtype, fill_value=2) self._check_numeric_ops(a, b, values, rvalues, mix, op) @@ -310,46 +306,46 @@ def test_int_array_comparison(self, kind): dtype = "int64" # int32 NI ATM - values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) - rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) + values = np.array([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) + rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) - a = self._klass(values, dtype=dtype, kind=kind) - b = self._klass(rvalues, dtype=dtype, kind=kind) + a = SparseArray(values, dtype=dtype, kind=kind) + b = SparseArray(rvalues, dtype=dtype, kind=kind) self._check_comparison_ops(a, b, values, rvalues) self._check_comparison_ops(a, b * 0, values, rvalues * 0) - a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) - b = self._klass(rvalues, dtype=dtype, kind=kind) + a = SparseArray(values, dtype=dtype, kind=kind, fill_value=0) + b = SparseArray(rvalues, dtype=dtype, kind=kind) self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) - b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0) + a = SparseArray(values, dtype=dtype, kind=kind, fill_value=0) + b = SparseArray(rvalues, dtype=dtype, kind=kind, fill_value=0) self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, dtype=dtype, kind=kind, fill_value=1) - b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2) + a = SparseArray(values, dtype=dtype, kind=kind, fill_value=1) + b = SparseArray(rvalues, dtype=dtype, kind=kind, fill_value=2) self._check_comparison_ops(a, b, values, rvalues) @pytest.mark.parametrize("fill_value", [True, False, np.nan]) def test_bool_same_index(self, kind, fill_value): # GH 14000 # when sp_index are the same - values = self._base([True, False, True, True], dtype=np.bool_) - rvalues = self._base([True, False, True, True], dtype=np.bool_) + values = np.array([True, False, True, True], dtype=np.bool_) + rvalues = np.array([True, False, True, True], dtype=np.bool_) - a = self._klass(values, kind=kind, dtype=np.bool_, fill_value=fill_value) - b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value) + a = SparseArray(values, kind=kind, dtype=np.bool_, fill_value=fill_value) + b = SparseArray(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value) self._check_logical_ops(a, b, values, rvalues) @pytest.mark.parametrize("fill_value", [True, False, np.nan]) def test_bool_array_logical(self, kind, fill_value): # GH 14000 # when sp_index are the same - values = self._base([True, False, True, False, True, True], dtype=np.bool_) - rvalues = self._base([True, False, False, True, False, True], dtype=np.bool_) + values = np.array([True, False, True, False, True, True], dtype=np.bool_) + rvalues = np.array([True, False, False, True, False, True], dtype=np.bool_) - a = self._klass(values, kind=kind, dtype=np.bool_, fill_value=fill_value) - b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value) + a = SparseArray(values, kind=kind, dtype=np.bool_, fill_value=fill_value) + b = SparseArray(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value) self._check_logical_ops(a, b, values, rvalues) def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, request): @@ -361,28 +357,28 @@ def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, reques rdtype = "int64" - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) assert b.dtype == SparseDtype(rdtype) self._check_numeric_ops(a, b, values, rvalues, mix, op) self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind) + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind) assert b.dtype == SparseDtype(rdtype) self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) assert b.dtype == SparseDtype(rdtype) self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, kind=kind, fill_value=1) - b = self._klass(rvalues, kind=kind, fill_value=2) + a = SparseArray(values, kind=kind, fill_value=1) + b = SparseArray(rvalues, kind=kind, fill_value=2) assert b.dtype == SparseDtype(rdtype, fill_value=2) self._check_numeric_ops(a, b, values, rvalues, mix, op) @@ -390,28 +386,28 @@ def test_mixed_array_comparison(self, kind): rdtype = "int64" # int32 NI ATM - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) assert b.dtype == SparseDtype(rdtype) self._check_comparison_ops(a, b, values, rvalues) self._check_comparison_ops(a, b * 0, values, rvalues * 0) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind) + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind) assert b.dtype == SparseDtype(rdtype) self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) assert b.dtype == SparseDtype(rdtype) self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, kind=kind, fill_value=1) - b = self._klass(rvalues, kind=kind, fill_value=2) + a = SparseArray(values, kind=kind, fill_value=1) + b = SparseArray(rvalues, kind=kind, fill_value=2) assert b.dtype == SparseDtype(rdtype, fill_value=2) self._check_comparison_ops(a, b, values, rvalues) @@ -495,36 +491,60 @@ def test_sparray_inplace(): tm.assert_sp_array_equal(sparray, expected) -@pytest.mark.parametrize("fill_value", [True, False]) -def test_invert(fill_value): - arr = np.array([True, False, False, True]) - sparray = SparseArray(arr, fill_value=fill_value) - result = ~sparray - expected = SparseArray(~arr, fill_value=not fill_value) - tm.assert_sp_array_equal(result, expected) - - result = ~pd.Series(sparray) - expected = pd.Series(expected) - tm.assert_series_equal(result, expected) - - result = ~pd.DataFrame({"A": sparray}) - expected = pd.DataFrame({"A": expected}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("fill_value", [0, np.nan]) -@pytest.mark.parametrize("op", [operator.pos, operator.neg]) -def test_unary_op(op, fill_value): - arr = np.array([0, 1, np.nan, 2]) - sparray = SparseArray(arr, fill_value=fill_value) - result = op(sparray) - expected = SparseArray(op(arr), fill_value=op(fill_value)) - tm.assert_sp_array_equal(result, expected) - - @pytest.mark.parametrize("cons", [list, np.array, SparseArray]) def test_mismatched_length_cmp_op(cons): left = SparseArray([True, True]) right = cons([True, True, True]) with pytest.raises(ValueError, match="operands have mismatched length"): left & right + + +@pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"]) +def test_binary_operators(op): + op = getattr(operator, op) + data1 = np.random.randn(20) + data2 = np.random.randn(20) + + data1[::2] = np.nan + data2[::3] = np.nan + + arr1 = SparseArray(data1) + arr2 = SparseArray(data2) + + data1[::2] = 3 + data2[::3] = 3 + farr1 = SparseArray(data1, fill_value=3) + farr2 = SparseArray(data2, fill_value=3) + + def _check_op(op, first, second): + res = op(first, second) + exp = SparseArray( + op(first.to_dense(), second.to_dense()), fill_value=first.fill_value + ) + assert isinstance(res, SparseArray) + tm.assert_almost_equal(res.to_dense(), exp.to_dense()) + + res2 = op(first, second.to_dense()) + assert isinstance(res2, SparseArray) + tm.assert_sp_array_equal(res, res2) + + res3 = op(first.to_dense(), second) + assert isinstance(res3, SparseArray) + tm.assert_sp_array_equal(res, res3) + + res4 = op(first, 4) + assert isinstance(res4, SparseArray) + + # Ignore this if the actual op raises (e.g. pow). + try: + exp = op(first.to_dense(), 4) + exp_fv = op(first.fill_value, 4) + except ValueError: + pass + else: + tm.assert_almost_equal(res4.fill_value, exp_fv) + tm.assert_almost_equal(res4.to_dense(), exp) + + with np.errstate(all="ignore"): + for first_arr, second_arr in [(arr1, arr2), (farr1, farr2)]: + _check_op(op, first_arr, second_arr) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index baa7537e3cc2f..5874e817477a9 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1,4 +1,3 @@ -import operator import re import warnings @@ -6,7 +5,6 @@ import pytest from pandas._libs.sparse import IntIndex -import pandas.util._test_decorators as td import pandas as pd from pandas import isna @@ -24,309 +22,6 @@ def setup_method(self, method): self.arr = SparseArray(self.arr_data) self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) - def test_constructor_dtype(self): - arr = SparseArray([np.nan, 1, 2, np.nan]) - assert arr.dtype == SparseDtype(np.float64, np.nan) - assert arr.dtype.subtype == np.float64 - assert np.isnan(arr.fill_value) - - arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0) - assert arr.dtype == SparseDtype(np.float64, 0) - assert arr.fill_value == 0 - - arr = SparseArray([0, 1, 2, 4], dtype=np.float64) - assert arr.dtype == SparseDtype(np.float64, np.nan) - assert np.isnan(arr.fill_value) - - arr = SparseArray([0, 1, 2, 4], dtype=np.int64) - assert arr.dtype == SparseDtype(np.int64, 0) - assert arr.fill_value == 0 - - arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64) - assert arr.dtype == SparseDtype(np.int64, 0) - assert arr.fill_value == 0 - - arr = SparseArray([0, 1, 2, 4], dtype=None) - assert arr.dtype == SparseDtype(np.int64, 0) - assert arr.fill_value == 0 - - arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None) - assert arr.dtype == SparseDtype(np.int64, 0) - assert arr.fill_value == 0 - - def test_constructor_dtype_str(self): - result = SparseArray([1, 2, 3], dtype="int") - expected = SparseArray([1, 2, 3], dtype=int) - tm.assert_sp_array_equal(result, expected) - - def test_constructor_sparse_dtype(self): - result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1)) - expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64) - tm.assert_sp_array_equal(result, expected) - assert result.sp_values.dtype == np.dtype("int64") - - def test_constructor_sparse_dtype_str(self): - result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]") - expected = SparseArray([1, 0, 0, 1], dtype=np.int32) - tm.assert_sp_array_equal(result, expected) - assert result.sp_values.dtype == np.dtype("int32") - - def test_constructor_object_dtype(self): - # GH 11856 - arr = SparseArray(["A", "A", np.nan, "B"], dtype=object) - assert arr.dtype == SparseDtype(object) - assert np.isnan(arr.fill_value) - - arr = SparseArray(["A", "A", np.nan, "B"], dtype=object, fill_value="A") - assert arr.dtype == SparseDtype(object, "A") - assert arr.fill_value == "A" - - # GH 17574 - data = [False, 0, 100.0, 0.0] - arr = SparseArray(data, dtype=object, fill_value=False) - assert arr.dtype == SparseDtype(object, False) - assert arr.fill_value is False - arr_expected = np.array(data, dtype=object) - it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) - assert np.fromiter(it, dtype=np.bool_).all() - - @pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int]) - def test_constructor_na_dtype(self, dtype): - with pytest.raises(ValueError, match="Cannot convert"): - SparseArray([0, 1, np.nan], dtype=dtype) - - def test_constructor_warns_when_losing_timezone(self): - # GH#32501 warn when losing timezone information - dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") - - expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]")) - - with tm.assert_produces_warning(UserWarning): - result = SparseArray(dti) - - tm.assert_sp_array_equal(result, expected) - - with tm.assert_produces_warning(UserWarning): - result = SparseArray(pd.Series(dti)) - - tm.assert_sp_array_equal(result, expected) - - def test_constructor_spindex_dtype(self): - arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) - # XXX: Behavior change: specifying SparseIndex no longer changes the - # fill_value - expected = SparseArray([0, 1, 2, 0], kind="integer") - tm.assert_sp_array_equal(arr, expected) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 - - arr = SparseArray( - data=[1, 2, 3], - sparse_index=IntIndex(4, [1, 2, 3]), - dtype=np.int64, - fill_value=0, - ) - exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) - tm.assert_sp_array_equal(arr, exp) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 - - arr = SparseArray( - data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64 - ) - exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) - tm.assert_sp_array_equal(arr, exp) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 - - arr = SparseArray( - data=[1, 2, 3], - sparse_index=IntIndex(4, [1, 2, 3]), - dtype=None, - fill_value=0, - ) - exp = SparseArray([0, 1, 2, 3], dtype=None) - tm.assert_sp_array_equal(arr, exp) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 - - @pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])]) - def test_constructor_spindex_dtype_scalar(self, sparse_index): - # scalar input - arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) - exp = SparseArray([1], dtype=None) - tm.assert_sp_array_equal(arr, exp) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 - - arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) - exp = SparseArray([1], dtype=None) - tm.assert_sp_array_equal(arr, exp) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 - - def test_constructor_spindex_dtype_scalar_broadcasts(self): - arr = SparseArray( - data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None - ) - exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) - tm.assert_sp_array_equal(arr, exp) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 - - @pytest.mark.parametrize( - "data, fill_value", - [ - (np.array([1, 2]), 0), - (np.array([1.0, 2.0]), np.nan), - ([True, False], False), - ([pd.Timestamp("2017-01-01")], pd.NaT), - ], - ) - def test_constructor_inferred_fill_value(self, data, fill_value): - result = SparseArray(data).fill_value - - if isna(fill_value): - assert isna(result) - else: - assert result == fill_value - - @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) - @pytest.mark.parametrize("size", [0, 10]) - @td.skip_if_no_scipy - def test_from_spmatrix(self, size, format): - import scipy.sparse - - mat = scipy.sparse.random(size, 1, density=0.5, format=format) - result = SparseArray.from_spmatrix(mat) - - result = np.asarray(result) - expected = mat.toarray().ravel() - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) - @td.skip_if_no_scipy - def test_from_spmatrix_including_explicit_zero(self, format): - import scipy.sparse - - mat = scipy.sparse.random(10, 1, density=0.5, format=format) - mat.data[0] = 0 - result = SparseArray.from_spmatrix(mat) - - result = np.asarray(result) - expected = mat.toarray().ravel() - tm.assert_numpy_array_equal(result, expected) - - @td.skip_if_no_scipy - def test_from_spmatrix_raises(self): - import scipy.sparse - - mat = scipy.sparse.eye(5, 4, format="csc") - - with pytest.raises(ValueError, match="not '4'"): - SparseArray.from_spmatrix(mat) - - @pytest.mark.parametrize( - "scalar,dtype", - [ - (False, SparseDtype(bool, False)), - (0.0, SparseDtype("float64", 0)), - (1, SparseDtype("int64", 1)), - ("z", SparseDtype("object", "z")), - ], - ) - def test_scalar_with_index_infer_dtype(self, scalar, dtype): - # GH 19163 - with tm.assert_produces_warning( - FutureWarning, match="The index argument has been deprecated" - ): - arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) - exp = SparseArray([scalar, scalar, scalar], fill_value=scalar) - - tm.assert_sp_array_equal(arr, exp) - - assert arr.dtype == dtype - assert exp.dtype == dtype - - def test_getitem_bool_sparse_array(self): - # GH 23122 - spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True) - exp = SparseArray([np.nan, 2, np.nan, 5, 6]) - tm.assert_sp_array_equal(self.arr[spar_bool], exp) - - spar_bool = ~spar_bool - res = self.arr[spar_bool] - exp = SparseArray([np.nan, 1, 3, 4, np.nan]) - tm.assert_sp_array_equal(res, exp) - - spar_bool = SparseArray( - [False, True, np.nan] * 3, dtype=np.bool8, fill_value=np.nan - ) - res = self.arr[spar_bool] - exp = SparseArray([np.nan, 3, 5]) - tm.assert_sp_array_equal(res, exp) - - def test_getitem_bool_sparse_array_as_comparison(self): - # GH 45110 - arr = SparseArray([1, 2, 3, 4, np.nan, np.nan], fill_value=np.nan) - res = arr[arr > 2] - exp = SparseArray([3.0, 4.0], fill_value=np.nan) - tm.assert_sp_array_equal(res, exp) - - def test_get_item(self): - - assert np.isnan(self.arr[1]) - assert self.arr[2] == 1 - assert self.arr[7] == 5 - - assert self.zarr[0] == 0 - assert self.zarr[2] == 1 - assert self.zarr[7] == 5 - - errmsg = "must be an integer between -10 and 10" - - with pytest.raises(IndexError, match=errmsg): - self.arr[11] - - with pytest.raises(IndexError, match=errmsg): - self.arr[-11] - - assert self.arr[-1] == self.arr[len(self.arr) - 1] - - def test_take_scalar_raises(self): - msg = "'indices' must be an array, not a scalar '2'." - with pytest.raises(ValueError, match=msg): - self.arr.take(2) - - def test_take(self): - exp = SparseArray(np.take(self.arr_data, [2, 3])) - tm.assert_sp_array_equal(self.arr.take([2, 3]), exp) - - exp = SparseArray(np.take(self.arr_data, [0, 1, 2])) - tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp) - - def test_take_all_empty(self): - a = pd.array([0, 0], dtype=SparseDtype("int64")) - result = a.take([0, 1], allow_fill=True, fill_value=np.nan) - tm.assert_sp_array_equal(a, result) - - def test_take_fill_value(self): - data = np.array([1, np.nan, 0, 3, 0]) - sparse = SparseArray(data, fill_value=0) - - exp = SparseArray(np.take(data, [0]), fill_value=0) - tm.assert_sp_array_equal(sparse.take([0]), exp) - - exp = SparseArray(np.take(data, [1, 3, 4]), fill_value=0) - tm.assert_sp_array_equal(sparse.take([1, 3, 4]), exp) - - def test_take_negative(self): - exp = SparseArray(np.take(self.arr_data, [-1])) - tm.assert_sp_array_equal(self.arr.take([-1]), exp) - - exp = SparseArray(np.take(self.arr_data, [-4, -3, -2])) - tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp) - @pytest.mark.parametrize("fill_value", [0, None, np.nan]) def test_shift_fill_value(self, fill_value): # GH #24128 @@ -337,287 +32,6 @@ def test_shift_fill_value(self, fill_value): exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), fill_value=8.0) tm.assert_sp_array_equal(res, exp) - def test_bad_take(self): - with pytest.raises(IndexError, match="bounds"): - self.arr.take([11]) - - def test_take_filling(self): - # similar tests as GH 12631 - sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4]) - result = sparse.take(np.array([1, 0, -1])) - expected = SparseArray([np.nan, np.nan, 4]) - tm.assert_sp_array_equal(result, expected) - - # XXX: test change: fill_value=True -> allow_fill=True - result = sparse.take(np.array([1, 0, -1]), allow_fill=True) - expected = SparseArray([np.nan, np.nan, np.nan]) - tm.assert_sp_array_equal(result, expected) - - # allow_fill=False - result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - expected = SparseArray([np.nan, np.nan, 4]) - tm.assert_sp_array_equal(result, expected) - - msg = "Invalid value in 'indices'" - with pytest.raises(ValueError, match=msg): - sparse.take(np.array([1, 0, -2]), allow_fill=True) - - with pytest.raises(ValueError, match=msg): - sparse.take(np.array([1, 0, -5]), allow_fill=True) - - msg = "out of bounds value in 'indices'" - with pytest.raises(IndexError, match=msg): - sparse.take(np.array([1, -6])) - with pytest.raises(IndexError, match=msg): - sparse.take(np.array([1, 5])) - with pytest.raises(IndexError, match=msg): - sparse.take(np.array([1, 5]), allow_fill=True) - - def test_take_filling_fill_value(self): - # same tests as GH 12631 - sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0) - result = sparse.take(np.array([1, 0, -1])) - expected = SparseArray([0, np.nan, 4], fill_value=0) - tm.assert_sp_array_equal(result, expected) - - # fill_value - result = sparse.take(np.array([1, 0, -1]), allow_fill=True) - # XXX: behavior change. - # the old way of filling self.fill_value doesn't follow EA rules. - # It's supposed to be self.dtype.na_value (nan in this case) - expected = SparseArray([0, np.nan, np.nan], fill_value=0) - tm.assert_sp_array_equal(result, expected) - - # allow_fill=False - result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - expected = SparseArray([0, np.nan, 4], fill_value=0) - tm.assert_sp_array_equal(result, expected) - - msg = "Invalid value in 'indices'." - with pytest.raises(ValueError, match=msg): - sparse.take(np.array([1, 0, -2]), allow_fill=True) - with pytest.raises(ValueError, match=msg): - sparse.take(np.array([1, 0, -5]), allow_fill=True) - - msg = "out of bounds value in 'indices'" - with pytest.raises(IndexError, match=msg): - sparse.take(np.array([1, -6])) - with pytest.raises(IndexError, match=msg): - sparse.take(np.array([1, 5])) - with pytest.raises(IndexError, match=msg): - sparse.take(np.array([1, 5]), fill_value=True) - - @pytest.mark.parametrize("kind", ["block", "integer"]) - def test_take_filling_all_nan(self, kind): - sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan], kind=kind) - result = sparse.take(np.array([1, 0, -1])) - expected = SparseArray([np.nan, np.nan, np.nan], kind=kind) - tm.assert_sp_array_equal(result, expected) - - result = sparse.take(np.array([1, 0, -1]), fill_value=True) - expected = SparseArray([np.nan, np.nan, np.nan], kind=kind) - tm.assert_sp_array_equal(result, expected) - - msg = "out of bounds value in 'indices'" - with pytest.raises(IndexError, match=msg): - sparse.take(np.array([1, -6])) - with pytest.raises(IndexError, match=msg): - sparse.take(np.array([1, 5])) - with pytest.raises(IndexError, match=msg): - sparse.take(np.array([1, 5]), fill_value=True) - - def test_set_item(self): - def setitem(): - self.arr[5] = 3 - - def setslice(): - self.arr[1:5] = 2 - - with pytest.raises(TypeError, match="assignment via setitem"): - setitem() - - with pytest.raises(TypeError, match="assignment via setitem"): - setslice() - - def test_constructor_from_too_large_array(self): - with pytest.raises(TypeError, match="expected dimension <= 1 data"): - SparseArray(np.arange(10).reshape((2, 5))) - - def test_constructor_from_sparse(self): - res = SparseArray(self.zarr) - assert res.fill_value == 0 - tm.assert_almost_equal(res.sp_values, self.zarr.sp_values) - - def test_constructor_copy(self): - cp = SparseArray(self.arr, copy=True) - cp.sp_values[:3] = 0 - assert not (self.arr.sp_values[:3] == 0).any() - - not_copy = SparseArray(self.arr) - not_copy.sp_values[:3] = 0 - assert (self.arr.sp_values[:3] == 0).all() - - def test_constructor_bool(self): - # GH 10648 - data = np.array([False, False, True, True, False, False]) - arr = SparseArray(data, fill_value=False, dtype=bool) - - assert arr.dtype == SparseDtype(bool) - tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True])) - # Behavior change: np.asarray densifies. - # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) - tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32)) - - dense = arr.to_dense() - assert dense.dtype == bool - tm.assert_numpy_array_equal(dense, data) - - def test_constructor_bool_fill_value(self): - arr = SparseArray([True, False, True], dtype=None) - assert arr.dtype == SparseDtype(np.bool_) - assert not arr.fill_value - - arr = SparseArray([True, False, True], dtype=np.bool_) - assert arr.dtype == SparseDtype(np.bool_) - assert not arr.fill_value - - arr = SparseArray([True, False, True], dtype=np.bool_, fill_value=True) - assert arr.dtype == SparseDtype(np.bool_, True) - assert arr.fill_value - - def test_constructor_float32(self): - # GH 10648 - data = np.array([1.0, np.nan, 3], dtype=np.float32) - arr = SparseArray(data, dtype=np.float32) - - assert arr.dtype == SparseDtype(np.float32) - tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32)) - # Behavior change: np.asarray densifies. - # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) - tm.assert_numpy_array_equal( - arr.sp_index.indices, np.array([0, 2], dtype=np.int32) - ) - - dense = arr.to_dense() - assert dense.dtype == np.float32 - tm.assert_numpy_array_equal(dense, data) - - def test_astype(self): - # float -> float - arr = SparseArray([None, None, 0, 2]) - result = arr.astype("Sparse[float32]") - expected = SparseArray([None, None, 0, 2], dtype=np.dtype("float32")) - tm.assert_sp_array_equal(result, expected) - - dtype = SparseDtype("float64", fill_value=0) - result = arr.astype(dtype) - expected = SparseArray._simple_new( - np.array([0.0, 2.0], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype - ) - tm.assert_sp_array_equal(result, expected) - - dtype = SparseDtype("int64", 0) - result = arr.astype(dtype) - expected = SparseArray._simple_new( - np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype - ) - tm.assert_sp_array_equal(result, expected) - - arr = SparseArray([0, np.nan, 0, 1], fill_value=0) - with pytest.raises(ValueError, match="NA"): - arr.astype("Sparse[i8]") - - def test_astype_bool(self): - a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) - with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"): - result = a.astype(bool) - expected = SparseArray( - [True, False, False, True], dtype=SparseDtype(bool, False) - ) - tm.assert_sp_array_equal(result, expected) - - # update fill value - result = a.astype(SparseDtype(bool, False)) - expected = SparseArray( - [True, False, False, True], dtype=SparseDtype(bool, False) - ) - tm.assert_sp_array_equal(result, expected) - - def test_astype_all(self, any_real_numpy_dtype): - vals = np.array([1, 2, 3]) - arr = SparseArray(vals, fill_value=1) - typ = np.dtype(any_real_numpy_dtype) - with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"): - res = arr.astype(typ) - assert res.dtype == SparseDtype(typ, 1) - assert res.sp_values.dtype == typ - - tm.assert_numpy_array_equal(np.asarray(res.to_dense()), vals.astype(typ)) - - @pytest.mark.parametrize( - "arr, dtype, expected", - [ - ( - SparseArray([0, 1]), - "float", - SparseArray([0.0, 1.0], dtype=SparseDtype(float, 0.0)), - ), - (SparseArray([0, 1]), bool, SparseArray([False, True])), - ( - SparseArray([0, 1], fill_value=1), - bool, - SparseArray([False, True], dtype=SparseDtype(bool, True)), - ), - pytest.param( - SparseArray([0, 1]), - "datetime64[ns]", - SparseArray( - np.array([0, 1], dtype="datetime64[ns]"), - dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")), - ), - marks=[pytest.mark.xfail(reason="NumPy-7619")], - ), - ( - SparseArray([0, 1, 10]), - str, - SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), - ), - (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), - ( - SparseArray([0, 1, 0]), - object, - SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)), - ), - ], - ) - def test_astype_more(self, arr, dtype, expected): - - if isinstance(dtype, SparseDtype): - warn = None - else: - warn = FutureWarning - - with tm.assert_produces_warning(warn, match="astype from SparseDtype"): - result = arr.astype(dtype) - tm.assert_sp_array_equal(result, expected) - - def test_astype_nan_raises(self): - arr = SparseArray([1.0, np.nan]) - with pytest.raises(ValueError, match="Cannot convert non-finite"): - msg = "astype from SparseDtype" - with tm.assert_produces_warning(FutureWarning, match=msg): - arr.astype(int) - - def test_astype_copy_false(self): - # GH#34456 bug caused by using .view instead of .astype in astype_nansafe - arr = SparseArray([1, 2, 3]) - - dtype = SparseDtype(float, 0) - - result = arr.astype(dtype, copy=False) - expected = SparseArray([1.0, 2.0, 3.0], fill_value=0.0) - tm.assert_sp_array_equal(result, expected) - def test_set_fill_value(self): arr = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan) arr.fill_value = 2 @@ -627,7 +41,7 @@ def test_set_fill_value(self): arr.fill_value = 2 assert arr.fill_value == 2 - # XXX: this seems fine? You can construct an integer + # TODO: this seems fine? You can construct an integer # sparsearray with NaN fill value, why not update one? # coerces to int # msg = "unable to set fill_value 3\\.1 to int64 dtype" @@ -644,8 +58,9 @@ def test_set_fill_value(self): arr.fill_value = True assert arr.fill_value + # FIXME: don't leave commented-out # coerces to bool - # XXX: we can construct an sparse array of bool + # TODO: we can construct an sparse array of bool # type and use as fill_value any value # msg = "fill_value must be True, False or nan" # with pytest.raises(ValueError, match=msg): @@ -706,163 +121,6 @@ def test_dense_repr(self, vals, fill_value): tm.assert_numpy_array_equal(res2, vals) - def test_getitem(self): - def _checkit(i): - tm.assert_almost_equal(self.arr[i], self.arr.to_dense()[i]) - - for i in range(len(self.arr)): - _checkit(i) - _checkit(-i) - - def test_getitem_arraylike_mask(self): - arr = SparseArray([0, 1, 2]) - result = arr[[True, False, True]] - expected = SparseArray([0, 2]) - tm.assert_sp_array_equal(result, expected) - - @pytest.mark.parametrize( - "slc", - [ - np.s_[:], - np.s_[1:10], - np.s_[1:100], - np.s_[10:1], - np.s_[:-3], - np.s_[-5:-4], - np.s_[:-12], - np.s_[-12:], - np.s_[2:], - np.s_[2::3], - np.s_[::2], - np.s_[::-1], - np.s_[::-2], - np.s_[1:6:2], - np.s_[:-6:-2], - ], - ) - @pytest.mark.parametrize( - "as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []] - ) - def test_getslice(self, slc, as_dense): - as_dense = np.array(as_dense) - arr = SparseArray(as_dense) - - result = arr[slc] - expected = SparseArray(as_dense[slc]) - - tm.assert_sp_array_equal(result, expected) - - def test_getslice_tuple(self): - dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) - - sparse = SparseArray(dense) - res = sparse[(slice(4, None),)] - exp = SparseArray(dense[4:]) - tm.assert_sp_array_equal(res, exp) - - sparse = SparseArray(dense, fill_value=0) - res = sparse[(slice(4, None),)] - exp = SparseArray(dense[4:], fill_value=0) - tm.assert_sp_array_equal(res, exp) - - msg = "too many indices for array" - with pytest.raises(IndexError, match=msg): - sparse[4:, :] - - with pytest.raises(IndexError, match=msg): - # check numpy compat - dense[4:, :] - - def test_boolean_slice_empty(self): - arr = SparseArray([0, 1, 2]) - res = arr[[False, False, False]] - assert res.dtype == arr.dtype - - def test_neg_operator(self): - arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8) - res = -arr - exp = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8) - tm.assert_sp_array_equal(exp, res) - - arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8) - res = -arr - exp = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8) - tm.assert_sp_array_equal(exp, res) - - def test_abs_operator(self): - arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8) - res = abs(arr) - exp = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8) - tm.assert_sp_array_equal(exp, res) - - arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8) - res = abs(arr) - exp = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8) - tm.assert_sp_array_equal(exp, res) - - def test_invert_operator(self): - arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool8) - res = ~arr - exp = SparseArray( - np.invert([False, True, False, True]), fill_value=True, dtype=np.bool8 - ) - res = ~arr - tm.assert_sp_array_equal(exp, res) - - arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32) - res = ~arr - exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32) - - @pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"]) - def test_binary_operators(self, op): - op = getattr(operator, op) - data1 = np.random.randn(20) - data2 = np.random.randn(20) - - data1[::2] = np.nan - data2[::3] = np.nan - - arr1 = SparseArray(data1) - arr2 = SparseArray(data2) - - data1[::2] = 3 - data2[::3] = 3 - farr1 = SparseArray(data1, fill_value=3) - farr2 = SparseArray(data2, fill_value=3) - - def _check_op(op, first, second): - res = op(first, second) - exp = SparseArray( - op(first.to_dense(), second.to_dense()), fill_value=first.fill_value - ) - assert isinstance(res, SparseArray) - tm.assert_almost_equal(res.to_dense(), exp.to_dense()) - - res2 = op(first, second.to_dense()) - assert isinstance(res2, SparseArray) - tm.assert_sp_array_equal(res, res2) - - res3 = op(first.to_dense(), second) - assert isinstance(res3, SparseArray) - tm.assert_sp_array_equal(res, res3) - - res4 = op(first, 4) - assert isinstance(res4, SparseArray) - - # Ignore this if the actual op raises (e.g. pow). - try: - exp = op(first.to_dense(), 4) - exp_fv = op(first.fill_value, 4) - except ValueError: - pass - else: - tm.assert_almost_equal(res4.fill_value, exp_fv) - tm.assert_almost_equal(res4.to_dense(), exp) - - with np.errstate(all="ignore"): - for first_arr, second_arr in [(arr1, arr2), (farr1, farr2)]: - _check_op(op, first_arr, second_arr) - def test_pickle(self): def _check_roundtrip(obj): unpickled = tm.round_trip_pickle(obj) @@ -980,163 +238,6 @@ def test_nonzero(self): class TestSparseArrayAnalytics: - @pytest.mark.parametrize( - "data,pos,neg", - [ - ([True, True, True], True, False), - ([1, 2, 1], 1, 0), - ([1.0, 2.0, 1.0], 1.0, 0.0), - ], - ) - def test_all(self, data, pos, neg): - # GH 17570 - out = SparseArray(data).all() - assert out - - out = SparseArray(data, fill_value=pos).all() - assert out - - data[1] = neg - out = SparseArray(data).all() - assert not out - - out = SparseArray(data, fill_value=pos).all() - assert not out - - @pytest.mark.parametrize( - "data,pos,neg", - [ - ([True, True, True], True, False), - ([1, 2, 1], 1, 0), - ([1.0, 2.0, 1.0], 1.0, 0.0), - ], - ) - def test_numpy_all(self, data, pos, neg): - # GH 17570 - out = np.all(SparseArray(data)) - assert out - - out = np.all(SparseArray(data, fill_value=pos)) - assert out - - data[1] = neg - out = np.all(SparseArray(data)) - assert not out - - out = np.all(SparseArray(data, fill_value=pos)) - assert not out - - # raises with a different message on py2. - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.all(SparseArray(data), out=np.array([])) - - @pytest.mark.parametrize( - "data,pos,neg", - [ - ([False, True, False], True, False), - ([0, 2, 0], 2, 0), - ([0.0, 2.0, 0.0], 2.0, 0.0), - ], - ) - def test_any(self, data, pos, neg): - # GH 17570 - out = SparseArray(data).any() - assert out - - out = SparseArray(data, fill_value=pos).any() - assert out - - data[1] = neg - out = SparseArray(data).any() - assert not out - - out = SparseArray(data, fill_value=pos).any() - assert not out - - @pytest.mark.parametrize( - "data,pos,neg", - [ - ([False, True, False], True, False), - ([0, 2, 0], 2, 0), - ([0.0, 2.0, 0.0], 2.0, 0.0), - ], - ) - def test_numpy_any(self, data, pos, neg): - # GH 17570 - out = np.any(SparseArray(data)) - assert out - - out = np.any(SparseArray(data, fill_value=pos)) - assert out - - data[1] = neg - out = np.any(SparseArray(data)) - assert not out - - out = np.any(SparseArray(data, fill_value=pos)) - assert not out - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.any(SparseArray(data), out=out) - - def test_sum(self): - data = np.arange(10).astype(float) - out = SparseArray(data).sum() - assert out == 45.0 - - data[5] = np.nan - out = SparseArray(data, fill_value=2).sum() - assert out == 40.0 - - out = SparseArray(data, fill_value=np.nan).sum() - assert out == 40.0 - - @pytest.mark.parametrize( - "arr", - [np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])], - ) - @pytest.mark.parametrize("fill_value", [0, 1, np.nan]) - @pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)]) - def test_sum_min_count(self, arr, fill_value, min_count, expected): - # https://github.com/pandas-dev/pandas/issues/25777 - sparray = SparseArray(arr, fill_value=fill_value) - result = sparray.sum(min_count=min_count) - if np.isnan(expected): - assert np.isnan(result) - else: - assert result == expected - - def test_bool_sum_min_count(self): - spar_bool = pd.arrays.SparseArray( - [False, True] * 5, dtype=np.bool8, fill_value=True - ) - res = spar_bool.sum(min_count=1) - assert res == 5 - res = spar_bool.sum(min_count=11) - assert isna(res) - - def test_numpy_sum(self): - data = np.arange(10).astype(float) - out = np.sum(SparseArray(data)) - assert out == 45.0 - - data[5] = np.nan - out = np.sum(SparseArray(data, fill_value=2)) - assert out == 40.0 - - out = np.sum(SparseArray(data, fill_value=np.nan)) - assert out == 40.0 - - msg = "the 'dtype' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.sum(SparseArray(data), dtype=np.int64) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.sum(SparseArray(data), out=out) - @pytest.mark.parametrize( "data,expected", [ @@ -1177,32 +278,6 @@ def test_cumsum(self, data, expected, numpy): with pytest.raises(ValueError, match=msg): SparseArray(data).cumsum(axis=axis) - def test_mean(self): - data = np.arange(10).astype(float) - out = SparseArray(data).mean() - assert out == 4.5 - - data[5] = np.nan - out = SparseArray(data).mean() - assert out == 40.0 / 9 - - def test_numpy_mean(self): - data = np.arange(10).astype(float) - out = np.mean(SparseArray(data)) - assert out == 4.5 - - data[5] = np.nan - out = np.mean(SparseArray(data)) - assert out == 40.0 / 9 - - msg = "the 'dtype' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.mean(SparseArray(data), dtype=np.int64) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.mean(SparseArray(data), out=out) - def test_ufunc(self): # GH 13853 make sure ufunc is applied to fill_value sparse = SparseArray([1, np.nan, 2, np.nan, -2]) @@ -1281,86 +356,6 @@ def test_npoints(self): assert arr.npoints == 1 -class TestAccessor: - @pytest.mark.parametrize("attr", ["npoints", "density", "fill_value", "sp_values"]) - def test_get_attributes(self, attr): - arr = SparseArray([0, 1]) - ser = pd.Series(arr) - - result = getattr(ser.sparse, attr) - expected = getattr(arr, attr) - assert result == expected - - @td.skip_if_no_scipy - def test_from_coo(self): - import scipy.sparse - - row = [0, 3, 1, 0] - col = [0, 3, 1, 2] - data = [4, 5, 7, 9] - # TODO(scipy#13585): Remove dtype when scipy is fixed - # https://github.com/scipy/scipy/issues/13585 - sp_array = scipy.sparse.coo_matrix((data, (row, col)), dtype="int") - result = pd.Series.sparse.from_coo(sp_array) - - index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]]) - expected = pd.Series([4, 9, 7, 5], index=index, dtype="Sparse[int]") - tm.assert_series_equal(result, expected) - - @td.skip_if_no_scipy - @pytest.mark.parametrize( - "sort_labels, expected_rows, expected_cols, expected_values_pos", - [ - ( - False, - [("b", 2), ("a", 2), ("b", 1), ("a", 1)], - [("z", 1), ("z", 2), ("x", 2), ("z", 0)], - {1: (1, 0), 3: (3, 3)}, - ), - ( - True, - [("a", 1), ("a", 2), ("b", 1), ("b", 2)], - [("x", 2), ("z", 0), ("z", 1), ("z", 2)], - {1: (1, 2), 3: (0, 1)}, - ), - ], - ) - def test_to_coo( - self, sort_labels, expected_rows, expected_cols, expected_values_pos - ): - import scipy.sparse - - values = SparseArray([0, np.nan, 1, 0, None, 3], fill_value=0) - index = pd.MultiIndex.from_tuples( - [ - ("b", 2, "z", 1), - ("a", 2, "z", 2), - ("a", 2, "z", 1), - ("a", 2, "x", 2), - ("b", 1, "z", 1), - ("a", 1, "z", 0), - ] - ) - ss = pd.Series(values, index=index) - - expected_A = np.zeros((4, 4)) - for value, (row, col) in expected_values_pos.items(): - expected_A[row, col] = value - - A, rows, cols = ss.sparse.to_coo( - row_levels=(0, 1), column_levels=(2, 3), sort_labels=sort_labels - ) - assert isinstance(A, scipy.sparse.coo_matrix) - tm.assert_numpy_array_equal(A.toarray(), expected_A) - assert rows == expected_rows - assert cols == expected_cols - - def test_non_sparse_raises(self): - ser = pd.Series([1, 2, 3]) - with pytest.raises(AttributeError, match=".sparse"): - ser.sparse.density - - def test_setting_fill_value_fillna_still_works(): # This is why letting users update fill_value / dtype is bad # astype has the same problem. @@ -1468,78 +463,3 @@ def test_drop_duplicates_fill_value(): result = df.drop_duplicates() expected = pd.DataFrame({i: SparseArray([0.0], fill_value=0) for i in range(5)}) tm.assert_frame_equal(result, expected) - - -class TestMinMax: - @pytest.mark.parametrize( - "raw_data,max_expected,min_expected", - [ - (np.arange(5.0), [4], [0]), - (-np.arange(5.0), [0], [-4]), - (np.array([0, 1, 2, np.nan, 4]), [4], [0]), - (np.array([np.nan] * 5), [np.nan], [np.nan]), - (np.array([]), [np.nan], [np.nan]), - ], - ) - def test_nan_fill_value(self, raw_data, max_expected, min_expected): - arr = SparseArray(raw_data) - max_result = arr.max() - min_result = arr.min() - assert max_result in max_expected - assert min_result in min_expected - - max_result = arr.max(skipna=False) - min_result = arr.min(skipna=False) - if np.isnan(raw_data).any(): - assert np.isnan(max_result) - assert np.isnan(min_result) - else: - assert max_result in max_expected - assert min_result in min_expected - - @pytest.mark.parametrize( - "fill_value,max_expected,min_expected", - [ - (100, 100, 0), - (-100, 1, -100), - ], - ) - def test_fill_value(self, fill_value, max_expected, min_expected): - arr = SparseArray( - np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value) - ) - max_result = arr.max() - assert max_result == max_expected - - min_result = arr.min() - assert min_result == min_expected - - def test_only_fill_value(self): - fv = 100 - arr = SparseArray(np.array([fv, fv, fv]), dtype=SparseDtype("int", fv)) - assert len(arr._valid_sp_values) == 0 - - assert arr.max() == fv - assert arr.min() == fv - assert arr.max(skipna=False) == fv - assert arr.min(skipna=False) == fv - - @pytest.mark.parametrize("func", ["min", "max"]) - @pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])]) - @pytest.mark.parametrize( - "dtype,expected", - [ - (SparseDtype(np.float64, np.nan), np.nan), - (SparseDtype(np.float64, 5.0), np.nan), - (SparseDtype("datetime64[ns]", pd.NaT), pd.NaT), - (SparseDtype("datetime64[ns]", pd.to_datetime("2018-05-05")), pd.NaT), - ], - ) - def test_na_value_if_no_valid_values(self, func, data, dtype, expected): - arr = SparseArray(data, dtype=dtype) - result = getattr(arr, func)() - if expected is pd.NaT: - # TODO: pin down whether we wrap datetime64("NaT") - assert result is pd.NaT or np.isnat(result) - else: - assert np.isnan(result) diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py new file mode 100644 index 0000000000000..88efd0f4ea09f --- /dev/null +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -0,0 +1,129 @@ +import numpy as np +import pytest + +from pandas._libs.sparse import IntIndex + +from pandas import Timestamp +import pandas._testing as tm +from pandas.core.arrays.sparse import ( + SparseArray, + SparseDtype, +) + + +class TestAstype: + def test_astype(self): + # float -> float + arr = SparseArray([None, None, 0, 2]) + result = arr.astype("Sparse[float32]") + expected = SparseArray([None, None, 0, 2], dtype=np.dtype("float32")) + tm.assert_sp_array_equal(result, expected) + + dtype = SparseDtype("float64", fill_value=0) + result = arr.astype(dtype) + expected = SparseArray._simple_new( + np.array([0.0, 2.0], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype + ) + tm.assert_sp_array_equal(result, expected) + + dtype = SparseDtype("int64", 0) + result = arr.astype(dtype) + expected = SparseArray._simple_new( + np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype + ) + tm.assert_sp_array_equal(result, expected) + + arr = SparseArray([0, np.nan, 0, 1], fill_value=0) + with pytest.raises(ValueError, match="NA"): + arr.astype("Sparse[i8]") + + def test_astype_bool(self): + a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) + with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"): + result = a.astype(bool) + expected = SparseArray( + [True, False, False, True], dtype=SparseDtype(bool, False) + ) + tm.assert_sp_array_equal(result, expected) + + # update fill value + result = a.astype(SparseDtype(bool, False)) + expected = SparseArray( + [True, False, False, True], dtype=SparseDtype(bool, False) + ) + tm.assert_sp_array_equal(result, expected) + + def test_astype_all(self, any_real_numpy_dtype): + vals = np.array([1, 2, 3]) + arr = SparseArray(vals, fill_value=1) + typ = np.dtype(any_real_numpy_dtype) + with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"): + res = arr.astype(typ) + assert res.dtype == SparseDtype(typ, 1) + assert res.sp_values.dtype == typ + + tm.assert_numpy_array_equal(np.asarray(res.to_dense()), vals.astype(typ)) + + @pytest.mark.parametrize( + "arr, dtype, expected", + [ + ( + SparseArray([0, 1]), + "float", + SparseArray([0.0, 1.0], dtype=SparseDtype(float, 0.0)), + ), + (SparseArray([0, 1]), bool, SparseArray([False, True])), + ( + SparseArray([0, 1], fill_value=1), + bool, + SparseArray([False, True], dtype=SparseDtype(bool, True)), + ), + pytest.param( + SparseArray([0, 1]), + "datetime64[ns]", + SparseArray( + np.array([0, 1], dtype="datetime64[ns]"), + dtype=SparseDtype("datetime64[ns]", Timestamp("1970")), + ), + marks=[pytest.mark.xfail(reason="NumPy-7619")], + ), + ( + SparseArray([0, 1, 10]), + str, + SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + ), + (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), + ( + SparseArray([0, 1, 0]), + object, + SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)), + ), + ], + ) + def test_astype_more(self, arr, dtype, expected): + + if isinstance(dtype, SparseDtype): + warn = None + else: + warn = FutureWarning + + with tm.assert_produces_warning(warn, match="astype from SparseDtype"): + result = arr.astype(dtype) + tm.assert_sp_array_equal(result, expected) + + def test_astype_nan_raises(self): + arr = SparseArray([1.0, np.nan]) + with pytest.raises(ValueError, match="Cannot convert non-finite"): + msg = "astype from SparseDtype" + with tm.assert_produces_warning(FutureWarning, match=msg): + arr.astype(int) + + def test_astype_copy_false(self): + # GH#34456 bug caused by using .view instead of .astype in astype_nansafe + arr = SparseArray([1, 2, 3]) + + dtype = SparseDtype(float, 0) + + result = arr.astype(dtype, copy=False) + expected = SparseArray([1.0, 2.0, 3.0], fill_value=0.0) + tm.assert_sp_array_equal(result, expected) diff --git a/pandas/tests/arrays/sparse/test_constructors.py b/pandas/tests/arrays/sparse/test_constructors.py new file mode 100644 index 0000000000000..c1fcda4fcd121 --- /dev/null +++ b/pandas/tests/arrays/sparse/test_constructors.py @@ -0,0 +1,307 @@ +import numpy as np +import pytest + +from pandas._libs.sparse import IntIndex +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import isna +import pandas._testing as tm +from pandas.core.arrays.sparse import ( + SparseArray, + SparseDtype, +) + + +class TestConstructors: + def test_constructor_dtype(self): + arr = SparseArray([np.nan, 1, 2, np.nan]) + assert arr.dtype == SparseDtype(np.float64, np.nan) + assert arr.dtype.subtype == np.float64 + assert np.isnan(arr.fill_value) + + arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0) + assert arr.dtype == SparseDtype(np.float64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], dtype=np.float64) + assert arr.dtype == SparseDtype(np.float64, np.nan) + assert np.isnan(arr.fill_value) + + arr = SparseArray([0, 1, 2, 4], dtype=np.int64) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], dtype=None) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + def test_constructor_dtype_str(self): + result = SparseArray([1, 2, 3], dtype="int") + expected = SparseArray([1, 2, 3], dtype=int) + tm.assert_sp_array_equal(result, expected) + + def test_constructor_sparse_dtype(self): + result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1)) + expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64) + tm.assert_sp_array_equal(result, expected) + assert result.sp_values.dtype == np.dtype("int64") + + def test_constructor_sparse_dtype_str(self): + result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]") + expected = SparseArray([1, 0, 0, 1], dtype=np.int32) + tm.assert_sp_array_equal(result, expected) + assert result.sp_values.dtype == np.dtype("int32") + + def test_constructor_object_dtype(self): + # GH#11856 + arr = SparseArray(["A", "A", np.nan, "B"], dtype=object) + assert arr.dtype == SparseDtype(object) + assert np.isnan(arr.fill_value) + + arr = SparseArray(["A", "A", np.nan, "B"], dtype=object, fill_value="A") + assert arr.dtype == SparseDtype(object, "A") + assert arr.fill_value == "A" + + # GH#17574 + data = [False, 0, 100.0, 0.0] + arr = SparseArray(data, dtype=object, fill_value=False) + assert arr.dtype == SparseDtype(object, False) + assert arr.fill_value is False + arr_expected = np.array(data, dtype=object) + it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) + assert np.fromiter(it, dtype=np.bool_).all() + + @pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int]) + def test_constructor_na_dtype(self, dtype): + with pytest.raises(ValueError, match="Cannot convert"): + SparseArray([0, 1, np.nan], dtype=dtype) + + def test_constructor_warns_when_losing_timezone(self): + # GH#32501 warn when losing timezone information + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + + expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]")) + + with tm.assert_produces_warning(UserWarning): + result = SparseArray(dti) + + tm.assert_sp_array_equal(result, expected) + + with tm.assert_produces_warning(UserWarning): + result = SparseArray(pd.Series(dti)) + + tm.assert_sp_array_equal(result, expected) + + def test_constructor_spindex_dtype(self): + arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) + # TODO: actionable? + # XXX: Behavior change: specifying SparseIndex no longer changes the + # fill_value + expected = SparseArray([0, 1, 2, 0], kind="integer") + tm.assert_sp_array_equal(arr, expected) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray( + data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=np.int64, + fill_value=0, + ) + exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray( + data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64 + ) + exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray( + data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=None, + fill_value=0, + ) + exp = SparseArray([0, 1, 2, 3], dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + @pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])]) + def test_constructor_spindex_dtype_scalar(self, sparse_index): + # scalar input + arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) + exp = SparseArray([1], dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) + exp = SparseArray([1], dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + def test_constructor_spindex_dtype_scalar_broadcasts(self): + arr = SparseArray( + data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None + ) + exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + @pytest.mark.parametrize( + "data, fill_value", + [ + (np.array([1, 2]), 0), + (np.array([1.0, 2.0]), np.nan), + ([True, False], False), + ([pd.Timestamp("2017-01-01")], pd.NaT), + ], + ) + def test_constructor_inferred_fill_value(self, data, fill_value): + result = SparseArray(data).fill_value + + if isna(fill_value): + assert isna(result) + else: + assert result == fill_value + + @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) + @pytest.mark.parametrize("size", [0, 10]) + @td.skip_if_no_scipy + def test_from_spmatrix(self, size, format): + import scipy.sparse + + mat = scipy.sparse.random(size, 1, density=0.5, format=format) + result = SparseArray.from_spmatrix(mat) + + result = np.asarray(result) + expected = mat.toarray().ravel() + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) + @td.skip_if_no_scipy + def test_from_spmatrix_including_explicit_zero(self, format): + import scipy.sparse + + mat = scipy.sparse.random(10, 1, density=0.5, format=format) + mat.data[0] = 0 + result = SparseArray.from_spmatrix(mat) + + result = np.asarray(result) + expected = mat.toarray().ravel() + tm.assert_numpy_array_equal(result, expected) + + @td.skip_if_no_scipy + def test_from_spmatrix_raises(self): + import scipy.sparse + + mat = scipy.sparse.eye(5, 4, format="csc") + + with pytest.raises(ValueError, match="not '4'"): + SparseArray.from_spmatrix(mat) + + @pytest.mark.parametrize( + "scalar,dtype", + [ + (False, SparseDtype(bool, False)), + (0.0, SparseDtype("float64", 0)), + (1, SparseDtype("int64", 1)), + ("z", SparseDtype("object", "z")), + ], + ) + def test_scalar_with_index_infer_dtype(self, scalar, dtype): + # GH#19163 + with tm.assert_produces_warning( + FutureWarning, match="The index argument has been deprecated" + ): + arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) + exp = SparseArray([scalar, scalar, scalar], fill_value=scalar) + + tm.assert_sp_array_equal(arr, exp) + + assert arr.dtype == dtype + assert exp.dtype == dtype + + def test_constructor_from_too_large_array(self): + with pytest.raises(TypeError, match="expected dimension <= 1 data"): + SparseArray(np.arange(10).reshape((2, 5))) + + def test_constructor_from_sparse(self): + zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) + res = SparseArray(zarr) + assert res.fill_value == 0 + tm.assert_almost_equal(res.sp_values, zarr.sp_values) + + def test_constructor_copy(self): + arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) + arr = SparseArray(arr_data) + + cp = SparseArray(arr, copy=True) + cp.sp_values[:3] = 0 + assert not (arr.sp_values[:3] == 0).any() + + not_copy = SparseArray(arr) + not_copy.sp_values[:3] = 0 + assert (arr.sp_values[:3] == 0).all() + + def test_constructor_bool(self): + # GH#10648 + data = np.array([False, False, True, True, False, False]) + arr = SparseArray(data, fill_value=False, dtype=bool) + + assert arr.dtype == SparseDtype(bool) + tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True])) + # Behavior change: np.asarray densifies. + # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) + tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32)) + + dense = arr.to_dense() + assert dense.dtype == bool + tm.assert_numpy_array_equal(dense, data) + + def test_constructor_bool_fill_value(self): + arr = SparseArray([True, False, True], dtype=None) + assert arr.dtype == SparseDtype(np.bool_) + assert not arr.fill_value + + arr = SparseArray([True, False, True], dtype=np.bool_) + assert arr.dtype == SparseDtype(np.bool_) + assert not arr.fill_value + + arr = SparseArray([True, False, True], dtype=np.bool_, fill_value=True) + assert arr.dtype == SparseDtype(np.bool_, True) + assert arr.fill_value + + def test_constructor_float32(self): + # GH#10648 + data = np.array([1.0, np.nan, 3], dtype=np.float32) + arr = SparseArray(data, dtype=np.float32) + + assert arr.dtype == SparseDtype(np.float32) + tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32)) + # Behavior change: np.asarray densifies. + # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) + tm.assert_numpy_array_equal( + arr.sp_index.indices, np.array([0, 2], dtype=np.int32) + ) + + dense = arr.to_dense() + assert dense.dtype == np.float32 + tm.assert_numpy_array_equal(dense, data) diff --git a/pandas/tests/arrays/sparse/test_indexing.py b/pandas/tests/arrays/sparse/test_indexing.py new file mode 100644 index 0000000000000..2794fe33e53e5 --- /dev/null +++ b/pandas/tests/arrays/sparse/test_indexing.py @@ -0,0 +1,292 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.sparse import ( + SparseArray, + SparseDtype, +) + +arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) +arr = SparseArray(arr_data) + + +class TestGetitem: + def test_getitem(self): + def _checkit(i): + tm.assert_almost_equal(arr[i], arr.to_dense()[i]) + + for i in range(len(arr)): + _checkit(i) + _checkit(-i) + + def test_getitem_arraylike_mask(self): + arr = SparseArray([0, 1, 2]) + result = arr[[True, False, True]] + expected = SparseArray([0, 2]) + tm.assert_sp_array_equal(result, expected) + + @pytest.mark.parametrize( + "slc", + [ + np.s_[:], + np.s_[1:10], + np.s_[1:100], + np.s_[10:1], + np.s_[:-3], + np.s_[-5:-4], + np.s_[:-12], + np.s_[-12:], + np.s_[2:], + np.s_[2::3], + np.s_[::2], + np.s_[::-1], + np.s_[::-2], + np.s_[1:6:2], + np.s_[:-6:-2], + ], + ) + @pytest.mark.parametrize( + "as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []] + ) + def test_getslice(self, slc, as_dense): + as_dense = np.array(as_dense) + arr = SparseArray(as_dense) + + result = arr[slc] + expected = SparseArray(as_dense[slc]) + + tm.assert_sp_array_equal(result, expected) + + def test_getslice_tuple(self): + dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) + + sparse = SparseArray(dense) + res = sparse[(slice(4, None),)] + exp = SparseArray(dense[4:]) + tm.assert_sp_array_equal(res, exp) + + sparse = SparseArray(dense, fill_value=0) + res = sparse[(slice(4, None),)] + exp = SparseArray(dense[4:], fill_value=0) + tm.assert_sp_array_equal(res, exp) + + msg = "too many indices for array" + with pytest.raises(IndexError, match=msg): + sparse[4:, :] + + with pytest.raises(IndexError, match=msg): + # check numpy compat + dense[4:, :] + + def test_boolean_slice_empty(self): + arr = SparseArray([0, 1, 2]) + res = arr[[False, False, False]] + assert res.dtype == arr.dtype + + def test_getitem_bool_sparse_array(self): + # GH 23122 + spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True) + exp = SparseArray([np.nan, 2, np.nan, 5, 6]) + tm.assert_sp_array_equal(arr[spar_bool], exp) + + spar_bool = ~spar_bool + res = arr[spar_bool] + exp = SparseArray([np.nan, 1, 3, 4, np.nan]) + tm.assert_sp_array_equal(res, exp) + + spar_bool = SparseArray( + [False, True, np.nan] * 3, dtype=np.bool8, fill_value=np.nan + ) + res = arr[spar_bool] + exp = SparseArray([np.nan, 3, 5]) + tm.assert_sp_array_equal(res, exp) + + def test_getitem_bool_sparse_array_as_comparison(self): + # GH 45110 + arr = SparseArray([1, 2, 3, 4, np.nan, np.nan], fill_value=np.nan) + res = arr[arr > 2] + exp = SparseArray([3.0, 4.0], fill_value=np.nan) + tm.assert_sp_array_equal(res, exp) + + def test_get_item(self): + zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) + + assert np.isnan(arr[1]) + assert arr[2] == 1 + assert arr[7] == 5 + + assert zarr[0] == 0 + assert zarr[2] == 1 + assert zarr[7] == 5 + + errmsg = "must be an integer between -10 and 10" + + with pytest.raises(IndexError, match=errmsg): + arr[11] + + with pytest.raises(IndexError, match=errmsg): + arr[-11] + + assert arr[-1] == arr[len(arr) - 1] + + +class TestSetitem: + def test_set_item(self): + arr = SparseArray(arr_data).copy() + + def setitem(): + arr[5] = 3 + + def setslice(): + arr[1:5] = 2 + + with pytest.raises(TypeError, match="assignment via setitem"): + setitem() + + with pytest.raises(TypeError, match="assignment via setitem"): + setslice() + + +class TestTake: + def test_take_scalar_raises(self): + msg = "'indices' must be an array, not a scalar '2'." + with pytest.raises(ValueError, match=msg): + arr.take(2) + + def test_take(self): + exp = SparseArray(np.take(arr_data, [2, 3])) + tm.assert_sp_array_equal(arr.take([2, 3]), exp) + + exp = SparseArray(np.take(arr_data, [0, 1, 2])) + tm.assert_sp_array_equal(arr.take([0, 1, 2]), exp) + + def test_take_all_empty(self): + a = pd.array([0, 0], dtype=SparseDtype("int64")) + result = a.take([0, 1], allow_fill=True, fill_value=np.nan) + tm.assert_sp_array_equal(a, result) + + def test_take_fill_value(self): + data = np.array([1, np.nan, 0, 3, 0]) + sparse = SparseArray(data, fill_value=0) + + exp = SparseArray(np.take(data, [0]), fill_value=0) + tm.assert_sp_array_equal(sparse.take([0]), exp) + + exp = SparseArray(np.take(data, [1, 3, 4]), fill_value=0) + tm.assert_sp_array_equal(sparse.take([1, 3, 4]), exp) + + def test_take_negative(self): + exp = SparseArray(np.take(arr_data, [-1])) + tm.assert_sp_array_equal(arr.take([-1]), exp) + + exp = SparseArray(np.take(arr_data, [-4, -3, -2])) + tm.assert_sp_array_equal(arr.take([-4, -3, -2]), exp) + + def test_bad_take(self): + with pytest.raises(IndexError, match="bounds"): + arr.take([11]) + + def test_take_filling(self): + # similar tests as GH 12631 + sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4]) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([np.nan, np.nan, 4]) + tm.assert_sp_array_equal(result, expected) + + # TODO: actionable? + # XXX: test change: fill_value=True -> allow_fill=True + result = sparse.take(np.array([1, 0, -1]), allow_fill=True) + expected = SparseArray([np.nan, np.nan, np.nan]) + tm.assert_sp_array_equal(result, expected) + + # allow_fill=False + result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = SparseArray([np.nan, np.nan, 4]) + tm.assert_sp_array_equal(result, expected) + + msg = "Invalid value in 'indices'" + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -2]), allow_fill=True) + + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -5]), allow_fill=True) + + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, -6])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5]), allow_fill=True) + + def test_take_filling_fill_value(self): + # same tests as GH#12631 + sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([0, np.nan, 4], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + # fill_value + result = sparse.take(np.array([1, 0, -1]), allow_fill=True) + # TODO: actionable? + # XXX: behavior change. + # the old way of filling self.fill_value doesn't follow EA rules. + # It's supposed to be self.dtype.na_value (nan in this case) + expected = SparseArray([0, np.nan, np.nan], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + # allow_fill=False + result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = SparseArray([0, np.nan, 4], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + msg = "Invalid value in 'indices'." + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -2]), allow_fill=True) + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -5]), allow_fill=True) + + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, -6])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5]), fill_value=True) + + @pytest.mark.parametrize("kind", ["block", "integer"]) + def test_take_filling_all_nan(self, kind): + sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan], kind=kind) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([np.nan, np.nan, np.nan], kind=kind) + tm.assert_sp_array_equal(result, expected) + + result = sparse.take(np.array([1, 0, -1]), fill_value=True) + expected = SparseArray([np.nan, np.nan, np.nan], kind=kind) + tm.assert_sp_array_equal(result, expected) + + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, -6])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5]), fill_value=True) + + +class TestWhere: + def test_where_retain_fill_value(self): + # GH#45691 don't lose fill_value on _where + arr = SparseArray([np.nan, 1.0], fill_value=0) + + mask = np.array([True, False]) + + res = arr._where(~mask, 1) + exp = SparseArray([1, 1.0], fill_value=0) + tm.assert_sp_array_equal(res, exp) + + ser = pd.Series(arr) + res = ser.where(~mask, 1) + tm.assert_series_equal(res, pd.Series(exp)) diff --git a/pandas/tests/arrays/sparse/test_reductions.py b/pandas/tests/arrays/sparse/test_reductions.py new file mode 100644 index 0000000000000..a33a282bb4869 --- /dev/null +++ b/pandas/tests/arrays/sparse/test_reductions.py @@ -0,0 +1,270 @@ +import numpy as np +import pytest + +from pandas import ( + NaT, + Timestamp, + isna, +) +from pandas.core.arrays.sparse import ( + SparseArray, + SparseDtype, +) + + +class TestReductions: + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([True, True, True], True, False), + ([1, 2, 1], 1, 0), + ([1.0, 2.0, 1.0], 1.0, 0.0), + ], + ) + def test_all(self, data, pos, neg): + # GH#17570 + out = SparseArray(data).all() + assert out + + out = SparseArray(data, fill_value=pos).all() + assert out + + data[1] = neg + out = SparseArray(data).all() + assert not out + + out = SparseArray(data, fill_value=pos).all() + assert not out + + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([True, True, True], True, False), + ([1, 2, 1], 1, 0), + ([1.0, 2.0, 1.0], 1.0, 0.0), + ], + ) + def test_numpy_all(self, data, pos, neg): + # GH#17570 + out = np.all(SparseArray(data)) + assert out + + out = np.all(SparseArray(data, fill_value=pos)) + assert out + + data[1] = neg + out = np.all(SparseArray(data)) + assert not out + + out = np.all(SparseArray(data, fill_value=pos)) + assert not out + + # raises with a different message on py2. + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.all(SparseArray(data), out=np.array([])) + + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([False, True, False], True, False), + ([0, 2, 0], 2, 0), + ([0.0, 2.0, 0.0], 2.0, 0.0), + ], + ) + def test_any(self, data, pos, neg): + # GH#17570 + out = SparseArray(data).any() + assert out + + out = SparseArray(data, fill_value=pos).any() + assert out + + data[1] = neg + out = SparseArray(data).any() + assert not out + + out = SparseArray(data, fill_value=pos).any() + assert not out + + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([False, True, False], True, False), + ([0, 2, 0], 2, 0), + ([0.0, 2.0, 0.0], 2.0, 0.0), + ], + ) + def test_numpy_any(self, data, pos, neg): + # GH#17570 + out = np.any(SparseArray(data)) + assert out + + out = np.any(SparseArray(data, fill_value=pos)) + assert out + + data[1] = neg + out = np.any(SparseArray(data)) + assert not out + + out = np.any(SparseArray(data, fill_value=pos)) + assert not out + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.any(SparseArray(data), out=out) + + def test_sum(self): + data = np.arange(10).astype(float) + out = SparseArray(data).sum() + assert out == 45.0 + + data[5] = np.nan + out = SparseArray(data, fill_value=2).sum() + assert out == 40.0 + + out = SparseArray(data, fill_value=np.nan).sum() + assert out == 40.0 + + @pytest.mark.parametrize( + "arr", + [np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])], + ) + @pytest.mark.parametrize("fill_value", [0, 1, np.nan]) + @pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)]) + def test_sum_min_count(self, arr, fill_value, min_count, expected): + # GH#25777 + sparray = SparseArray(arr, fill_value=fill_value) + result = sparray.sum(min_count=min_count) + if np.isnan(expected): + assert np.isnan(result) + else: + assert result == expected + + def test_bool_sum_min_count(self): + spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True) + res = spar_bool.sum(min_count=1) + assert res == 5 + res = spar_bool.sum(min_count=11) + assert isna(res) + + def test_numpy_sum(self): + data = np.arange(10).astype(float) + out = np.sum(SparseArray(data)) + assert out == 45.0 + + data[5] = np.nan + out = np.sum(SparseArray(data, fill_value=2)) + assert out == 40.0 + + out = np.sum(SparseArray(data, fill_value=np.nan)) + assert out == 40.0 + + msg = "the 'dtype' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.sum(SparseArray(data), dtype=np.int64) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.sum(SparseArray(data), out=out) + + def test_mean(self): + data = np.arange(10).astype(float) + out = SparseArray(data).mean() + assert out == 4.5 + + data[5] = np.nan + out = SparseArray(data).mean() + assert out == 40.0 / 9 + + def test_numpy_mean(self): + data = np.arange(10).astype(float) + out = np.mean(SparseArray(data)) + assert out == 4.5 + + data[5] = np.nan + out = np.mean(SparseArray(data)) + assert out == 40.0 / 9 + + msg = "the 'dtype' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.mean(SparseArray(data), dtype=np.int64) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.mean(SparseArray(data), out=out) + + +class TestMinMax: + @pytest.mark.parametrize( + "raw_data,max_expected,min_expected", + [ + (np.arange(5.0), [4], [0]), + (-np.arange(5.0), [0], [-4]), + (np.array([0, 1, 2, np.nan, 4]), [4], [0]), + (np.array([np.nan] * 5), [np.nan], [np.nan]), + (np.array([]), [np.nan], [np.nan]), + ], + ) + def test_nan_fill_value(self, raw_data, max_expected, min_expected): + arr = SparseArray(raw_data) + max_result = arr.max() + min_result = arr.min() + assert max_result in max_expected + assert min_result in min_expected + + max_result = arr.max(skipna=False) + min_result = arr.min(skipna=False) + if np.isnan(raw_data).any(): + assert np.isnan(max_result) + assert np.isnan(min_result) + else: + assert max_result in max_expected + assert min_result in min_expected + + @pytest.mark.parametrize( + "fill_value,max_expected,min_expected", + [ + (100, 100, 0), + (-100, 1, -100), + ], + ) + def test_fill_value(self, fill_value, max_expected, min_expected): + arr = SparseArray( + np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value) + ) + max_result = arr.max() + assert max_result == max_expected + + min_result = arr.min() + assert min_result == min_expected + + def test_only_fill_value(self): + fv = 100 + arr = SparseArray(np.array([fv, fv, fv]), dtype=SparseDtype("int", fv)) + assert len(arr._valid_sp_values) == 0 + + assert arr.max() == fv + assert arr.min() == fv + assert arr.max(skipna=False) == fv + assert arr.min(skipna=False) == fv + + @pytest.mark.parametrize("func", ["min", "max"]) + @pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])]) + @pytest.mark.parametrize( + "dtype,expected", + [ + (SparseDtype(np.float64, np.nan), np.nan), + (SparseDtype(np.float64, 5.0), np.nan), + (SparseDtype("datetime64[ns]", NaT), NaT), + (SparseDtype("datetime64[ns]", Timestamp("2018-05-05")), NaT), + ], + ) + def test_na_value_if_no_valid_values(self, func, data, dtype, expected): + arr = SparseArray(data, dtype=dtype) + result = getattr(arr, func)() + if expected is NaT: + # TODO: pin down whether we wrap datetime64("NaT") + assert result is NaT or np.isnat(result) + else: + assert np.isnan(result) diff --git a/pandas/tests/arrays/sparse/test_unary.py b/pandas/tests/arrays/sparse/test_unary.py new file mode 100644 index 0000000000000..a99dbb10a1433 --- /dev/null +++ b/pandas/tests/arrays/sparse/test_unary.py @@ -0,0 +1,72 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import SparseArray + + +@pytest.mark.parametrize("fill_value", [0, np.nan]) +@pytest.mark.parametrize("op", [operator.pos, operator.neg]) +def test_unary_op(op, fill_value): + arr = np.array([0, 1, np.nan, 2]) + sparray = SparseArray(arr, fill_value=fill_value) + result = op(sparray) + expected = SparseArray(op(arr), fill_value=op(fill_value)) + tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize("fill_value", [True, False]) +def test_invert(fill_value): + arr = np.array([True, False, False, True]) + sparray = SparseArray(arr, fill_value=fill_value) + result = ~sparray + expected = SparseArray(~arr, fill_value=not fill_value) + tm.assert_sp_array_equal(result, expected) + + result = ~pd.Series(sparray) + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + result = ~pd.DataFrame({"A": sparray}) + expected = pd.DataFrame({"A": expected}) + tm.assert_frame_equal(result, expected) + + +class TestUnaryMethods: + def test_neg_operator(self): + arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8) + res = -arr + exp = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8) + res = -arr + exp = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + def test_abs_operator(self): + arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8) + res = abs(arr) + exp = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8) + res = abs(arr) + exp = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + def test_invert_operator(self): + arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool8) + res = ~arr + exp = SparseArray( + np.invert([False, True, False, True]), fill_value=True, dtype=np.bool8 + ) + res = ~arr + tm.assert_sp_array_equal(exp, res) + + arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32) + res = ~arr + exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)