Skip to content

Commit 8890cc1

Browse files
sinhrks and jreback
authored and committed
ENH/BUG: Sparse now supports comparison op
Author: sinhrks <[email protected]> Closes #12971 from sinhrks/sparse_bool_test and squashes the following commits: d57807c [sinhrks] ENH/BUG: Sparse now supports comparison op
1 parent bec5272 commit 8890cc1

File tree

4 files changed

+159
-24
lines changed

4 files changed

+159
-24
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ These changes conform sparse handling to return the correct types and work to ma
121121
- Bug in ``SparseArray`` addition ignores ``fill_value`` of right hand side (:issue:`12910`)
122122
- Bug in ``SparseArray`` mod raises ``AttributeError`` (:issue:`12910`)
123123
- Bug in ``SparseArray`` pow calculates ``1 ** np.nan`` as ``np.nan`` which must be 1 (:issue:`12910`)
124+
- Bug in ``SparseArray`` comparison may output an incorrect result or raise ``ValueError`` (:issue:`12971`)
124125
- Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`)
125126
- Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`)
126127
- Bug in ``SparseSeries`` and ``SparseArray`` may have different ``dtype`` from its dense values (:issue:`12908`)

pandas/sparse/array.py

+19-17
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,8 @@ def wrapper(self, other):
4646
elif lib.isscalar(other):
4747
new_fill_value = op(np.float64(self.fill_value), np.float64(other))
4848

49-
return SparseArray(op(self.sp_values, other),
50-
sparse_index=self.sp_index,
51-
fill_value=new_fill_value)
49+
return _wrap_result(name, op(self.sp_values, other),
50+
self.sp_index, new_fill_value)
5251
else: # pragma: no cover
5352
raise TypeError('operation with %s not supported' % type(other))
5453

@@ -59,30 +58,32 @@ def wrapper(self, other):
5958

6059

6160
def _sparse_array_op(left, right, op, name):
62-
sparse_op = lambda a, b: _sparse_op(a, b, name)
63-
6461
if left.sp_index.equals(right.sp_index):
6562
result = op(left.sp_values, right.sp_values)
6663
result_index = left.sp_index
6764
else:
68-
result, result_index = sparse_op(left, right)
69-
65+
sparse_op = getattr(splib, 'sparse_%s' % name)
66+
result, result_index = sparse_op(left.sp_values, left.sp_index,
67+
left.fill_value, right.sp_values,
68+
right.sp_index, right.fill_value)
7069
try:
7170
fill_value = op(left.fill_value, right.fill_value)
7271
except:
7372
fill_value = nan
74-
75-
return SparseArray(result, sparse_index=result_index,
76-
fill_value=fill_value)
73+
return _wrap_result(name, result, result_index, fill_value)
7774

7875

79-
def _sparse_op(this, other, name):
80-
sparse_op = getattr(splib, 'sparse_%s' % name)
81-
result, result_index = sparse_op(this.sp_values, this.sp_index,
82-
this.fill_value, other.sp_values,
83-
other.sp_index, other.fill_value)
84-
85-
return result, result_index
76+
def _wrap_result(name, data, sparse_index, fill_value):
77+
""" wrap op result to have correct dtype """
78+
if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
79+
# ToDo: We can remove this condition when removing
80+
# SparseArray's dtype default when closing GH 667
81+
return SparseArray(data, sparse_index=sparse_index,
82+
fill_value=fill_value,
83+
dtype=np.bool)
84+
else:
85+
return SparseArray(data, sparse_index=sparse_index,
86+
fill_value=fill_value)
8687

8788

8889
class SparseArray(PandasObject, np.ndarray):
@@ -594,4 +595,5 @@ def _make_index(length, indices, kind):
594595

595596

596597
ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,
598+
comp_method=_arith_method,
597599
use_numexpr=False)

pandas/sparse/tests/test_array.py

+96
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,19 @@ def test_constructor_bool(self):
262262
self.assertEqual(dense.dtype, bool)
263263
tm.assert_numpy_array_equal(dense, data)
264264

265+
def test_constructor_bool_fill_value(self):
266+
arr = SparseArray([True, False, True], dtype=None)
267+
self.assertEqual(arr.dtype, np.bool)
268+
self.assertFalse(arr.fill_value)
269+
270+
arr = SparseArray([True, False, True], dtype=np.bool)
271+
self.assertEqual(arr.dtype, np.bool)
272+
self.assertFalse(arr.fill_value)
273+
274+
arr = SparseArray([True, False, True], dtype=np.bool, fill_value=True)
275+
self.assertEqual(arr.dtype, np.bool)
276+
self.assertTrue(arr.fill_value)
277+
265278
def test_constructor_float32(self):
266279
# GH 10648
267280
data = np.array([1., np.nan, 3], dtype=np.float32)
@@ -522,6 +535,31 @@ def _check_numeric_ops(self, a, b, a_dense, b_dense):
522535
tm.assert_numpy_array_equal((a ** b).to_dense(), a_dense ** b_dense)
523536
tm.assert_numpy_array_equal((b ** a).to_dense(), b_dense ** a_dense)
524537

538+
def _check_comparison_ops(self, a, b, a_dense, b_dense):
539+
540+
def _check(res):
541+
tm.assertIsInstance(res, SparseArray)
542+
self.assertEqual(res.dtype, np.bool)
543+
self.assertIsInstance(res.fill_value, bool)
544+
545+
_check(a == b)
546+
tm.assert_numpy_array_equal((a == b).to_dense(), a_dense == b_dense)
547+
548+
_check(a != b)
549+
tm.assert_numpy_array_equal((a != b).to_dense(), a_dense != b_dense)
550+
551+
_check(a >= b)
552+
tm.assert_numpy_array_equal((a >= b).to_dense(), a_dense >= b_dense)
553+
554+
_check(a <= b)
555+
tm.assert_numpy_array_equal((a <= b).to_dense(), a_dense <= b_dense)
556+
557+
_check(a > b)
558+
tm.assert_numpy_array_equal((a > b).to_dense(), a_dense > b_dense)
559+
560+
_check(a < b)
561+
tm.assert_numpy_array_equal((a < b).to_dense(), a_dense < b_dense)
562+
525563
def test_float_scalar(self):
526564
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
527565

@@ -541,6 +579,25 @@ def test_float_scalar(self):
541579
self._check_numeric_ops(a, 0, values, 0)
542580
self._check_numeric_ops(a, 3, values, 3)
543581

582+
def test_float_scalar_comparison(self):
583+
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
584+
585+
for kind in ['integer', 'block']:
586+
a = SparseArray(values, kind=kind)
587+
self._check_comparison_ops(a, 1, values, 1)
588+
self._check_comparison_ops(a, 0, values, 0)
589+
self._check_comparison_ops(a, 3, values, 3)
590+
591+
a = SparseArray(values, kind=kind, fill_value=0)
592+
self._check_comparison_ops(a, 1, values, 1)
593+
self._check_comparison_ops(a, 0, values, 0)
594+
self._check_comparison_ops(a, 3, values, 3)
595+
596+
a = SparseArray(values, kind=kind, fill_value=2)
597+
self._check_comparison_ops(a, 1, values, 1)
598+
self._check_comparison_ops(a, 0, values, 0)
599+
self._check_comparison_ops(a, 3, values, 3)
600+
544601
def test_float_same_index(self):
545602
# when sp_index are the same
546603
for kind in ['integer', 'block']:
@@ -558,6 +615,23 @@ def test_float_same_index(self):
558615
b = SparseArray(rvalues, kind=kind, fill_value=0)
559616
self._check_numeric_ops(a, b, values, rvalues)
560617

618+
def test_float_same_index_comparison(self):
619+
# when sp_index are the same
620+
for kind in ['integer', 'block']:
621+
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
622+
rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
623+
624+
a = SparseArray(values, kind=kind)
625+
b = SparseArray(rvalues, kind=kind)
626+
self._check_comparison_ops(a, b, values, rvalues)
627+
628+
values = np.array([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.])
629+
rvalues = np.array([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.])
630+
631+
a = SparseArray(values, kind=kind, fill_value=0)
632+
b = SparseArray(rvalues, kind=kind, fill_value=0)
633+
self._check_comparison_ops(a, b, values, rvalues)
634+
561635
def test_float_array(self):
562636
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
563637
rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
@@ -601,6 +675,28 @@ def test_float_array_different_kind(self):
601675
b = SparseArray(rvalues, kind='block', fill_value=2)
602676
self._check_numeric_ops(a, b, values, rvalues)
603677

678+
def test_float_array_comparison(self):
679+
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
680+
rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
681+
682+
for kind in ['integer', 'block']:
683+
a = SparseArray(values, kind=kind)
684+
b = SparseArray(rvalues, kind=kind)
685+
self._check_comparison_ops(a, b, values, rvalues)
686+
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
687+
688+
a = SparseArray(values, kind=kind, fill_value=0)
689+
b = SparseArray(rvalues, kind=kind)
690+
self._check_comparison_ops(a, b, values, rvalues)
691+
692+
a = SparseArray(values, kind=kind, fill_value=0)
693+
b = SparseArray(rvalues, kind=kind, fill_value=0)
694+
self._check_comparison_ops(a, b, values, rvalues)
695+
696+
a = SparseArray(values, kind=kind, fill_value=1)
697+
b = SparseArray(rvalues, kind=kind, fill_value=2)
698+
self._check_comparison_ops(a, b, values, rvalues)
699+
604700

605701
if __name__ == '__main__':
606702
import nose

pandas/src/sparse.pyx

+43-7
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,8 @@ cdef class BlockIndex(SparseIndex):
317317

318318
cdef:
319319
object __weakref__ # need to be picklable
320-
int32_t* locbuf, *lenbuf
320+
int32_t *locbuf
321+
int32_t *lenbuf
321322

322323
def __init__(self, length, blocs, blengths):
323324

@@ -985,6 +986,12 @@ cdef inline float64_t __lt(float64_t a, float64_t b):
985986
cdef inline float64_t __gt(float64_t a, float64_t b):
986987
return a > b
987988

989+
cdef inline float64_t __le(float64_t a, float64_t b):
990+
return a <= b
991+
992+
cdef inline float64_t __ge(float64_t a, float64_t b):
993+
return a >= b
994+
988995
cdef inline float64_t __mod(float64_t a, float64_t b):
989996
if b == 0:
990997
return NaN
@@ -1040,33 +1047,62 @@ sparse_rtruediv = sparse_rdiv
10401047
cpdef sparse_floordiv(ndarray x, SparseIndex xindex, float64_t xfill,
10411048
ndarray y, SparseIndex yindex, float64_t yfill):
10421049
return sparse_combine(x, xindex, xfill,
1043-
y, yindex, yfill, __floordiv)
1050+
y, yindex, yfill, __floordiv)
10441051

10451052
cpdef sparse_rfloordiv(ndarray x, SparseIndex xindex, float64_t xfill,
10461053
ndarray y, SparseIndex yindex, float64_t yfill):
10471054
return sparse_combine(x, xindex, xfill,
1048-
y, yindex, yfill, __rfloordiv)
1055+
y, yindex, yfill, __rfloordiv)
10491056

10501057
cpdef sparse_mod(ndarray x, SparseIndex xindex, float64_t xfill,
10511058
ndarray y, SparseIndex yindex, float64_t yfill):
10521059
return sparse_combine(x, xindex, xfill,
1053-
y, yindex, yfill, __mod)
1060+
y, yindex, yfill, __mod)
10541061

10551062
cpdef sparse_rmod(ndarray x, SparseIndex xindex, float64_t xfill,
10561063
ndarray y, SparseIndex yindex, float64_t yfill):
10571064
return sparse_combine(x, xindex, xfill,
1058-
y, yindex, yfill, __rmod)
1065+
y, yindex, yfill, __rmod)
10591066

10601067
cpdef sparse_pow(ndarray x, SparseIndex xindex, float64_t xfill,
10611068
ndarray y, SparseIndex yindex, float64_t yfill):
10621069
return sparse_combine(x, xindex, xfill,
1063-
y, yindex, yfill, __pow)
1070+
y, yindex, yfill, __pow)
10641071

10651072
cpdef sparse_rpow(ndarray x, SparseIndex xindex, float64_t xfill,
10661073
ndarray y, SparseIndex yindex, float64_t yfill):
10671074
return sparse_combine(x, xindex, xfill,
1068-
y, yindex, yfill, __rpow)
1075+
y, yindex, yfill, __rpow)
1076+
1077+
cpdef sparse_eq(ndarray x, SparseIndex xindex, float64_t xfill,
1078+
ndarray y, SparseIndex yindex, float64_t yfill):
1079+
return sparse_combine(x, xindex, xfill,
1080+
y, yindex, yfill, __eq)
10691081

1082+
cpdef sparse_ne(ndarray x, SparseIndex xindex, float64_t xfill,
1083+
ndarray y, SparseIndex yindex, float64_t yfill):
1084+
return sparse_combine(x, xindex, xfill,
1085+
y, yindex, yfill, __ne)
1086+
1087+
cpdef sparse_lt(ndarray x, SparseIndex xindex, float64_t xfill,
1088+
ndarray y, SparseIndex yindex, float64_t yfill):
1089+
return sparse_combine(x, xindex, xfill,
1090+
y, yindex, yfill, __lt)
1091+
1092+
cpdef sparse_gt(ndarray x, SparseIndex xindex, float64_t xfill,
1093+
ndarray y, SparseIndex yindex, float64_t yfill):
1094+
return sparse_combine(x, xindex, xfill,
1095+
y, yindex, yfill, __gt)
1096+
1097+
cpdef sparse_le(ndarray x, SparseIndex xindex, float64_t xfill,
1098+
ndarray y, SparseIndex yindex, float64_t yfill):
1099+
return sparse_combine(x, xindex, xfill,
1100+
y, yindex, yfill, __le)
1101+
1102+
cpdef sparse_ge(ndarray x, SparseIndex xindex, float64_t xfill,
1103+
ndarray y, SparseIndex yindex, float64_t yfill):
1104+
return sparse_combine(x, xindex, xfill,
1105+
y, yindex, yfill, __ge)
10701106

10711107
#-------------------------------------------------------------------------------
10721108
# Indexing operations

0 commit comments

Comments
 (0)