ENH/BUG: Sparse now supports comparison op

sinhrks · jreback · commit 8890cc105f17 · 2016-04-25T17:59:49.000-04:00
Author: sinhrks <sinhrks@gmail.com> Closes #12971 from sinhrks/sparse_bool_test and squashes the following commits: d57807c [sinhrks] ENH/BUG: Sparse now supports comparison op
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -121,6 +121,7 @@ These changes conform sparse handling to return the correct types and work to ma
 - Bug in ``SparseArray`` addition ignores ``fill_value`` of right hand side (:issue:`12910`)
 - Bug in ``SparseArray`` mod raises ``AttributeError (:issue:`12910`)
 - Bug in ``SparseArray`` pow calculates ``1 ** np.nan`` as ``np.nan`` which must be 1 (:issue:`12910`)
+- Bug in ``SparseArray`` comparison may output incorrect result or raise ``ValueError`` (:issue:`12971`)
 - Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`)
 - Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`)
 - Bug in ``SparseSeries`` and ``SparseArray`` may have different ``dtype`` from its dense values (:issue:`12908`)
diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py
@@ -46,9 +46,8 @@ def wrapper(self, other):
         elif lib.isscalar(other):
             new_fill_value = op(np.float64(self.fill_value), np.float64(other))
 
-            return SparseArray(op(self.sp_values, other),
-                               sparse_index=self.sp_index,
-                               fill_value=new_fill_value)
+            return _wrap_result(name, op(self.sp_values, other),
+                                self.sp_index, new_fill_value)
         else:  # pragma: no cover
             raise TypeError('operation with %s not supported' % type(other))
 
@@ -59,30 +58,32 @@ def wrapper(self, other):
 
 
 def _sparse_array_op(left, right, op, name):
-    sparse_op = lambda a, b: _sparse_op(a, b, name)
-
     if left.sp_index.equals(right.sp_index):
         result = op(left.sp_values, right.sp_values)
         result_index = left.sp_index
     else:
-        result, result_index = sparse_op(left, right)
-
+        sparse_op = getattr(splib, 'sparse_%s' % name)
+        result, result_index = sparse_op(left.sp_values, left.sp_index,
+                                         left.fill_value, right.sp_values,
+                                         right.sp_index, right.fill_value)
     try:
         fill_value = op(left.fill_value, right.fill_value)
     except:
         fill_value = nan
-
-    return SparseArray(result, sparse_index=result_index,
-                       fill_value=fill_value)
+    return _wrap_result(name, result, result_index, fill_value)
 
 
-def _sparse_op(this, other, name):
-    sparse_op = getattr(splib, 'sparse_%s' % name)
-    result, result_index = sparse_op(this.sp_values, this.sp_index,
-                                     this.fill_value, other.sp_values,
-                                     other.sp_index, other.fill_value)
-
-    return result, result_index
+def _wrap_result(name, data, sparse_index, fill_value):
+    """ wrap op result to have correct dtype """
+    if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
+        # TODO: this condition can be removed once SparseArray's
+        # dtype default is removed as part of closing GH 667
+        return SparseArray(data, sparse_index=sparse_index,
+                           fill_value=fill_value,
+                           dtype=np.bool)
+    else:
+        return SparseArray(data, sparse_index=sparse_index,
+                           fill_value=fill_value)
 
 
 class SparseArray(PandasObject, np.ndarray):
@@ -594,4 +595,5 @@ def _make_index(length, indices, kind):
 
 
 ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,
+                                   comp_method=_arith_method,
                                    use_numexpr=False)
diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py
@@ -262,6 +262,19 @@ def test_constructor_bool(self):
             self.assertEqual(dense.dtype, bool)
             tm.assert_numpy_array_equal(dense, data)
 
+    def test_constructor_bool_fill_value(self):
+        arr = SparseArray([True, False, True], dtype=None)
+        self.assertEqual(arr.dtype, np.bool)
+        self.assertFalse(arr.fill_value)
+
+        arr = SparseArray([True, False, True], dtype=np.bool)
+        self.assertEqual(arr.dtype, np.bool)
+        self.assertFalse(arr.fill_value)
+
+        arr = SparseArray([True, False, True], dtype=np.bool, fill_value=True)
+        self.assertEqual(arr.dtype, np.bool)
+        self.assertTrue(arr.fill_value)
+
     def test_constructor_float32(self):
         # GH 10648
         data = np.array([1., np.nan, 3], dtype=np.float32)
@@ -522,6 +535,31 @@ def _check_numeric_ops(self, a, b, a_dense, b_dense):
         tm.assert_numpy_array_equal((a ** b).to_dense(), a_dense ** b_dense)
         tm.assert_numpy_array_equal((b ** a).to_dense(), b_dense ** a_dense)
 
+    def _check_comparison_ops(self, a, b, a_dense, b_dense):
+
+        def _check(res):
+            tm.assertIsInstance(res, SparseArray)
+            self.assertEqual(res.dtype, np.bool)
+            self.assertIsInstance(res.fill_value, bool)
+
+        _check(a == b)
+        tm.assert_numpy_array_equal((a == b).to_dense(), a_dense == b_dense)
+
+        _check(a != b)
+        tm.assert_numpy_array_equal((a != b).to_dense(), a_dense != b_dense)
+
+        _check(a >= b)
+        tm.assert_numpy_array_equal((a >= b).to_dense(), a_dense >= b_dense)
+
+        _check(a <= b)
+        tm.assert_numpy_array_equal((a <= b).to_dense(), a_dense <= b_dense)
+
+        _check(a > b)
+        tm.assert_numpy_array_equal((a > b).to_dense(), a_dense > b_dense)
+
+        _check(a < b)
+        tm.assert_numpy_array_equal((a < b).to_dense(), a_dense < b_dense)
+
     def test_float_scalar(self):
         values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
 
@@ -541,6 +579,25 @@ def test_float_scalar(self):
             self._check_numeric_ops(a, 0, values, 0)
             self._check_numeric_ops(a, 3, values, 3)
 
+    def test_float_scalar_comparison(self):
+        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+
+        for kind in ['integer', 'block']:
+            a = SparseArray(values, kind=kind)
+            self._check_comparison_ops(a, 1, values, 1)
+            self._check_comparison_ops(a, 0, values, 0)
+            self._check_comparison_ops(a, 3, values, 3)
+
+            a = SparseArray(values, kind=kind, fill_value=0)
+            self._check_comparison_ops(a, 1, values, 1)
+            self._check_comparison_ops(a, 0, values, 0)
+            self._check_comparison_ops(a, 3, values, 3)
+
+            a = SparseArray(values, kind=kind, fill_value=2)
+            self._check_comparison_ops(a, 1, values, 1)
+            self._check_comparison_ops(a, 0, values, 0)
+            self._check_comparison_ops(a, 3, values, 3)
+
     def test_float_same_index(self):
         # when sp_index are the same
         for kind in ['integer', 'block']:
@@ -558,6 +615,23 @@ def test_float_same_index(self):
             b = SparseArray(rvalues, kind=kind, fill_value=0)
             self._check_numeric_ops(a, b, values, rvalues)
 
+    def test_float_same_index_comparison(self):
+        # when sp_index are the same
+        for kind in ['integer', 'block']:
+            values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+            rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
+
+            a = SparseArray(values, kind=kind)
+            b = SparseArray(rvalues, kind=kind)
+            self._check_comparison_ops(a, b, values, rvalues)
+
+            values = np.array([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.])
+            rvalues = np.array([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.])
+
+            a = SparseArray(values, kind=kind, fill_value=0)
+            b = SparseArray(rvalues, kind=kind, fill_value=0)
+            self._check_comparison_ops(a, b, values, rvalues)
+
     def test_float_array(self):
         values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
         rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
@@ -601,6 +675,28 @@ def test_float_array_different_kind(self):
         b = SparseArray(rvalues, kind='block', fill_value=2)
         self._check_numeric_ops(a, b, values, rvalues)
 
+    def test_float_array_comparison(self):
+        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+        rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
+
+        for kind in ['integer', 'block']:
+            a = SparseArray(values, kind=kind)
+            b = SparseArray(rvalues, kind=kind)
+            self._check_comparison_ops(a, b, values, rvalues)
+            self._check_comparison_ops(a, b * 0, values, rvalues * 0)
+
+            a = SparseArray(values, kind=kind, fill_value=0)
+            b = SparseArray(rvalues, kind=kind)
+            self._check_comparison_ops(a, b, values, rvalues)
+
+            a = SparseArray(values, kind=kind, fill_value=0)
+            b = SparseArray(rvalues, kind=kind, fill_value=0)
+            self._check_comparison_ops(a, b, values, rvalues)
+
+            a = SparseArray(values, kind=kind, fill_value=1)
+            b = SparseArray(rvalues, kind=kind, fill_value=2)
+            self._check_comparison_ops(a, b, values, rvalues)
+
 
 if __name__ == '__main__':
     import nose
diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx
@@ -317,7 +317,8 @@ cdef class BlockIndex(SparseIndex):
 
     cdef:
         object __weakref__ # need to be picklable
-        int32_t* locbuf, *lenbuf
+        int32_t *locbuf
+        int32_t *lenbuf
 
     def __init__(self, length, blocs, blengths):
 
@@ -985,6 +986,12 @@ cdef inline float64_t __lt(float64_t a, float64_t b):
 cdef inline float64_t __gt(float64_t a, float64_t b):
     return a > b
 
+cdef inline float64_t __le(float64_t a, float64_t b):
+    return a <= b
+
+cdef inline float64_t __ge(float64_t a, float64_t b):
+    return a >= b
+
 cdef inline float64_t __mod(float64_t a, float64_t b):
     if b == 0:
         return NaN
@@ -1040,33 +1047,62 @@ sparse_rtruediv = sparse_rdiv
 cpdef sparse_floordiv(ndarray x, SparseIndex xindex, float64_t xfill,
                       ndarray y, SparseIndex yindex, float64_t yfill):
     return sparse_combine(x, xindex, xfill,
-                             y, yindex, yfill, __floordiv)
+                          y, yindex, yfill, __floordiv)
 
 cpdef sparse_rfloordiv(ndarray x, SparseIndex xindex, float64_t xfill,
                        ndarray y, SparseIndex yindex, float64_t yfill):
     return sparse_combine(x, xindex, xfill,
-                             y, yindex, yfill, __rfloordiv)
+                          y, yindex, yfill, __rfloordiv)
 
 cpdef sparse_mod(ndarray x, SparseIndex xindex, float64_t xfill,
                  ndarray y, SparseIndex yindex, float64_t yfill):
     return sparse_combine(x, xindex, xfill,
-                             y, yindex, yfill, __mod)
+                          y, yindex, yfill, __mod)
 
 cpdef sparse_rmod(ndarray x, SparseIndex xindex, float64_t xfill,
                   ndarray y, SparseIndex yindex, float64_t yfill):
     return sparse_combine(x, xindex, xfill,
-                             y, yindex, yfill, __rmod)
+                          y, yindex, yfill, __rmod)
 
 cpdef sparse_pow(ndarray x, SparseIndex xindex, float64_t xfill,
                  ndarray y, SparseIndex yindex, float64_t yfill):
     return sparse_combine(x, xindex, xfill,
-                             y, yindex, yfill, __pow)
+                          y, yindex, yfill, __pow)
 
 cpdef sparse_rpow(ndarray x, SparseIndex xindex, float64_t xfill,
                   ndarray y, SparseIndex yindex, float64_t yfill):
     return sparse_combine(x, xindex, xfill,
-                             y, yindex, yfill, __rpow)
+                          y, yindex, yfill, __rpow)
+
+cpdef sparse_eq(ndarray x, SparseIndex xindex, float64_t xfill,
+                  ndarray y, SparseIndex yindex, float64_t yfill):
+    return sparse_combine(x, xindex, xfill,
+                          y, yindex, yfill, __eq)
 
+cpdef sparse_ne(ndarray x, SparseIndex xindex, float64_t xfill,
+                  ndarray y, SparseIndex yindex, float64_t yfill):
+    return sparse_combine(x, xindex, xfill,
+                          y, yindex, yfill, __ne)
+
+cpdef sparse_lt(ndarray x, SparseIndex xindex, float64_t xfill,
+                  ndarray y, SparseIndex yindex, float64_t yfill):
+    return sparse_combine(x, xindex, xfill,
+                          y, yindex, yfill, __lt)
+
+cpdef sparse_gt(ndarray x, SparseIndex xindex, float64_t xfill,
+                  ndarray y, SparseIndex yindex, float64_t yfill):
+    return sparse_combine(x, xindex, xfill,
+                          y, yindex, yfill, __gt)
+
+cpdef sparse_le(ndarray x, SparseIndex xindex, float64_t xfill,
+                ndarray y, SparseIndex yindex, float64_t yfill):
+    return sparse_combine(x, xindex, xfill,
+                          y, yindex, yfill, __le)
+
+cpdef sparse_ge(ndarray x, SparseIndex xindex, float64_t xfill,
+                ndarray y, SparseIndex yindex, float64_t yfill):
+    return sparse_combine(x, xindex, xfill,
+                          y, yindex, yfill, __ge)
 
 #-------------------------------------------------------------------------------
 # Indexing operations