Skip to content

Commit 0d2518e

Browse files
committed
ENH: best attempt at NA-friendly Series comparison and boolean binary operations, GH #801
1 parent f4d13fb commit 0d2518e

File tree

3 files changed

+249
-28
lines changed

3 files changed

+249
-28
lines changed

pandas/core/series.py

Lines changed: 103 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,78 @@ def wrapper(self, other):
8181
index=self.index, name=self.name)
8282
return wrapper
8383

84+
85+
def _comp_method(op, name):
86+
"""
87+
Wrapper function for Series arithmetic operations, to avoid
88+
code duplication.
89+
"""
90+
def na_op(x, y):
91+
if x.dtype == np.object_:
92+
if isinstance(y, list):
93+
y = lib.list_to_object_array(y)
94+
95+
if isinstance(y, np.ndarray):
96+
result = lib.vec_compare(x, y, op)
97+
else:
98+
result = lib.scalar_compare(x, y, op)
99+
else:
100+
result = op(x, y)
101+
102+
return result
103+
104+
def wrapper(self, other):
105+
from pandas.core.frame import DataFrame
106+
107+
if isinstance(other, Series):
108+
name = _maybe_match_name(self, other)
109+
return Series(na_op(self.values, other.values),
110+
index=self.index, name=name)
111+
elif isinstance(other, DataFrame):
112+
return NotImplemented
113+
else:
114+
# scalars
115+
return Series(na_op(self.values, other),
116+
index=self.index, name=self.name)
117+
return wrapper
118+
119+
120+
def _bool_method(op, name):
121+
"""
122+
Wrapper function for Series arithmetic operations, to avoid
123+
code duplication.
124+
"""
125+
def na_op(x, y):
126+
try:
127+
result = op(x, y)
128+
except TypeError:
129+
if isinstance(y, list):
130+
y = lib.list_to_object_array(y)
131+
132+
if isinstance(y, np.ndarray):
133+
result = lib.vec_binop(x, y, op)
134+
else:
135+
result = lib.scalar_binop(x, y, op)
136+
137+
return result
138+
139+
def wrapper(self, other):
140+
from pandas.core.frame import DataFrame
141+
142+
if isinstance(other, Series):
143+
name = _maybe_match_name(self, other)
144+
return Series(na_op(self.values, other.values),
145+
index=self.index, name=name)
146+
elif isinstance(other, DataFrame):
147+
return NotImplemented
148+
else:
149+
# scalars
150+
return Series(na_op(self.values, other),
151+
index=self.index, name=self.name)
152+
return wrapper
153+
154+
155+
84156
def _radd_compat(left, right):
85157
radd = lambda x, y: y + x
86158
# GH #353, NumPy 1.5.1 workaround
@@ -97,12 +169,14 @@ def _radd_compat(left, right):
97169

98170
return output
99171

172+
100173
def _maybe_match_name(a, b):
101174
name = None
102175
if a.name == b.name:
103176
name = a.name
104177
return name
105178

179+
106180
def _flex_method(op, name):
107181
doc = """
108182
Binary operator %s with support to substitute a fill_value for missing data
@@ -682,17 +756,17 @@ def iteritems(self, index=True):
682756
__rpow__ = _arith_method(lambda x, y: y ** x, '__pow__')
683757

684758
# comparisons
685-
# __gt__ = _arith_method(operator.gt, '__gt__')
686-
# __ge__ = _arith_method(operator.ge, '__ge__')
687-
# __lt__ = _arith_method(operator.lt, '__lt__')
688-
# __le__ = _arith_method(operator.le, '__le__')
689-
# __eq__ = _arith_method(operator.eq, '__eq__')
690-
# __ne__ = _arith_method(operator.ne, '__ne__')
759+
__gt__ = _comp_method(operator.gt, '__gt__')
760+
__ge__ = _comp_method(operator.ge, '__ge__')
761+
__lt__ = _comp_method(operator.lt, '__lt__')
762+
__le__ = _comp_method(operator.le, '__le__')
763+
__eq__ = _comp_method(operator.eq, '__eq__')
764+
__ne__ = _comp_method(operator.ne, '__ne__')
691765

692766
# binary logic
693-
# __or__ = _arith_method(operator.or_, '__or__')
694-
# __and__ = _arith_method(operator.and_, '__and__')
695-
# __xor__ = _arith_method(operator.xor, '__xor__')
767+
__or__ = _bool_method(operator.or_, '__or__')
768+
__and__ = _bool_method(operator.and_, '__and__')
769+
__xor__ = _bool_method(operator.xor, '__xor__')
696770

697771
# Inplace operators
698772
__iadd__ = __add__
@@ -1902,7 +1976,26 @@ def isin(self, values):
19021976
result = lib.ismember(self, value_set)
19031977
return Series(result, self.index, name=self.name)
19041978

1905-
#-------------------------------------------------------------------------------
1979+
def between(self, left, right, inclusive=True):
    """
    Return boolean Series equivalent to left <= series <= right

    Parameters
    ----------
    left : scalar
        Left boundary
    right : scalar
        Right boundary
    inclusive : boolean, default True
        If True, values equal to a boundary count as between. If False,
        the comparison is strict (left < series < right).

    Returns
    -------
    is_between : Series
        Element-wise result. How NAs come out is determined by the Series
        comparison and ``&`` operators used to build it.
    """
    if inclusive:
        lower_ok = self >= left
        upper_ok = self <= right
    else:
        lower_ok = self > left
        upper_ok = self < right

    return lower_ok & upper_ok
1997+
1998+
#----------------------------------------------------------------------
19061999
# Miscellaneous
19072000

19082001
def plot(self, label=None, kind='line', use_index=True, rot=30, ax=None,

pandas/src/tseries.pyx

Lines changed: 115 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -218,10 +218,10 @@ def array_to_datetime(ndarray[int64_t, ndim=1] arr):
218218
cdef double INF = <double> np.inf
219219
cdef double NEGINF = -INF
220220

221-
cdef inline _checknull(object val):
221+
cdef inline bint _checknull(object val):
    # ndarrays are never reported as null; otherwise a value is null when it
    # is None or does not compare equal to itself
    if np.PyArray_Check(val):
        return False
    return val is None or val != val
223223

224-
cdef inline _checknan(object val):
224+
cdef inline bint _checknan(object val):
    # ndarrays are never reported as NaN; otherwise a value counts as NaN
    # when it does not compare equal to itself (None is not treated as NaN)
    if np.PyArray_Check(val):
        return False
    return val != val
226226

227227
cpdef checknull(object val):
@@ -232,6 +232,8 @@ cpdef checknull(object val):
232232
else:
233233
return _checknull(val)
234234

235+
@cython.wraparound(False)
236+
@cython.boundscheck(False)
235237
def isnullobj(ndarray[object] arr):
236238
cdef Py_ssize_t i, n
237239
cdef object val
@@ -240,11 +242,11 @@ def isnullobj(ndarray[object] arr):
240242
n = len(arr)
241243
result = np.zeros(n, dtype=np.uint8)
242244
for i from 0 <= i < n:
243-
val = arr[i]
244-
if _checknull(val):
245-
result[i] = 1
245+
result[i] = _checknull(arr[i])
246246
return result.view(np.bool_)
247247

248+
@cython.wraparound(False)
249+
@cython.boundscheck(False)
248250
def isnullobj2d(ndarray[object, ndim=2] arr):
249251
cdef Py_ssize_t i, j, n, m
250252
cdef object val
@@ -493,22 +495,117 @@ def convert_timestamps(ndarray values):
493495

494496
return out
495497

496-
# cdef class TypeConverter:
497-
# cdef:
498-
# cpython.PyTypeObject* klass_type
498+
@cython.wraparound(False)
@cython.boundscheck(False)
def scalar_compare(ndarray[object] values, object val, object op):
    """
    Compare every element of the object array `values` with the scalar
    `val`, copying null elements into the result instead of comparing them.

    `op` must be one of operator.lt, le, gt, ge, eq or ne (matched by
    identity); anything else raises ValueError. The object-dtype result is
    handed to maybe_convert_bool before being returned.
    """
    import operator
    cdef:
        Py_ssize_t pos, length = len(values)
        int cmp_code
        object item

    # translate the Python-level operator into a rich-comparison opcode
    if op is operator.lt:
        cmp_code = cpython.Py_LT
    elif op is operator.le:
        cmp_code = cpython.Py_LE
    elif op is operator.gt:
        cmp_code = cpython.Py_GT
    elif op is operator.ge:
        cmp_code = cpython.Py_GE
    elif op is operator.eq:
        cmp_code = cpython.Py_EQ
    elif op is operator.ne:
        cmp_code = cpython.Py_NE
    else:
        raise ValueError('Unrecognized operator')

    result = np.empty(length, dtype=object)

    for pos in range(length):
        item = values[pos]
        if _checknull(item):
            # nulls are propagated as-is
            result[pos] = item
            continue
        result[pos] = cpython.PyObject_RichCompareBool(item, val, cmp_code)

    return maybe_convert_bool(result)
532+
533+
@cython.wraparound(False)
@cython.boundscheck(False)
def vec_compare(ndarray[object] left, ndarray[object] right, object op):
    """
    Compare the object arrays `left` and `right` element by element,
    copying a null from either side into the result instead of comparing.

    `op` must be one of operator.lt, le, gt, ge, eq or ne (matched by
    identity); anything else raises ValueError. ValueError is also raised
    when the two arrays differ in length. The object-dtype result is handed
    to maybe_convert_bool before being returned.
    """
    import operator
    cdef:
        Py_ssize_t i, n = len(left)
        int flag
        object x, y

    # Bounds checking is switched off for this function, so a `right` that
    # is shorter than `left` would otherwise be read past its end.
    if n != len(right):
        raise ValueError('Arrays were different lengths: %d vs %d'
                         % (n, len(right)))

    # translate the Python-level operator into a rich-comparison opcode
    if op is operator.lt:
        flag = cpython.Py_LT
    elif op is operator.le:
        flag = cpython.Py_LE
    elif op is operator.gt:
        flag = cpython.Py_GT
    elif op is operator.ge:
        flag = cpython.Py_GE
    elif op is operator.eq:
        flag = cpython.Py_EQ
    elif op is operator.ne:
        flag = cpython.Py_NE
    else:
        raise ValueError('Unrecognized operator')

    result = np.empty(n, dtype=object)

    for i in range(n):
        x = left[i]
        y = right[i]
        if _checknull(x):
            # a null on the left wins over whatever is on the right
            result[i] = x
        elif _checknull(y):
            result[i] = y
        else:
            result[i] = cpython.PyObject_RichCompareBool(x, y, flag)

    return maybe_convert_bool(result)
569+
570+
571+
@cython.wraparound(False)
@cython.boundscheck(False)
def scalar_binop(ndarray[object] values, object val, object op):
    """
    Apply ``op(element, val)`` to every non-null element of the object
    array `values`; null elements are copied into the result untouched.

    The object-dtype result is handed to maybe_convert_bool before being
    returned.
    """
    cdef:
        Py_ssize_t pos, length = len(values)
        object item

    result = np.empty(length, dtype=object)

    for pos in range(length):
        item = values[pos]
        if _checknull(item):
            # nulls are propagated as-is
            result[pos] = item
            continue
        result[pos] = op(item, val)

    return maybe_convert_bool(result)
588+
589+
@cython.wraparound(False)
@cython.boundscheck(False)
def vec_binop(ndarray[object] left, ndarray[object] right, object op):
    """
    Apply ``op(x, y)`` to the object arrays `left` and `right` element by
    element, copying a null from either side into the result instead of
    calling `op`.

    Raises ValueError when the two arrays differ in length. The
    object-dtype result is handed to maybe_convert_bool before being
    returned.
    """
    cdef:
        Py_ssize_t i, n = len(left)
        object x, y

    # Bounds checking is switched off for this function, so a `right` that
    # is shorter than `left` would otherwise be read past its end.
    if n != len(right):
        raise ValueError('Arrays were different lengths: %d vs %d'
                         % (n, len(right)))

    result = np.empty(n, dtype=object)

    for i in range(n):
        x = left[i]
        y = right[i]
        if _checknull(x):
            # a null on the left wins over whatever is on the right
            result[i] = x
        elif _checknull(y):
            result[i] = y
        else:
            result[i] = op(x, y)

    return maybe_convert_bool(result)
507608

508-
# def convert(self, object obj):
509-
# if cpython.PyObject_TypeCheck(obj, self.klass_type):
510-
# return obj
511-
# return self.factory(obj)
512609

513610
include "skiplist.pyx"
514611
include "groupby.pyx"

pandas/tests/test_series.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,6 +1099,37 @@ def test_operators_na_handling(self):
10991099
expected = Series(['foo_suffix', 'bar_suffix', 'baz_suffix', np.nan])
11001100
assert_series_equal(result, expected)
11011101

1102+
def test_comparison_operators_with_nas(self):
    from pandas import DateRange

    s = Series(DateRange('1/1/2000', periods=10), dtype=object)
    s[::2] = np.nan

    # comparing against a scalar must match the comparison done on the
    # non-NA values, reindexed back onto the full index
    val = s[5]
    for comp_name in ('lt', 'le', 'gt', 'ge', 'eq', 'ne'):
        f = getattr(operator, comp_name)

        result = f(s, val)
        expected = f(s.dropna(), val).reindex(s.index)
        assert_series_equal(result, expected)

        # TODO: the reflected form f(val, s), with the scalar on the left,
        # is not checked here; that assertion was left disabled when this
        # test was written

    # boolean &, |, ^ should work with object arrays and propagate NAs
    for logic_name in ('and_', 'or_', 'xor'):
        f = getattr(operator, logic_name)

        result = f(s < s[9], s > s[3])
        expected = f(s.dropna() < s[9], s.dropna() > s[3]).reindex(s.index)
        assert_series_equal(result, expected)
1132+
11021133
def test_idxmin(self):
11031134
# test idxmin
11041135
# _check_stat_op approach can not be used here because of isnull check.

0 commit comments

Comments
 (0)