Skip to content

BUG: GH11206 where pd.isnull did not consider numpy NaT null #11212

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 20, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ def time_frame_interpolate_some_good_infer(self):
self.df.interpolate(downcast='infer')


class frame_isnull(object):
class frame_isnull_floats_no_null(object):
goal_time = 0.2

def setup(self):
Expand All @@ -593,6 +593,33 @@ def time_frame_isnull(self):
isnull(self.df)


class frame_isnull_floats(object):
    """Benchmark ``isnull`` on a 1000x1000 float frame containing NaN values."""

    goal_time = 0.2

    def setup(self):
        # Fixed seed so every run draws the same mix of NaN and 1.0.
        np.random.seed(1234)
        candidates = np.array([np.nan, 1.0])
        grid = np.random.choice(candidates, size=(1000, 1000))
        self.sample = candidates
        self.data = grid
        self.df = DataFrame(grid)

    def time_frame_isnull(self):
        # Timed body: the null mask is computed and discarded.
        isnull(self.df)


class frame_isnull_obj(object):
    """Benchmark ``isnull`` on a 1000x1000 frame of mixed Python objects.

    The pool of values mixes several null-like scalars (``NaT``, ``np.nan``,
    ``None``, NumPy datetime64/timedelta64 ``NaT``) with ordinary numbers
    and strings.
    """

    goal_time = 0.2

    def setup(self):
        # Fixed seed so every run draws the same mix of values.
        np.random.seed(1234)
        pool = [
            NaT,
            np.nan,
            None,
            np.datetime64('NaT'),
            np.timedelta64('NaT'),
            0,
            1,
            2.0,
            '',
            'abcd',
        ]
        self.sample = np.array(pool)
        grid = np.random.choice(self.sample, size=(1000, 1000))
        self.data = grid
        self.df = DataFrame(grid)

    def time_frame_isnull(self):
        # Timed body: the null mask is computed and discarded.
        isnull(self.df)


class frame_iteritems(object):
goal_time = 0.2

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ Bug Fixes
- Bug in output formatting when using an index of ambiguous times (:issue:`11619`)
- Bug in comparisons of Series vs list-likes (:issue:`11339`)
- Bug in ``DataFrame.replace`` with a ``datetime64[ns, tz]`` and a non-compat to_replace (:issue:`11326`, :issue:`11153`)
- Bug in ``isnull`` where ``numpy.datetime64('NaT')`` in a ``numpy.array`` was not determined to be null (:issue:`11206`)
- Bug in list-like indexing with a mixed-integer Index (:issue:`11320`)
- Bug in ``pivot_table`` with ``margins=True`` when indexes are of ``Categorical`` dtype (:issue:`10993`)
- Bug in ``DataFrame.plot`` cannot use hex strings colors (:issue:`10299`)
Expand Down
10 changes: 5 additions & 5 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# cython: profile=False
cimport numpy as np
cimport cython
import numpy as np
Expand Down Expand Up @@ -54,7 +55,8 @@ from datetime import datetime as pydatetime
# this is our tseries.pxd
from datetime cimport *

from tslib cimport convert_to_tsobject, convert_to_timedelta64
from tslib cimport (convert_to_tsobject, convert_to_timedelta64,
_check_all_nulls)
import tslib
from tslib import NaT, Timestamp, Timedelta

Expand Down Expand Up @@ -245,8 +247,6 @@ def time64_to_datetime(ndarray[int64_t, ndim=1] arr):

return result

cdef inline int64_t get_timedelta64_value(val):
return val.view('i8')

#----------------------------------------------------------------------
# isnull / notnull related
Expand Down Expand Up @@ -346,10 +346,10 @@ def isnullobj(ndarray[object] arr):
cdef ndarray[uint8_t] result

n = len(arr)
result = np.zeros(n, dtype=np.uint8)
result = np.empty(n, dtype=np.uint8)
for i from 0 <= i < n:
val = arr[i]
result[i] = val is NaT or _checknull(val)
result[i] = _check_all_nulls(val)
return result.view(np.bool_)

@cython.wraparound(False)
Expand Down
2 changes: 2 additions & 0 deletions pandas/src/datetime.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# cython: profile=False
from numpy cimport int64_t, int32_t, npy_int64, npy_int32, ndarray
from cpython cimport PyObject

Expand Down Expand Up @@ -59,6 +60,7 @@ cdef extern from "numpy/ndarrayobject.h":

cdef extern from "numpy_helper.h":
npy_datetime get_datetime64_value(object o)
npy_timedelta get_timedelta64_value(object o)

cdef extern from "numpy/npy_common.h":

Expand Down
4 changes: 4 additions & 0 deletions pandas/src/numpy_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,11 @@ get_nat(void) {
/*
 * Return the raw value held in the `obval` field of a datetime64 scalar.
 * The pointer is cast to PyDatetimeScalarObject* without any type check.
 */
PANDAS_INLINE npy_datetime
get_datetime64_value(PyObject* obj) {
    PyDatetimeScalarObject* scalar = (PyDatetimeScalarObject*) obj;
    return scalar->obval;
}

/*
 * Return the raw value held in the `obval` field of a timedelta64 scalar.
 * The pointer is cast to PyTimedeltaScalarObject* without any type check.
 */
PANDAS_INLINE npy_timedelta
get_timedelta64_value(PyObject* obj) {
    PyTimedeltaScalarObject* scalar = (PyTimedeltaScalarObject*) obj;
    return scalar->obval;
}

PANDAS_INLINE int
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,13 @@ def test_isnull_nat():
exp = np.array([True])
assert(np.array_equal(result, exp))

def test_isnull_numpy_nat():
    """Every NaT flavour placed in a NumPy array must be reported as null."""
    nat_values = [
        NaT,
        np.datetime64('NaT'),
        np.timedelta64('NaT'),
        np.datetime64('NaT', 's'),
    ]
    mask = isnull(np.array(nat_values))
    all_null = np.array([True] * 4)
    tm.assert_numpy_array_equal(mask, all_null)

def test_isnull_datetime():
assert (not isnull(datetime.now()))
assert notnull(datetime.now())
Expand Down
1 change: 1 addition & 0 deletions pandas/tslib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ cdef bint _is_utc(object)
cdef bint _is_tzlocal(object)
cdef object _get_dst_info(object)
cdef bint _nat_scalar_rules[6]
cdef bint _check_all_nulls(obj)
19 changes: 18 additions & 1 deletion pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
cimport numpy as np
from numpy cimport (int8_t, int32_t, int64_t, import_array, ndarray,
NPY_INT64, NPY_DATETIME, NPY_TIMEDELTA)
from datetime cimport get_datetime64_value, get_timedelta64_value
import numpy as np

# GH3363
Expand Down Expand Up @@ -707,12 +708,28 @@ NaT = NaTType()

iNaT = util.get_nat()

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe blow away `_checknull_with_nat` (replacing it with `_check_all_nulls`) and see if any timings are affected?


cdef inline bint _checknull_with_nat(object val):
    """ utility returning true for None, NaT, or a float that is NaN """
    if val is None or val is NaT:
        return 1
    if PyFloat_Check(val):
        # NaN is the only float that compares unequal to itself
        return val != val
    return 0

cdef inline bint _check_all_nulls(object val):
    """
    utility returning true when val is a float NaN, NaT, None, or a
    datetime64/timedelta64 scalar whose stored value equals NPY_NAT
    """
    if PyFloat_Check(val):
        # NaN is the only float that compares unequal to itself
        return val != val
    if val is NaT or val is None:
        return 1
    if is_datetime64_object(val):
        return get_datetime64_value(val) == NPY_NAT
    if is_timedelta64_object(val):
        return get_timedelta64_value(val) == NPY_NAT
    return 0

# Comparison of a NaT (lhs) with a timestamp (rhs): neither operand is
# inspected; the result is read from the _nat_scalar_rules table at index op.
# NOTE(review): op is presumably a rich-comparison opcode -- confirm with the
# callers. -1 is the declared error return value.
cdef inline bint _cmp_nat_dt(_NaT lhs, _Timestamp rhs, int op) except -1:
return _nat_scalar_rules[op]

Expand Down