diff --git a/doc/source/release.rst b/doc/source/release.rst index f1d13645f863e..27e1e8df6bfe1 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -83,6 +83,7 @@ Improvements to existing features - pd.show_versions() is now available for convenience when reporting issues. - perf improvements to Series.str.extract (:issue:`5944`) - perf improvments in ``dtypes/ftypes`` methods (:issue:`5968`) + - perf improvements in indexing with object dtypes (:issue:`5968`) .. _release.bug_fixes-0.13.1: @@ -116,6 +117,7 @@ Bug Fixes - Fixed ``to_datetime`` for array with both Tz-aware datetimes and ``NaT``s (:issue:`5961`) - Bug in rolling skew/kurtosis when passed a Series with bad data (:issue:`5749`) - Bug in scipy ``interpolate`` methods with a datetime index (:issue: `5975`) + - Bug in NaT comparison when a mix of datetime and np.datetime64 values containing NaT was passed (:issue:`5968`) pandas 0.13.0 ------------- diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 354eadc7c7ba1..ca16be1b52ca2 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1889,7 +1889,10 @@ def make_block(values, items, ref_items, klass=None, ndim=None, dtype=None, if np.prod(values.shape): flat = values.ravel() - inferred_type = lib.infer_dtype(flat) + + # try with just the first element; we just need to see if + # this is a datetime or not + inferred_type = lib.infer_dtype(flat[0:1]) if inferred_type in ['datetime', 'datetime64']: # we have an object array that has been inferred as diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 5a842adb561b1..eb3f518b0dd4b 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -4,9 +4,12 @@ import os import warnings import nose +import sys +from distutils.version import LooseVersion import numpy as np +import pandas as pd from pandas.core.frame import DataFrame, Series from pandas.io.parsers import read_csv from pandas.io.stata import read_stata, StataReader @@ -66,6 
+69,9 @@ def test_read_dta1(self): tm.assert_frame_equal(parsed_13, expected) def test_read_dta2(self): + if LooseVersion(sys.version) < '2.7': + raise nose.SkipTest('datetime interp under 2.6 is faulty') + expected = DataFrame.from_records( [ ( @@ -89,14 +95,14 @@ def test_read_dta2(self): datetime(2, 1, 1) ), ( - np.datetime64('NaT'), - np.datetime64('NaT'), - np.datetime64('NaT'), - np.datetime64('NaT'), - np.datetime64('NaT'), - np.datetime64('NaT'), - np.datetime64('NaT'), - np.datetime64('NaT') + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, ) ], columns=['datetime_c', 'datetime_big_c', 'date', 'weekly_date', diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd index d6f3c4caea306..7a30f018e623e 100644 --- a/pandas/src/util.pxd +++ b/pandas/src/util.pxd @@ -67,7 +67,7 @@ cdef inline is_array(object o): cdef inline bint _checknull(object val): try: - return val is None or (cpython.PyFloat_Check(val) and val != val) + return val is None or (cpython.PyFloat_Check(val) and val != val) except ValueError: return False diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 4fb8bc7deddb4..bae93602cb840 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -829,6 +829,11 @@ def test_to_datetime_mixed(self): expected = Series([NaT,Timestamp('20130408'),Timestamp('20130409')]) assert_series_equal(result,expected) + # mixed datetime/np.datetime64('NaT') + result = Series(to_datetime([dt.datetime(2000,1,1),np.datetime64('NaT')])) + expected = Series([dt.datetime(2000,1,1),NaT]) + assert_series_equal(result, expected) + def test_dayfirst(self): # GH 3341 diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index c118a9660b5e7..e303df23003cb 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -8,6 +8,7 @@ import numpy as np from cpython cimport ( PyTypeObject, PyFloat_Check, + PyLong_Check, PyObject_RichCompareBool, PyObject_RichCompare, 
PyString_Check, @@ -55,6 +56,9 @@ cdef int64_t NPY_NAT = util.get_nat() # < numpy 1.7 compat for NaT compat_NaT = np.array([NPY_NAT]).astype('m8[ns]').item() +# numpy actual nat object +np_NaT = np.datetime64('NaT',dtype='M8') + try: basestring except NameError: # py3 @@ -416,6 +420,11 @@ NaT = NaTType() iNaT = util.get_nat() +cdef inline bint _checknull_with_nat(object val): + """ utility to check if a value is a nat or not """ + return val is None or ( + PyFloat_Check(val) and val != val) or val is NaT + cdef inline bint _cmp_nat_dt(_NaT lhs, _Timestamp rhs, int op) except -1: return _nat_scalar_rules[op] @@ -761,7 +770,7 @@ cdef convert_to_tsobject(object ts, object tz, object unit): obj = _TSObject() - if ts is None or ts is NaT: + if ts is None or ts is NaT or ts is np_NaT: obj.value = NPY_NAT elif is_datetime64_object(ts): obj.value = _get_datetime64_nanos(ts) @@ -933,7 +942,7 @@ def datetime_to_datetime64(ndarray[object] values): iresult = result.view('i8') for i in range(n): val = values[i] - if util._checknull(val) or val is NaT: + if _checknull_with_nat(val): iresult[i] = iNaT elif PyDateTime_Check(val): if val.tzinfo is not None: @@ -999,7 +1008,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, iresult = result.view('i8') for i in range(n): val = values[i] - if util._checknull(val) or val is NaT: + if _checknull_with_nat(val): iresult[i] = iNaT elif PyDateTime_Check(val): if val.tzinfo is not None: @@ -1038,13 +1047,16 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, continue raise elif util.is_datetime64_object(val): - try: - iresult[i] = _get_datetime64_nanos(val) - except ValueError: - if coerce: - iresult[i] = iNaT - continue - raise + if val == np_NaT: + iresult[i] = iNaT + else: + try: + iresult[i] = _get_datetime64_nanos(val) + except ValueError: + if coerce: + iresult[i] = iNaT + continue + raise # if we are coercing, dont' allow integers elif util.is_integer_object(val) and not coerce: 
@@ -1114,7 +1126,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, for i in range(n): val = values[i] - if util._checknull(val): + if _checknull_with_nat(val): oresult[i] = val elif util.is_string_object(val): if len(val) == 0: @@ -1166,7 +1178,7 @@ def array_to_timedelta64(ndarray[object] values, coerce=True): result[i] = val - elif util._checknull(val) or val == iNaT or val is NaT: + elif _checknull_with_nat(val): result[i] = iNaT else: @@ -1316,7 +1328,7 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False): iresult[i] = iNaT continue else: - if util._checknull(val) or val is NaT: + if _checknull_with_nat(val): iresult[i] = iNaT continue else: diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py index beefec256ed81..dcfda997fabd6 100644 --- a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -167,3 +167,10 @@ frame_loc_dups = Benchmark('df2.loc[idx]', setup, start_date=datetime(2013, 1, 1)) + +setup = common_setup + """ +df = DataFrame(dict( A = [ 'foo'] * 1000000)) +""" + +frame_iloc_big = Benchmark('df.iloc[:100,0]', setup, + start_date=datetime(2013, 1, 1))