Skip to content

Commit 52a139e

Browse files
committed
Merge pull request #5973 from jreback/format_perf
PERF: perf improvements in indexing with object dtypes (GH5968)
2 parents cc2c70b + 6a9619f commit 52a139e

File tree

7 files changed

+58
-23
lines changed

7 files changed

+58
-23
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ Improvements to existing features
8383
- pd.show_versions() is now available for convenience when reporting issues.
8484
- perf improvements to Series.str.extract (:issue:`5944`)
8585
- perf improvements in ``dtypes/ftypes`` methods (:issue:`5968`)
86+
- perf improvements in indexing with object dtypes (:issue:`5968`)
8687

8788
.. _release.bug_fixes-0.13.1:
8889

@@ -116,6 +117,7 @@ Bug Fixes
116117
- Fixed ``to_datetime`` for array with both Tz-aware datetimes and ``NaT``s (:issue:`5961`)
117118
- Bug in rolling skew/kurtosis when passed a Series with bad data (:issue:`5749`)
118119
- Bug in scipy ``interpolate`` methods with a datetime index (:issue:`5975`)
120+
- Bug in NaT comparison if a mixed datetime/np.datetime64 with NaT were passed (:issue:`5968`)
119121

120122
pandas 0.13.0
121123
-------------

pandas/core/internals.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1889,7 +1889,10 @@ def make_block(values, items, ref_items, klass=None, ndim=None, dtype=None,
18891889

18901890
if np.prod(values.shape):
18911891
flat = values.ravel()
1892-
inferred_type = lib.infer_dtype(flat)
1892+
1893+
# try with just the first element; we just need to see if
1894+
# this is a datetime or not
1895+
inferred_type = lib.infer_dtype(flat[0:1])
18931896
if inferred_type in ['datetime', 'datetime64']:
18941897

18951898
# we have an object array that has been inferred as

pandas/io/tests/test_stata.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
import os
55
import warnings
66
import nose
7+
import sys
8+
from distutils.version import LooseVersion
79

810
import numpy as np
911

12+
import pandas as pd
1013
from pandas.core.frame import DataFrame, Series
1114
from pandas.io.parsers import read_csv
1215
from pandas.io.stata import read_stata, StataReader
@@ -66,6 +69,9 @@ def test_read_dta1(self):
6669
tm.assert_frame_equal(parsed_13, expected)
6770

6871
def test_read_dta2(self):
72+
if LooseVersion(sys.version) < '2.7':
73+
raise nose.SkipTest('datetime interp under 2.6 is faulty')
74+
6975
expected = DataFrame.from_records(
7076
[
7177
(
@@ -89,14 +95,14 @@ def test_read_dta2(self):
8995
datetime(2, 1, 1)
9096
),
9197
(
92-
np.datetime64('NaT'),
93-
np.datetime64('NaT'),
94-
np.datetime64('NaT'),
95-
np.datetime64('NaT'),
96-
np.datetime64('NaT'),
97-
np.datetime64('NaT'),
98-
np.datetime64('NaT'),
99-
np.datetime64('NaT')
98+
pd.NaT,
99+
pd.NaT,
100+
pd.NaT,
101+
pd.NaT,
102+
pd.NaT,
103+
pd.NaT,
104+
pd.NaT,
105+
pd.NaT,
100106
)
101107
],
102108
columns=['datetime_c', 'datetime_big_c', 'date', 'weekly_date',

pandas/src/util.pxd

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ cdef inline is_array(object o):
6767

6868
cdef inline bint _checknull(object val):
6969
try:
70-
return val is None or (cpython.PyFloat_Check(val) and val != val)
70+
return val is None or (cpython.PyFloat_Check(val) and val != val)
7171
except ValueError:
7272
return False
7373

pandas/tseries/tests/test_timeseries.py

+5
Original file line numberDiff line numberDiff line change
@@ -829,6 +829,11 @@ def test_to_datetime_mixed(self):
829829
expected = Series([NaT,Timestamp('20130408'),Timestamp('20130409')])
830830
assert_series_equal(result,expected)
831831

832+
# mixed datetime/np.datetime64('NaT')
833+
result = Series(to_datetime([dt.datetime(2000,1,1),np.datetime64('NaT')]))
834+
expected = Series([dt.datetime(2000,1,1),NaT])
835+
assert_series_equal(result, expected)
836+
832837
def test_dayfirst(self):
833838

834839
# GH 3341

pandas/tslib.pyx

+25-13
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import numpy as np
88
from cpython cimport (
99
PyTypeObject,
1010
PyFloat_Check,
11+
PyLong_Check,
1112
PyObject_RichCompareBool,
1213
PyObject_RichCompare,
1314
PyString_Check,
@@ -55,6 +56,9 @@ cdef int64_t NPY_NAT = util.get_nat()
5556
# < numpy 1.7 compat for NaT
5657
compat_NaT = np.array([NPY_NAT]).astype('m8[ns]').item()
5758

59+
# numpy actual nat object
60+
np_NaT = np.datetime64('NaT',dtype='M8')
61+
5862
try:
5963
basestring
6064
except NameError: # py3
@@ -416,6 +420,11 @@ NaT = NaTType()
416420
iNaT = util.get_nat()
417421

418422

423+
cdef inline bint _checknull_with_nat(object val):
424+
""" utility to check if a value is a nat or not """
425+
return val is None or (
426+
PyFloat_Check(val) and val != val) or val is NaT
427+
419428
cdef inline bint _cmp_nat_dt(_NaT lhs, _Timestamp rhs, int op) except -1:
420429
return _nat_scalar_rules[op]
421430

@@ -761,7 +770,7 @@ cdef convert_to_tsobject(object ts, object tz, object unit):
761770

762771
obj = _TSObject()
763772

764-
if ts is None or ts is NaT:
773+
if ts is None or ts is NaT or ts is np_NaT:
765774
obj.value = NPY_NAT
766775
elif is_datetime64_object(ts):
767776
obj.value = _get_datetime64_nanos(ts)
@@ -933,7 +942,7 @@ def datetime_to_datetime64(ndarray[object] values):
933942
iresult = result.view('i8')
934943
for i in range(n):
935944
val = values[i]
936-
if util._checknull(val) or val is NaT:
945+
if _checknull_with_nat(val):
937946
iresult[i] = iNaT
938947
elif PyDateTime_Check(val):
939948
if val.tzinfo is not None:
@@ -999,7 +1008,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
9991008
iresult = result.view('i8')
10001009
for i in range(n):
10011010
val = values[i]
1002-
if util._checknull(val) or val is NaT:
1011+
if _checknull_with_nat(val):
10031012
iresult[i] = iNaT
10041013
elif PyDateTime_Check(val):
10051014
if val.tzinfo is not None:
@@ -1038,13 +1047,16 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
10381047
continue
10391048
raise
10401049
elif util.is_datetime64_object(val):
1041-
try:
1042-
iresult[i] = _get_datetime64_nanos(val)
1043-
except ValueError:
1044-
if coerce:
1045-
iresult[i] = iNaT
1046-
continue
1047-
raise
1050+
if val == np_NaT:
1051+
iresult[i] = iNaT
1052+
else:
1053+
try:
1054+
iresult[i] = _get_datetime64_nanos(val)
1055+
except ValueError:
1056+
if coerce:
1057+
iresult[i] = iNaT
1058+
continue
1059+
raise
10481060

10491061
# if we are coercing, don't allow integers
10501062
elif util.is_integer_object(val) and not coerce:
@@ -1114,7 +1126,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
11141126

11151127
for i in range(n):
11161128
val = values[i]
1117-
if util._checknull(val):
1129+
if _checknull_with_nat(val):
11181130
oresult[i] = val
11191131
elif util.is_string_object(val):
11201132
if len(val) == 0:
@@ -1166,7 +1178,7 @@ def array_to_timedelta64(ndarray[object] values, coerce=True):
11661178

11671179
result[i] = val
11681180

1169-
elif util._checknull(val) or val == iNaT or val is NaT:
1181+
elif _checknull_with_nat(val):
11701182
result[i] = iNaT
11711183

11721184
else:
@@ -1316,7 +1328,7 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
13161328
iresult[i] = iNaT
13171329
continue
13181330
else:
1319-
if util._checknull(val) or val is NaT:
1331+
if _checknull_with_nat(val):
13201332
iresult[i] = iNaT
13211333
continue
13221334
else:

vb_suite/indexing.py

+7
Original file line numberDiff line numberDiff line change
@@ -167,3 +167,10 @@
167167

168168
frame_loc_dups = Benchmark('df2.loc[idx]', setup,
169169
start_date=datetime(2013, 1, 1))
170+
171+
setup = common_setup + """
172+
df = DataFrame(dict( A = [ 'foo'] * 1000000))
173+
"""
174+
175+
frame_iloc_big = Benchmark('df.iloc[:100,0]', setup,
176+
start_date=datetime(2013, 1, 1))

0 commit comments

Comments
 (0)