Skip to content

PERF: perf improvements in indexing with object dtypes (GH5968) #5973

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 16, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ Improvements to existing features
- pd.show_versions() is now available for convenience when reporting issues.
- perf improvements to Series.str.extract (:issue:`5944`)
- perf improvements in ``dtypes/ftypes`` methods (:issue:`5968`)
- perf improvements in indexing with object dtypes (:issue:`5968`)

.. _release.bug_fixes-0.13.1:

Expand Down Expand Up @@ -116,6 +117,7 @@ Bug Fixes
- Fixed ``to_datetime`` for array with both Tz-aware datetimes and ``NaT`` values (:issue:`5961`)
- Bug in rolling skew/kurtosis when passed a Series with bad data (:issue:`5749`)
- Bug in scipy ``interpolate`` methods with a datetime index (:issue:`5975`)
- Bug in NaT comparison if a mixed datetime/np.datetime64 with NaT were passed (:issue:`5968`)

pandas 0.13.0
-------------
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1889,7 +1889,10 @@ def make_block(values, items, ref_items, klass=None, ndim=None, dtype=None,

if np.prod(values.shape):
flat = values.ravel()
inferred_type = lib.infer_dtype(flat)

# try with just the first element; we just need to see if
# this is a datetime or not
inferred_type = lib.infer_dtype(flat[0:1])
if inferred_type in ['datetime', 'datetime64']:

# we have an object array that has been inferred as
Expand Down
22 changes: 14 additions & 8 deletions pandas/io/tests/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
import os
import warnings
import nose
import sys
from distutils.version import LooseVersion

import numpy as np

import pandas as pd
from pandas.core.frame import DataFrame, Series
from pandas.io.parsers import read_csv
from pandas.io.stata import read_stata, StataReader
Expand Down Expand Up @@ -66,6 +69,9 @@ def test_read_dta1(self):
tm.assert_frame_equal(parsed_13, expected)

def test_read_dta2(self):
if LooseVersion(sys.version) < '2.7':
raise nose.SkipTest('datetime interp under 2.6 is faulty')

expected = DataFrame.from_records(
[
(
Expand All @@ -89,14 +95,14 @@ def test_read_dta2(self):
datetime(2, 1, 1)
),
(
np.datetime64('NaT'),
np.datetime64('NaT'),
np.datetime64('NaT'),
np.datetime64('NaT'),
np.datetime64('NaT'),
np.datetime64('NaT'),
np.datetime64('NaT'),
np.datetime64('NaT')
pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT,
)
],
columns=['datetime_c', 'datetime_big_c', 'date', 'weekly_date',
Expand Down
2 changes: 1 addition & 1 deletion pandas/src/util.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ cdef inline is_array(object o):

cdef inline bint _checknull(object val):
try:
return val is None or (cpython.PyFloat_Check(val) and val != val)
return val is None or (cpython.PyFloat_Check(val) and val != val)
except ValueError:
return False

Expand Down
5 changes: 5 additions & 0 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,11 @@ def test_to_datetime_mixed(self):
expected = Series([NaT,Timestamp('20130408'),Timestamp('20130409')])
assert_series_equal(result,expected)

# mixed datetime/np.datetime64('NaT')
result = Series(to_datetime([dt.datetime(2000,1,1),np.datetime64('NaT')]))
expected = Series([dt.datetime(2000,1,1),NaT])
assert_series_equal(result, expected)

def test_dayfirst(self):

# GH 3341
Expand Down
38 changes: 25 additions & 13 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import numpy as np
from cpython cimport (
PyTypeObject,
PyFloat_Check,
PyLong_Check,
PyObject_RichCompareBool,
PyObject_RichCompare,
PyString_Check,
Expand Down Expand Up @@ -55,6 +56,9 @@ cdef int64_t NPY_NAT = util.get_nat()
# < numpy 1.7 compat for NaT
compat_NaT = np.array([NPY_NAT]).astype('m8[ns]').item()

# numpy actual nat object
np_NaT = np.datetime64('NaT',dtype='M8')

try:
basestring
except NameError: # py3
Expand Down Expand Up @@ -416,6 +420,11 @@ NaT = NaTType()
iNaT = util.get_nat()


cdef inline bint _checknull_with_nat(object val):
    """ Return True if val is a null marker: None, a float NaN, or NaT.

    Only the ``NaT`` singleton itself is recognised (identity check);
    any other object falls through to False.
    """
    if val is None:
        return True
    # NaN is the only float that compares unequal to itself
    if PyFloat_Check(val) and val != val:
        return True
    return val is NaT

# Compare NaT (lhs) against a Timestamp (rhs) for comparison code ``op``.
# The outcome is read straight from ``_nat_scalar_rules[op]``; neither
# operand's value is inspected.
cdef inline bint _cmp_nat_dt(_NaT lhs, _Timestamp rhs, int op) except -1:
return _nat_scalar_rules[op]

Expand Down Expand Up @@ -761,7 +770,7 @@ cdef convert_to_tsobject(object ts, object tz, object unit):

obj = _TSObject()

if ts is None or ts is NaT:
if ts is None or ts is NaT or ts is np_NaT:
obj.value = NPY_NAT
elif is_datetime64_object(ts):
obj.value = _get_datetime64_nanos(ts)
Expand Down Expand Up @@ -933,7 +942,7 @@ def datetime_to_datetime64(ndarray[object] values):
iresult = result.view('i8')
for i in range(n):
val = values[i]
if util._checknull(val) or val is NaT:
if _checknull_with_nat(val):
iresult[i] = iNaT
elif PyDateTime_Check(val):
if val.tzinfo is not None:
Expand Down Expand Up @@ -999,7 +1008,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
iresult = result.view('i8')
for i in range(n):
val = values[i]
if util._checknull(val) or val is NaT:
if _checknull_with_nat(val):
iresult[i] = iNaT
elif PyDateTime_Check(val):
if val.tzinfo is not None:
Expand Down Expand Up @@ -1038,13 +1047,16 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
continue
raise
elif util.is_datetime64_object(val):
try:
iresult[i] = _get_datetime64_nanos(val)
except ValueError:
if coerce:
iresult[i] = iNaT
continue
raise
if val == np_NaT:
iresult[i] = iNaT
else:
try:
iresult[i] = _get_datetime64_nanos(val)
except ValueError:
if coerce:
iresult[i] = iNaT
continue
raise

# if we are coercing, don't allow integers
elif util.is_integer_object(val) and not coerce:
Expand Down Expand Up @@ -1114,7 +1126,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,

for i in range(n):
val = values[i]
if util._checknull(val):
if _checknull_with_nat(val):
oresult[i] = val
elif util.is_string_object(val):
if len(val) == 0:
Expand Down Expand Up @@ -1166,7 +1178,7 @@ def array_to_timedelta64(ndarray[object] values, coerce=True):

result[i] = val

elif util._checknull(val) or val == iNaT or val is NaT:
elif _checknull_with_nat(val):
result[i] = iNaT

else:
Expand Down Expand Up @@ -1316,7 +1328,7 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
iresult[i] = iNaT
continue
else:
if util._checknull(val) or val is NaT:
if _checknull_with_nat(val):
iresult[i] = iNaT
continue
else:
Expand Down
7 changes: 7 additions & 0 deletions vb_suite/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,10 @@

frame_loc_dups = Benchmark('df2.loc[idx]', setup,
start_date=datetime(2013, 1, 1))

# Benchmark: take the first 100 rows of column 0 with ``iloc`` from a
# single-column frame holding 1,000,000 copies of the string 'foo'.
setup = common_setup + """
df = DataFrame(dict( A = [ 'foo'] * 1000000))
"""

frame_iloc_big = Benchmark('df.iloc[:100,0]', setup,
start_date=datetime(2013, 1, 1))