diff --git a/doc/source/release.rst b/doc/source/release.rst
index e49812b207921..a932eda55ff32 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -335,6 +335,9 @@ Experimental Features
 - A :meth:`~pandas.DataFrame.query` method has been added that allows you to
   select elements of a ``DataFrame`` using a natural query syntax nearly
   identical to Python syntax.
+- ``pd.eval`` and friends now evaluate operations involving ``datetime64``
+  objects in Python space because ``numexpr`` cannot handle ``NaT`` values
+  (:issue:`4897`).
 
 .. _release.bug_fixes-0.13.0:
 
diff --git a/pandas/computation/align.py b/pandas/computation/align.py
index 60975bdc8a5b4..f420d0dacf34c 100644
--- a/pandas/computation/align.py
+++ b/pandas/computation/align.py
@@ -111,14 +111,20 @@ def _align_core(terms):
     typ = biggest._constructor
     axes = biggest.axes
     naxes = len(axes)
+    gt_than_one_axis = naxes > 1
 
-    for term in (terms[i] for i in term_index):
-        for axis, items in enumerate(term.value.axes):
-            if isinstance(term.value, pd.Series) and naxes > 1:
-                ax, itm = naxes - 1, term.value.index
+    for value in (terms[i].value for i in term_index):
+        is_series = isinstance(value, pd.Series)
+        is_series_and_gt_one_axis = is_series and gt_than_one_axis
+
+        for axis, items in enumerate(value.axes):
+            if is_series_and_gt_one_axis:
+                ax, itm = naxes - 1, value.index
             else:
                 ax, itm = axis, items
-            axes[ax] = axes[ax].join(itm, how='outer')
+
+            if not axes[ax].is_(itm):
+                axes[ax] = axes[ax].join(itm, how='outer')
 
     for i, ndim in compat.iteritems(ndims):
         for axis, items in zip(range(ndim), axes):
@@ -136,7 +142,7 @@ def _align_core(terms):
                 warnings.warn("Alignment difference on axis {0} is larger"
                               " than an order of magnitude on term {1!r}, "
                               "by more than {2:.4g}; performance may suffer"
-                              "".format(axis, term.name, ordm),
+                              "".format(axis, terms[i].name, ordm),
                               category=pd.io.common.PerformanceWarning)
 
     if transpose:
diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py
index ff9adc26b8201..ba2dffa9e71b8 100644
--- a/pandas/computation/expr.py
+++ b/pandas/computation/expr.py
@@ -493,8 +493,15 @@ def _possibly_evaluate_binop(self, op, op_class, lhs, rhs,
                                  maybe_eval_in_python=('==', '!=')):
         res = op(lhs, rhs)
 
-        # "in"/"not in" ops are always evaluated in python
+        if (res.op in _cmp_ops_syms and
+                lhs.is_datetime or rhs.is_datetime and
+                self.engine != 'pytables'):
+            # all date ops must be done in python bc numexpr doesn't work well
+            # with NaT
+            return self._possibly_eval(res, self.binary_ops)
+
         if res.op in eval_in_python:
+            # "in"/"not in" ops are always evaluated in python
             return self._possibly_eval(res, eval_in_python)
         elif (lhs.return_type == object or rhs.return_type == object and
               self.engine != 'pytables'):
diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py
index debc79e33968c..fd5ee159fe2b4 100644
--- a/pandas/computation/ops.py
+++ b/pandas/computation/ops.py
@@ -5,6 +5,7 @@
 import operator as op
 from functools import partial
 from itertools import product, islice, chain
+from datetime import datetime
 
 import numpy as np
 
@@ -161,24 +162,16 @@ def raw(self):
                                                  self.type))
 
     @property
-    def kind(self):
+    def is_datetime(self):
         try:
-            return self.type.__name__
+            t = self.type.type
         except AttributeError:
-            return self.type.type.__name__
+            t = self.type
+
+        return issubclass(t, (datetime, np.datetime64))
 
     @property
     def value(self):
-        kind = self.kind.lower()
-        if kind == 'datetime64':
-            try:
-                return self._value.asi8
-            except AttributeError:
-                return self._value.view('i8')
-        elif kind == 'datetime':
-            return pd.Timestamp(self._value)
-        elif kind == 'timestamp':
-            return self._value.asm8.view('i8')
         return self._value
 
     @value.setter
@@ -248,6 +241,15 @@ def return_type(self):
     def isscalar(self):
         return all(operand.isscalar for operand in self.operands)
 
+    @property
+    def is_datetime(self):
+        try:
+            t = self.return_type.type
+        except AttributeError:
+            t = self.return_type
+
+        return issubclass(t, (datetime, np.datetime64))
+
 
 def _in(x, y):
     """Compute the vectorized membership of ``x in y`` if possible, otherwise
@@ -424,24 +426,20 @@ def stringify(value):
 
         lhs, rhs = self.lhs, self.rhs
 
-        if (is_term(lhs) and lhs.kind.startswith('datetime') and is_term(rhs)
-            and rhs.isscalar):
+        if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.isscalar:
             v = rhs.value
             if isinstance(v, (int, float)):
                 v = stringify(v)
-            v = _ensure_decoded(v)
-            v = pd.Timestamp(v)
+            v = pd.Timestamp(_ensure_decoded(v))
             if v.tz is not None:
                 v = v.tz_convert('UTC')
             self.rhs.update(v)
 
-        if (is_term(rhs) and rhs.kind.startswith('datetime') and
-            is_term(lhs) and lhs.isscalar):
+        if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.isscalar:
             v = lhs.value
             if isinstance(v, (int, float)):
                 v = stringify(v)
-            v = _ensure_decoded(v)
-            v = pd.Timestamp(v)
+            v = pd.Timestamp(_ensure_decoded(v))
             if v.tz is not None:
                 v = v.tz_convert('UTC')
             self.lhs.update(v)
diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py
index 3554b8a3f81e1..e9201c233753f 100755
--- a/pandas/computation/tests/test_eval.py
+++ b/pandas/computation/tests/test_eval.py
@@ -1003,7 +1003,7 @@ def check_performance_warning_for_poor_alignment(self, engine, parser):
         expected = ("Alignment difference on axis {0} is larger"
                     " than an order of magnitude on term {1!r}, "
                     "by more than {2:.4g}; performance may suffer"
-                    "".format(1, 's', np.log10(s.size - df.shape[1])))
+                    "".format(1, 'df', np.log10(s.size - df.shape[1])))
         assert_equal(msg, expected)
 
     def test_performance_warning_for_poor_alignment(self):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 7b9a75753136e..01e0d74ef8ce6 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1894,29 +1894,6 @@ def _getitem_frame(self, key):
             raise ValueError('Must pass DataFrame with boolean values only')
         return self.where(key)
 
-    def _get_index_resolvers(self, axis):
-        # index or columns
-        axis_index = getattr(self, axis)
-        d = dict()
-
-        for i, name in enumerate(axis_index.names):
-            if name is not None:
-                key = level = name
-            else:
-                # prefix with 'i' or 'c' depending on the input axis
-                # e.g., you must do ilevel_0 for the 0th level of an unnamed
-                # multiiindex
-                level_string = '{prefix}level_{i}'.format(prefix=axis[0], i=i)
-                key = level_string
-                level = i
-
-            d[key] = Series(axis_index.get_level_values(level).values,
-                            index=axis_index, name=level)
-
-        # put the index/columns itself in the dict
-        d[axis] = axis_index
-        return d
-
     def query(self, expr, **kwargs):
         """Query the columns of a frame with a boolean expression.
 
@@ -2037,8 +2014,7 @@ def eval(self, expr, **kwargs):
         """
         resolvers = kwargs.pop('resolvers', None)
         if resolvers is None:
-            index_resolvers = self._get_index_resolvers('index')
-            index_resolvers.update(self._get_index_resolvers('columns'))
+            index_resolvers = self._get_resolvers()
             resolvers = [self, index_resolvers]
         kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs)
         return _eval(expr, **kwargs)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 4553e4804e98b..705679136c3d2 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -272,6 +272,42 @@ def _get_block_manager_axis(self, axis):
             return m - axis
         return axis
 
+    def _get_axis_resolvers(self, axis):
+        # index or columns
+        axis_index = getattr(self, axis)
+        d = dict()
+        prefix = axis[0]
+
+        for i, name in enumerate(axis_index.names):
+            if name is not None:
+                key = level = name
+            else:
+                # prefix with 'i' or 'c' depending on the input axis
+                # e.g., you must do ilevel_0 for the 0th level of an unnamed
+                # multiiindex
+                key = '{prefix}level_{i}'.format(prefix=prefix, i=i)
+                level = i
+
+            level_values = axis_index.get_level_values(level)
+            s = level_values.to_series()
+            s.index = axis_index
+            d[key] = s
+
+        # put the index/columns itself in the dict
+        if isinstance(axis_index, MultiIndex):
+            dindex = axis_index
+        else:
+            dindex = axis_index.to_series()
+
+        d[axis] = dindex
+        return d
+
+    def _get_resolvers(self):
+        d = {}
+        for axis_name in self._AXIS_ORDERS:
+            d.update(self._get_axis_resolvers(axis_name))
+        return d
+
     @property
     def _info_axis(self):
         return getattr(self, self._info_axis_name)
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index a6f806d5ce097..e5d2bb17ec7a8 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -11423,6 +11423,57 @@ def test_query_with_partially_named_multiindex(self):
         for parser, engine in product(['pandas'], ENGINES):
             yield self.check_query_with_partially_named_multiindex, parser, engine
 
+    def test_query_multiindex_get_index_resolvers(self):
+        for parser, engine in product(['pandas'], ENGINES):
+            yield self.check_query_multiindex_get_index_resolvers, parser, engine
+
+    def check_query_multiindex_get_index_resolvers(self, parser, engine):
+        df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs'])
+        resolvers = df._get_resolvers()
+
+        def to_series(mi, level):
+            level_values = mi.get_level_values(level)
+            s = level_values.to_series()
+            s.index = mi
+            return s
+
+        col_series = df.columns.to_series()
+        expected = {'index': df.index,
+                    'columns': col_series,
+                    'spam': to_series(df.index, 'spam'),
+                    'eggs': to_series(df.index, 'eggs'),
+                    'C0': col_series}
+        for k, v in resolvers.items():
+            if isinstance(v, Index):
+                assert v.is_(expected[k])
+            elif isinstance(v, Series):
+                print(k)
+                tm.assert_series_equal(v, expected[k])
+            else:
+                raise AssertionError("object must be a Series or Index")
+
+    def test_raise_on_panel_with_multiindex(self):
+        for parser, engine in product(PARSERS, ENGINES):
+            yield self.check_raise_on_panel_with_multiindex, parser, engine
+
+    def check_raise_on_panel_with_multiindex(self, parser, engine):
+        skip_if_no_ne()
+        p = tm.makePanel(7)
+        p.items = tm.makeCustomIndex(len(p.items), nlevels=2)
+        with tm.assertRaises(NotImplementedError):
+            pd.eval('p + 1', parser=parser, engine=engine)
+
+    def test_raise_on_panel4d_with_multiindex(self):
+        for parser, engine in product(PARSERS, ENGINES):
+            yield self.check_raise_on_panel4d_with_multiindex, parser, engine
+
+    def check_raise_on_panel4d_with_multiindex(self, parser, engine):
+        skip_if_no_ne()
+        p4d = tm.makePanel4D(7)
+        p4d.items = tm.makeCustomIndex(len(p4d.items), nlevels=2)
+        with tm.assertRaises(NotImplementedError):
+            pd.eval('p4d + 1', parser=parser, engine=engine)
+
 
 class TestDataFrameQueryNumExprPandas(unittest.TestCase):
 
     @classmethod
@@ -11446,6 +11497,71 @@ def test_date_query_method(self):
         expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
         assert_frame_equal(res, expec)
 
+    def test_date_query_with_NaT(self):
+        engine, parser = self.engine, self.parser
+        n = 10
+        df = DataFrame(randn(n, 3))
+        df['dates1'] = date_range('1/1/2012', periods=n)
+        df['dates2'] = date_range('1/1/2013', periods=n)
+        df['dates3'] = date_range('1/1/2014', periods=n)
+        df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
+        df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT
+        res = df.query('dates1 < 20130101 < dates3', engine=engine,
+                       parser=parser)
+        expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(res, expec)
+
+    def test_date_index_query(self):
+        engine, parser = self.engine, self.parser
+        n = 10
+        df = DataFrame(randn(n, 3))
+        df['dates1'] = date_range('1/1/2012', periods=n)
+        df['dates3'] = date_range('1/1/2014', periods=n)
+        df.set_index('dates1', inplace=True, drop=True)
+        res = df.query('index < 20130101 < dates3', engine=engine,
+                       parser=parser)
+        expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(res, expec)
+
+    def test_date_index_query_with_NaT(self):
+        engine, parser = self.engine, self.parser
+        n = 10
+        df = DataFrame(randn(n, 3))
+        df['dates1'] = date_range('1/1/2012', periods=n)
+        df['dates3'] = date_range('1/1/2014', periods=n)
+        df.iloc[0, 0] = pd.NaT
+        df.set_index('dates1', inplace=True, drop=True)
+        res = df.query('index < 20130101 < dates3', engine=engine,
+                       parser=parser)
+        expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(res, expec)
+
+    def test_date_index_query_with_NaT_duplicates(self):
+        engine, parser = self.engine, self.parser
+        n = 10
+        d = {}
+        d['dates1'] = date_range('1/1/2012', periods=n)
+        d['dates3'] = date_range('1/1/2014', periods=n)
+        df = DataFrame(d)
+        df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
+        df.set_index('dates1', inplace=True, drop=True)
+        res = df.query('index < 20130101 < dates3', engine=engine, parser=parser)
+        expec = df[(df.index.to_series() < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(res, expec)
+
+    def test_date_query_with_non_date(self):
+        engine, parser = self.engine, self.parser
+
+        n = 10
+        df = DataFrame({'dates': date_range('1/1/2012', periods=n),
+                        'nondate': np.arange(n)})
+
+        ops = '==', '!=', '<', '>', '<=', '>='
+
+        for op in ops:
+            with tm.assertRaises(TypeError):
+                df.query('dates %s nondate' % op, parser=parser, engine=engine)
+
     def test_query_scope(self):
         engine, parser = self.engine, self.parser
         from pandas.computation.common import NameResolutionError
@@ -11608,6 +11724,57 @@ def test_date_query_method(self):
         expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
         assert_frame_equal(res, expec)
 
+    def test_date_query_with_NaT(self):
+        engine, parser = self.engine, self.parser
+        n = 10
+        df = DataFrame(randn(n, 3))
+        df['dates1'] = date_range('1/1/2012', periods=n)
+        df['dates2'] = date_range('1/1/2013', periods=n)
+        df['dates3'] = date_range('1/1/2014', periods=n)
+        df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
+        df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT
+        res = df.query('(dates1 < 20130101) & (20130101 < dates3)',
+                       engine=engine, parser=parser)
+        expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(res, expec)
+
+    def test_date_index_query(self):
+        engine, parser = self.engine, self.parser
+        n = 10
+        df = DataFrame(randn(n, 3))
+        df['dates1'] = date_range('1/1/2012', periods=n)
+        df['dates3'] = date_range('1/1/2014', periods=n)
+        df.set_index('dates1', inplace=True, drop=True)
+        res = df.query('(index < 20130101) & (20130101 < dates3)',
+                       engine=engine, parser=parser)
+        expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(res, expec)
+
+    def test_date_index_query_with_NaT(self):
+        engine, parser = self.engine, self.parser
+        n = 10
+        df = DataFrame(randn(n, 3))
+        df['dates1'] = date_range('1/1/2012', periods=n)
+        df['dates3'] = date_range('1/1/2014', periods=n)
+        df.iloc[0, 0] = pd.NaT
+        df.set_index('dates1', inplace=True, drop=True)
+        res = df.query('(index < 20130101) & (20130101 < dates3)',
+                       engine=engine, parser=parser)
+        expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(res, expec)
+
+    def test_date_index_query_with_NaT_duplicates(self):
+        engine, parser = self.engine, self.parser
+        n = 10
+        df = DataFrame(randn(n, 3))
+        df['dates1'] = date_range('1/1/2012', periods=n)
+        df['dates3'] = date_range('1/1/2014', periods=n)
+        df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
+        df.set_index('dates1', inplace=True, drop=True)
+        with tm.assertRaises(NotImplementedError):
+            res = df.query('index < 20130101 < dates3', engine=engine,
+                           parser=parser)
+
     def test_nested_scope(self):
         engine = self.engine
         parser = self.parser
diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py
index 8293f650425e3..fc84dd8bcdb81 100644
--- a/vb_suite/binary_ops.py
+++ b/vb_suite/binary_ops.py
@@ -106,7 +106,7 @@
 setup = common_setup + """
 N = 1000000
 halfway = N // 2 - 1
-s = Series(date_range('20010101', periods=N, freq='D'))
+s = Series(date_range('20010101', periods=N, freq='T'))
 ts = s[halfway]
 """
 
diff --git a/vb_suite/eval.py b/vb_suite/eval.py
index c666cd431cbb4..506d00b8bf9f9 100644
--- a/vb_suite/eval.py
+++ b/vb_suite/eval.py
@@ -47,12 +47,12 @@
 eval_frame_mult_all_threads = \
     Benchmark("pd.eval('df * df2 * df3 * df4')", common_setup,
               name='eval_frame_mult_all_threads',
-              start_date=datetime(2012, 7, 21))
+              start_date=datetime(2013, 7, 21))
 
 eval_frame_mult_one_thread = \
     Benchmark("pd.eval('df * df2 * df3 * df4')", setup,
               name='eval_frame_mult_one_thread',
-              start_date=datetime(2012, 7, 26))
+              start_date=datetime(2013, 7, 26))
 
 eval_frame_mult_python = \
     Benchmark("pdl.eval('df * df2 * df3 * df4', engine='python')",
@@ -62,7 +62,7 @@
 eval_frame_mult_python_one_thread = \
     Benchmark("pd.eval('df * df2 * df3 * df4', engine='python')", setup,
               name='eval_frame_mult_python_one_thread',
-              start_date=datetime(2012, 7, 26))
+              start_date=datetime(2013, 7, 26))
 
 #----------------------------------------------------------------------
 # multi and
@@ -71,12 +71,12 @@
     Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')",
               common_setup,
               name='eval_frame_and_all_threads',
-              start_date=datetime(2012, 7, 21))
+              start_date=datetime(2013, 7, 21))
 
 eval_frame_and_one_thread = \
     Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')",
               setup, name='eval_frame_and_one_thread',
-              start_date=datetime(2012, 7, 26))
+              start_date=datetime(2013, 7, 26))
 
 setup = common_setup
 eval_frame_and_python = \
@@ -88,19 +88,19 @@
     Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')",
               setup,
               name='eval_frame_and_python_one_thread',
-              start_date=datetime(2012, 7, 26))
+              start_date=datetime(2013, 7, 26))
 
 #--------------------------------------------------------------------
 # chained comp
 eval_frame_chained_cmp_all_threads = \
     Benchmark("pd.eval('df < df2 < df3 < df4')", common_setup,
               name='eval_frame_chained_cmp_all_threads',
-              start_date=datetime(2012, 7, 21))
+              start_date=datetime(2013, 7, 21))
 
 eval_frame_chained_cmp_one_thread = \
     Benchmark("pd.eval('df < df2 < df3 < df4')", setup,
               name='eval_frame_chained_cmp_one_thread',
-              start_date=datetime(2012, 7, 26))
+              start_date=datetime(2013, 7, 26))
 
 setup = common_setup
 eval_frame_chained_cmp_python = \
@@ -111,4 +111,31 @@ eval_frame_chained_cmp_one_thread = \
     Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')", setup,
               name='eval_frame_chained_cmp_python_one_thread',
-              start_date=datetime(2012, 7, 26))
+              start_date=datetime(2013, 7, 26))
+
+
+common_setup = """from pandas_vb_common import *
+"""
+
+setup = common_setup + """
+N = 1000000
+halfway = N // 2 - 1
+index = date_range('20010101', periods=N, freq='T')
+s = Series(index)
+ts = s.iloc[halfway]
+"""
+
+series_setup = setup + """
+df = DataFrame({'dates': s.values})
+"""
+
+query_datetime_series = Benchmark("df.query('dates < ts')",
+                                  series_setup,
+                                  start_date=datetime(2013, 9, 27))
+
+index_setup = setup + """
+df = DataFrame({'a': np.random.randn(N)}, index=index)
+"""
+
+query_datetime_index = Benchmark("df.query('index < ts')",
+                                 index_setup, start_date=datetime(2013, 9, 27))