diff --git a/bench/bench_with_subset.R b/bench/bench_with_subset.R
new file mode 100644
index 0000000000000..69d0f7a9eec63
--- /dev/null
+++ b/bench/bench_with_subset.R
@@ -0,0 +1,53 @@
+library(microbenchmark)
+library(data.table)
+
+
+data.frame.subset.bench <- function (n=1e7, times=30) {
+    df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n))
+    print(microbenchmark(subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c),
+                         times=times))
+}
+
+
+# data.table allows something very similar to query with an expression
+# but we have chained comparisons AND we're faster BOO YAH!
+data.table.subset.expression.bench <- function (n=1e7, times=30) {
+    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
+    print(microbenchmark(dt[, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c],
+                         times=times))
+}
+
+
+# compare against subset with data.table for good measure
+data.table.subset.bench <- function (n=1e7, times=30) {
+    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
+    print(microbenchmark(subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c),
+                         times=times))
+}
+
+
+data.frame.with.bench <- function (n=1e7, times=30) {
+    df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n))
+
+    print(microbenchmark(with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3),
+                         times=times))
+}
+
+
+data.table.with.bench <- function (n=1e7, times=30) {
+    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
+    print(microbenchmark(with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3),
+                         times=times))
+}
+
+
+bench <- function () {
+    data.frame.subset.bench()
+    data.table.subset.expression.bench()
+    data.table.subset.bench()
+    data.frame.with.bench()
+    data.table.with.bench()
+}
+
+
+bench()
diff --git a/bench/bench_with_subset.py b/bench/bench_with_subset.py
new file mode 100644
index 0000000000000..99b98c9838a90
--- /dev/null
+++ b/bench/bench_with_subset.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+"""
+Microbenchmarks for comparison with R's "with" and "subset" functions
+"""
+
+from __future__ import print_function
+import numpy as np
+from numpy import array
+from timeit import repeat as timeit
+from pandas.compat import range, zip
+from pandas import DataFrame
+
+
+setup_common = """from pandas import DataFrame
+from numpy.random import randn
+df = DataFrame(randn(%d, 3), columns=list('abc'))
+%s"""
+
+
+setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
+
+
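+# Each bench_* function returns an array of `repeat` per-call timings in
+# seconds: timeit's repeat() reports the total for `times` calls per run,
+# so we divide by `times`.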
+def bench_with(n, times=10, repeat=3, engine='numexpr'):
+    return np.array(timeit('df.eval(s, engine=%r)' % engine,
+                           setup=setup_common % (n, setup_with),
+                           repeat=repeat, number=times)) / times
+
+
+setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"
+
+
+def bench_subset(n, times=10, repeat=3, engine='numexpr'):
+    return np.array(timeit('df.query(s, engine=%r)' % engine,
+                           setup=setup_common % (n, setup_subset),
+                           repeat=repeat, number=times)) / times
+
+
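+# Sweep frame sizes over a log grid from 10**mn to 10**mx rows, timing
+# eval ("with") and query ("subset") once per size for each engine.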
+def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False):
+    r = np.logspace(mn, mx, num=num).round().astype(int)
+
+    ev = DataFrame(np.empty((num, len(engines))), columns=engines)
+    qu = ev.copy(deep=True)
+
+    ev['size'] = qu['size'] = r
+
+    for engine in engines:
+        for i, n in enumerate(r):
+            if verbose:
+                print('engine: %r, i == %d' % (engine, i))
+            ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine)
+            qu.loc[i, engine] = bench_subset(n, times=1, repeat=1,
+                                             engine=engine)
+
+    return ev, qu
+
+
+def plot_perf(df, engines, title, filename=None):
+    from matplotlib.pyplot import figure, rc
+
+    try:
+        from mpltools import style
+    except ImportError:
+        pass
+    else:
+        style.use('ggplot')
+
+    rc('text', usetex=True)
+
+    fig = figure(figsize=(4, 3), dpi=100)
+    ax = fig.add_subplot(111)
+
+    for engine in engines:
+        # plot the 'size' column, not DataFrame.size (the element count)
+        ax.plot(df['size'], df[engine], label=engine, lw=2)
+
+    ax.set_xlabel('Number of Rows')
+    ax.set_ylabel('Time (s)')
+    ax.set_title(title)
+    ax.legend(loc='best')
+    ax.tick_params(top=False, right=False)
+
+    fig.tight_layout()
+
+    if filename is not None:
+        fig.savefig(filename)
+
+
+if __name__ == '__main__':
+    import os
+    import pandas as pd
+
+    pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
+    static_path = os.path.join(pandas_dir, 'doc', 'source', '_static')
+
+    join = lambda p: os.path.join(static_path, p)
+
+    fn = join('eval-query-perf-data.h5')
+
+    engines = 'python', 'numexpr'
+
+    if not os.path.exists(fn):
+        ev, qu = bench(verbose=True)
+        ev.to_hdf(fn, 'eval')
+        qu.to_hdf(fn, 'query')
+    else:
+        ev = pd.read_hdf(fn, 'eval')
+        qu = pd.read_hdf(fn, 'query')
+
+    plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png'))
+    plot_perf(qu, engines, 'DataFrame.query()',
+              filename=join('query-perf.png'))
+
+    # select on the 'size' column, not the DataFrame.size attribute
+    plot_perf(ev[ev['size'] <= 50000], engines, 'DataFrame.eval()',
+              filename=join('eval-perf-small.png'))
+    plot_perf(qu[qu['size'] <= 100000], engines, 'DataFrame.query()',
+              filename=join('query-perf-small.png'))
diff --git a/doc/source/_static/eval-perf-small.png b/doc/source/_static/eval-perf-small.png
new file mode 100644
index 0000000000000..d86018363ffdc
Binary files /dev/null and b/doc/source/_static/eval-perf-small.png differ
diff --git a/doc/source/_static/eval-perf.png b/doc/source/_static/eval-perf.png
new file mode 100644
index 0000000000000..14c69c1b85d9e
Binary files /dev/null and b/doc/source/_static/eval-perf.png differ
diff --git a/doc/source/_static/query-perf-small.png b/doc/source/_static/query-perf-small.png
new file mode 100644
index 0000000000000..56fcc787a66af
Binary files /dev/null and b/doc/source/_static/query-perf-small.png differ
diff --git a/doc/source/_static/query-perf.png b/doc/source/_static/query-perf.png
new file mode 100644
index 0000000000000..d96318df94357
Binary files /dev/null and b/doc/source/_static/query-perf.png differ
diff --git a/doc/source/api.rst b/doc/source/api.rst
index 538965d0be7ad..28c1515e93bc5 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -155,6 +155,17 @@ Top-level dealing with datetimes
to_datetime
+Top-level evaluation
+~~~~~~~~~~~~~~~~~~~~
+
+.. currentmodule:: pandas
+
+.. autosummary::
+ :toctree: generated/
+
+ eval
+
+
Standard moving window functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -452,6 +463,7 @@ Indexing, iteration
DataFrame.tail
DataFrame.xs
DataFrame.isin
+ DataFrame.query
Binary operator functions
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -502,6 +514,7 @@ Computations / Descriptive Stats
DataFrame.cumsum
DataFrame.describe
DataFrame.diff
+ DataFrame.eval
DataFrame.kurt
DataFrame.mad
DataFrame.max
diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst
index 5759768051c0e..ef609aaa7d70c 100644
--- a/doc/source/comparison_with_r.rst
+++ b/doc/source/comparison_with_r.rst
@@ -1,28 +1,87 @@
.. currentmodule:: pandas
.. _compare_with_r:
-*******************************
Comparison with R / R libraries
*******************************
-Since pandas aims to provide a lot of the data manipulation and analysis
-functionality that people use R for, this page was started to provide a more
-detailed look at the R language and it's many 3rd party libraries as they
-relate to pandas. In offering comparisons with R and CRAN libraries, we care
-about the following things:
+Since ``pandas`` aims to provide a lot of the data manipulation and analysis
+functionality that people use `R <http://www.r-project.org/>`__ for, this page
+was started to provide a more detailed look at the `R language
+<http://en.wikipedia.org/wiki/R_(programming_language)>`__ and its many third
+party libraries as they relate to ``pandas``. In comparisons with R and CRAN
+libraries, we care about the following things:
- - **Functionality / flexibility**: what can / cannot be done with each tool
- - **Performance**: how fast are operations. Hard numbers / benchmarks are
+ - **Functionality / flexibility**: what can/cannot be done with each tool
+ - **Performance**: how fast are operations. Hard numbers/benchmarks are
preferable
- - **Ease-of-use**: is one tool easier or harder to use (you may have to be
- the judge of this given side-by-side code comparisons)
+ - **Ease-of-use**: Is one tool easier/harder to use (you may have to be
+ the judge of this, given side-by-side code comparisons)
+
+This page is also here to offer a bit of a translation guide for users of these
+R packages.
+
+Base R
+------
+
+|subset|_
+~~~~~~~~~~
+
+.. versionadded:: 0.13
+
+The :meth:`~pandas.DataFrame.query` method is similar to the base R ``subset``
+function. In R you might want to get the rows of a ``data.frame`` where one
+column's values are less than another column's values:
+
+.. code-block:: r
+
+   df <- data.frame(a=rnorm(10), b=rnorm(10))
+   subset(df, a <= b)
+   df[df$a <= df$b,]  # note the comma
+
+In ``pandas``, there are a few ways to perform subsetting: you can use
+:meth:`~pandas.DataFrame.query`, standard ``[]`` boolean indexing, or
+boolean indexing with ``loc``:
+
+.. ipython:: python
+
+   from pandas import DataFrame
+   from numpy.random import randn
+
+   df = DataFrame({'a': randn(10), 'b': randn(10)})
+   df.query('a <= b')
+   df[df.a <= df.b]
+   df.loc[df.a <= df.b]
-As I do not have an encyclopedic knowledge of R packages, feel free to suggest
-additional CRAN packages to add to this list. This is also here to offer a big
-of a translation guide for users of these R packages.
+For more details and examples see :ref:`the query documentation
+<indexing.query>`.
-data.frame
-----------
+
+|with|_
+~~~~~~~~
+
+.. versionadded:: 0.13
+
+An expression using a ``data.frame`` called ``df`` in R with the columns ``a`` and
+``b`` would be evaluated using ``with`` like so:
+
+.. code-block:: r
+
+   df <- data.frame(a=rnorm(10), b=rnorm(10))
+   with(df, a + b)
+   df$a + df$b  # same as the previous expression
+
+In ``pandas`` the equivalent expression, using the
+:meth:`~pandas.DataFrame.eval` method, would be:
+
+.. ipython:: python
+
+   df = DataFrame({'a': randn(10), 'b': randn(10)})
+   df.eval('a + b')
+   df.a + df.b  # same as the previous expression
+
+In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than
+evaluation in pure Python. For more details and examples see :ref:`the eval
+documentation <enhancingperf.eval>`.
zoo
---
@@ -36,3 +95,9 @@ plyr
reshape / reshape2
------------------
+
+.. |with| replace:: ``with``
+.. _with: http://finzi.psych.upenn.edu/R/library/base/html/with.html
+
+.. |subset| replace:: ``subset``
+.. _subset: http://finzi.psych.upenn.edu/R/library/base/html/subset.html
diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
index 95428bd27e2a2..87b68248c3e9e 100644
--- a/doc/source/enhancingperf.rst
+++ b/doc/source/enhancingperf.rst
@@ -225,8 +225,8 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra
.. note::
- Loop like this would be *extremely* slow in python, but in cython looping over
- numpy arrays is *fast*.
+ Loops like this would be *extremely* slow in Python, but in Cython looping
+ over numpy arrays is *fast*.
.. ipython:: python
@@ -289,3 +289,262 @@ Further topics
- Loading C modules into cython.
Read more in the `cython docs <http://docs.cython.org/>`__.
+
+.. _enhancingperf.eval:
+
+Expression Evaluation via :func:`~pandas.eval` (Experimental)
+-------------------------------------------------------------
+
+.. versionadded:: 0.13
+
+The top-level function :func:`~pandas.eval` implements expression evaluation of
+:class:`~pandas.Series` and :class:`~pandas.DataFrame` objects.
+
+.. note::
+
+ To benefit from using :func:`~pandas.eval` you need to
+ install ``numexpr``. See the :ref:`recommended dependencies section
+ <install.recommended_dependencies>` for more details.
+
+The point of using :func:`~pandas.eval` for expression evaluation rather than
+plain Python is two-fold: 1) large :class:`~pandas.DataFrame` objects are
+evaluated more efficiently and 2) large arithmetic and boolean expressions are
+evaluated all at once by the underlying engine (by default ``numexpr`` is used
+for evaluation).
+
+.. note::
+
+ You should not use :func:`~pandas.eval` for simple
+ expressions or for expressions involving small DataFrames. In fact,
+ :func:`~pandas.eval` is many orders of magnitude slower for
+ smaller expressions/objects than plain ol' Python. A good rule of thumb is
+ to only use :func:`~pandas.eval` when you have a
+ :class:`~pandas.DataFrame` with more than 10,000 rows.
+
+
+:func:`~pandas.eval` supports all arithmetic expressions supported by the
+engine in addition to some extensions available only in pandas.
+
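+For example, chained comparisons are a pandas-only extension (a minimal
+sketch; ``df1``, ``df2`` and ``df3`` here stand for any conformable frames,
+such as the ones created in the examples below):
+
+.. code-block:: python
+
+   # evaluated in a single pass, equivalent to (df1 < df2) & (df2 <= df3)
+   pd.eval('df1 < df2 <= df3')
+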
+.. note::
+
+ The larger the frame and the larger the expression, the more speedup you will
+ see from using :func:`~pandas.eval`.
+
+
+:func:`~pandas.eval` Examples
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:func:`~pandas.eval` works wonders for expressions containing large arrays.
+
+First let's create 4 decent-sized arrays to play with:
+
+.. ipython:: python
+
+ import pandas as pd
+ from pandas import DataFrame, Series
+ from numpy.random import randn
+ import numpy as np
+ nrows, ncols = 20000, 100
+ df1, df2, df3, df4 = [DataFrame(randn(nrows, ncols)) for _ in xrange(4)]
+
+
+Now let's compare adding them together using plain ol' Python versus
+:func:`~pandas.eval`:
+
+
+.. ipython:: python
+
+ %timeit df1 + df2 + df3 + df4
+
+.. ipython:: python
+
+ %timeit pd.eval('df1 + df2 + df3 + df4')
+
+
+Now let's do the same thing but with comparisons:
+
+.. ipython:: python
+
+ %timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)
+
+.. ipython:: python
+
+ %timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')
+
+
+:func:`~pandas.eval` also works with unaligned pandas objects:
+
+
+.. ipython:: python
+
+ s = Series(randn(50))
+ %timeit df1 + df2 + df3 + df4 + s
+
+.. ipython:: python
+
+ %timeit pd.eval('df1 + df2 + df3 + df4 + s')
+
+.. note::
+
+ Operations such as ``1 and 2`` should be performed in Python. An exception
+ will be raised if you try to perform any boolean or bitwise operations
+ with scalar operands that are not of type ``bool`` or ``np.bool_``. *This
+ includes bitwise operations on scalars.*
+
+The ``DataFrame.eval`` method (Experimental)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In addition to the top-level :func:`~pandas.eval` function you can also
+evaluate an expression in the "context" of a ``DataFrame``.
+
+
+.. ipython:: python
+
+ df = DataFrame(randn(5, 2), columns=['a', 'b'])
+ df.eval('a + b')
+
+
+Any expression that is a valid :func:`~pandas.eval` expression is also a valid
+``DataFrame.eval`` expression, with the added benefit that *you don't have to
+prefix the name of the* ``DataFrame`` *to the column you're interested in
+evaluating*.
+
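+For example, the two calls below compute the same thing (a minimal sketch,
+continuing the example above):
+
+.. code-block:: python
+
+   # top-level eval: the frame must be referred to by name
+   pd.eval('df.a + df.b')
+
+   # DataFrame.eval: column names are in scope directly
+   df.eval('a + b')
+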
+
+Local Variables
+~~~~~~~~~~~~~~~
+
+You can refer to local variables the same way you would in vanilla Python:
+
+.. ipython:: python
+
+ df = DataFrame(randn(5, 2), columns=['a', 'b'])
+ newcol = randn(len(df))
+ df.eval('b + newcol')
+
+.. note::
+
+ The one exception is when you have a local (or global) variable with the
+ same name as a column in the ``DataFrame``:
+
+ .. code-block:: python
+
+    df = DataFrame(randn(5, 2), columns=['a', 'b'])
+    a = randn(len(df))
+    df.eval('a + b')
+    NameResolutionError: resolvers and locals overlap on names ['a']
+
+
+ To deal with these conflicts, a special syntax exists for referring to
+ variables with the same name as a column
+
+ .. ipython:: python
+    :suppress:
+
+    a = randn(len(df))
+
+ .. ipython:: python
+
+    df.eval('@a + b')
+
+ The same is true for :meth:`~pandas.DataFrame.query`
+
+ .. ipython:: python
+
+    df.query('@a < b')
+
+ .. ipython:: python
+    :suppress:
+
+    del a
+
+
+:func:`~pandas.eval` Parsers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There are two different parsers and two different engines you can use as
+the backend.
+
+The default ``'pandas'`` parser allows a more intuitive syntax for expressing
+query-like operations (comparisons, conjunctions and disjunctions). In
+particular, the precedence of the ``&`` and ``|`` operators is made equal to
+the precedence of the corresponding boolean operations ``and`` and ``or``.
+
+For example, the above conjunction can be written without parentheses.
+Alternatively, you can use the ``'python'`` parser to enforce strict Python
+semantics.
+
+.. ipython:: python
+
+ expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)'
+ x = pd.eval(expr, parser='python')
+ expr_no_parens = 'df1 > 0 & df2 > 0 & df3 > 0 & df4 > 0'
+ y = pd.eval(expr_no_parens, parser='pandas')
+ np.all(x == y)
+
+
+The same expression can be "anded" together with the word :keyword:`and` as
+well:
+
+.. ipython:: python
+
+ expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)'
+ x = pd.eval(expr, parser='python')
+ expr_with_ands = 'df1 > 0 and df2 > 0 and df3 > 0 and df4 > 0'
+ y = pd.eval(expr_with_ands, parser='pandas')
+ np.all(x == y)
+
+
+The ``and`` and ``or`` operators here have the same precedence that they would
+in vanilla Python.
+
+
+:func:`~pandas.eval` Backends
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There's also the option to make :func:`~pandas.eval` operate identically to
+plain ol' Python.
+
+.. note::
+
+ Using the ``'python'`` engine is generally *not* useful, except for testing
+ other :func:`~pandas.eval` engines against it. You will achieve **no**
+ performance benefits using :func:`~pandas.eval` with ``engine='python'``.
+
+You can see this for yourself: using :func:`~pandas.eval` with the ``'python'``
+engine is actually a bit slower (though not by much) than evaluating the same
+expression in Python:
+
+.. ipython:: python
+
+ %timeit df1 + df2 + df3 + df4
+
+.. ipython:: python
+
+ %timeit pd.eval('df1 + df2 + df3 + df4', engine='python')
+
+
+:func:`~pandas.eval` Performance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:func:`~pandas.eval` is intended to speed up certain kinds of operations. In
+particular, those operations involving complex expressions with large
+``DataFrame``/``Series`` objects should see a significant performance benefit.
+Here is a plot showing the running time of :func:`~pandas.eval` as a function of
+the size of the frame involved in the computation. The two lines are two
+different engines.
+
+
+.. image:: _static/eval-perf.png
+
+
+.. note::
+
+ Operations with smallish objects (around 15k-20k rows) are faster using
+ plain Python:
+
+ .. image:: _static/eval-perf-small.png
+
+
+This plot was created using a ``DataFrame`` with 3 columns each containing
+floating point values generated using ``numpy.random.randn()``.
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index d2fd11ee43615..2f2a47d4b0bf2 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -26,48 +26,58 @@ The axis labeling information in pandas objects serves many purposes:
- Enables automatic and explicit data alignment
- Allows intuitive getting and setting of subsets of the data set
-In this section / chapter, we will focus on the final point: namely, how to
-slice, dice, and generally get and set subsets of pandas objects. The primary
-focus will be on Series and DataFrame as they have received more development
-attention in this area. Expect more work to be invested higher-dimensional data
-structures (including Panel) in the future, especially in label-based advanced
+In this section, we will focus on the final point: namely, how to slice, dice,
+and generally get and set subsets of pandas objects. The primary focus will be
+on Series and DataFrame as they have received more development attention in
+this area. Expect more work to be invested in higher-dimensional data
+structures (including ``Panel``) in the future, especially in label-based advanced
indexing.
.. note::
- The Python and NumPy indexing operators ``[]`` and attribute operator ``.`` provide quick and easy access to pandas data structures
- across a wide range of use cases. This makes interactive work intuitive, as
- there's little new to learn if you already know how to deal with Python
- dictionaries and NumPy arrays. However, since the type of the data to be accessed
- isn't known in advance, directly using
- standard operators has some optimization limits. For production code, we recommended
- that you take advantage of the optimized pandas data access methods exposed in this chapter.
+ The Python and NumPy indexing operators ``[]`` and attribute operator ``.``
+ provide quick and easy access to pandas data structures across a wide range
+ of use cases. This makes interactive work intuitive, as there's little new
+ to learn if you already know how to deal with Python dictionaries and NumPy
+ arrays. However, since the type of the data to be accessed isn't known in
+ advance, directly using standard operators has some optimization limits. For
+ production code, we recommend that you take advantage of the optimized
+ pandas data access methods exposed in this chapter.
.. warning::
- Whether a copy or a reference is returned for a setting operation, may depend on the context.
- This is sometimes called ``chained assignment`` and should be avoided.
- See :ref:`Returning a View versus Copy `
+ Whether a copy or a reference is returned for a setting operation may
+ depend on the context. This is sometimes called ``chained assignment`` and
+ should be avoided. See :ref:`Returning a View versus Copy
+ <indexing.view_versus_copy>`
See the :ref:`cookbook` for some advanced strategies
-Choice
-------
+Different Choices for Indexing (``loc``, ``iloc``, and ``ix``)
+--------------------------------------------------------------
+
+.. versionadded:: 0.11.0
-Starting in 0.11.0, object selection has had a number of user-requested additions in
-order to support more explicit location based indexing. Pandas now supports
-three types of multi-axis indexing.
+Object selection has had a number of user-requested additions in order to
+support more explicit location based indexing. Pandas now supports three types
+of multi-axis indexing.
-- ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, allowed inputs are:
+- ``.loc`` is strictly label based, will raise ``KeyError`` when the items are
+ not found, allowed inputs are:
- - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index)
+ - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a
+ *label* of the index. This use is **not** an integer position along the
+ index)
- A list or array of labels ``['a', 'b', 'c']``
- - A slice object with labels ``'a':'f'``, (note that contrary to usual python slices, **both** the start and the stop are included!)
+ - A slice object with labels ``'a':'f'``, (note that contrary to usual python
+ slices, **both** the start and the stop are included!)
- A boolean array
See more at :ref:`Selection by Label <indexing.label>`
-- ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of the axis), will raise ``IndexError`` when the requested indicies are out of bounds. Allowed inputs are:
+- ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of
+ the axis), will raise ``IndexError`` when the requested indices are out of
+ bounds. Allowed inputs are:
- An integer e.g. ``5``
- A list or array of integers ``[4, 3, 0]``
@@ -75,20 +85,24 @@ three types of multi-axis indexing.
See more at :ref:`Selection by Position <indexing.integer>`
-- ``.ix`` supports mixed integer and label based access. It is primarily label based, but will fallback to integer positional access. ``.ix`` is the most general
- and will support any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. ``.ix`` is especially useful when dealing with mixed positional and label
- based hierarchial indexes.
-
- As using integer slices with ``.ix`` have different behavior depending on whether the slice is interpreted as position based or label based, it's
+- ``.ix`` supports mixed integer and label based access. It is primarily label
+ based, but will fall back to integer positional access. ``.ix`` is the most
+ general and will support any of the inputs to ``.loc`` and ``.iloc``, as well
+ as support for floating point label schemes. ``.ix`` is especially useful
+ when dealing with mixed positional and label based hierarchical indexes.
+
+ As using integer slices with ``.ix`` has different behavior depending on
+ whether the slice is interpreted as position based or label based, it's
usually better to be explicit and use ``.iloc`` or ``.loc``.
- See more at :ref:`Advanced Indexing `, :ref:`Advanced Hierarchical ` and :ref:`Fallback Indexing `
+ See more at :ref:`Advanced Indexing <indexing.advanced>`, :ref:`Advanced
+ Hierarchical <indexing.advanced_hierarchical>` and :ref:`Fallback Indexing
+ <indexing.fallback>`
Getting values from an object with multi-axes selection uses the following
notation (using ``.loc`` as an example, but applies to ``.iloc`` and ``.ix`` as
well). Any of the axes accessors may be the null slice ``:``. Axes left out of
the specification are assumed to be ``:``. (e.g. ``p.loc['a']`` is equiv to
-``p.loc['a',:,:]``)
+``p.loc['a', :, :]``)
.. csv-table::
:header: "Object Type", "Indexers"
@@ -100,7 +114,7 @@ the specification are assumed to be ``:``. (e.g. ``p.loc['a']`` is equiv to
Panel; ``p.loc[item_indexer,major_indexer,minor_indexer]``
Deprecations
-~~~~~~~~~~~~
+------------
Beginning with version 0.11.0, it's recommended that you transition away from
the following methods as they *may* be deprecated in future versions.
@@ -168,7 +182,7 @@ You may find this useful for applying a transform (in-place) to a subset of the
columns.
Attribute Access
-~~~~~~~~~~~~~~~~
+----------------
.. _indexing.columns.multiple:
@@ -213,7 +227,7 @@ If you are using the IPython environment, you may also use tab-completion to
see these accessable attributes.
Slicing ranges
-~~~~~~~~~~~~~~
+--------------
The most robust and consistent way of slicing ranges along arbitrary axes is
described in the :ref:`Selection by Position ` section
@@ -247,7 +261,7 @@ largely as a convenience since it is such a common operation.
.. _indexing.label:
Selection By Label
-~~~~~~~~~~~~~~~~~~
+------------------
.. warning::
@@ -318,7 +332,7 @@ For getting a value explicity (equiv to deprecated ``df.get_value('a','A')``)
.. _indexing.integer:
Selection By Position
-~~~~~~~~~~~~~~~~~~~~~
+---------------------
.. warning::
@@ -415,7 +429,7 @@ Pandas will detect this and raise ``IndexError``, rather than return an empty st
.. _indexing.basics.partial_setting:
Setting With Enlargement
-~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------
.. versionadded:: 0.13
@@ -450,7 +464,7 @@ This is like an ``append`` operation on the ``DataFrame``.
.. _indexing.basics.get_value:
Fast scalar value getting and setting
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------------------------------
Since indexing with ``[]`` must handle a lot of cases (single-label access,
slicing, boolean indexing, etc.), it has a bit of overhead in order to figure
@@ -481,7 +495,7 @@ You can also set using these same indexers.
df
Boolean indexing
-~~~~~~~~~~~~~~~~
+----------------
.. _indexing.boolean:
@@ -572,8 +586,8 @@ You can also describe columns using integer location:
df.isin(values, iloc=True)
-Where and Masking
-~~~~~~~~~~~~~~~~~
+The :meth:`~pandas.DataFrame.where` Method and Masking
+------------------------------------------------------
Selecting values from a Series with a boolean vector generally returns a
subset of the data. To guarantee that selection output has the same shape as
@@ -673,8 +687,304 @@ This is equivalent (but faster than) the following.
s.mask(s >= 0)
df.mask(df >= 0)
+.. _indexing.query:
+
+The :meth:`~pandas.DataFrame.query` Method (Experimental)
+---------------------------------------------------------
+
+.. versionadded:: 0.13
+
+:class:`~pandas.DataFrame` objects have a :meth:`~pandas.DataFrame.query`
+method that allows selection using an expression.
+
+For example, you can select the rows of the frame where the values in column
+``b`` lie between the values of columns ``a`` and ``c``:
+
+.. ipython:: python
+ :suppress:
+
+ from numpy.random import randint, rand
+ np.random.seed(1234)
+
+.. ipython:: python
+
+ n = 10
+ df = DataFrame(rand(n, 3), columns=list('abc'))
+ df
+
+ # pure python
+ df[(df.a < df.b) & (df.b < df.c)]
+
+ # query
+ df.query('(a < b) & (b < c)')
+
+Do the same thing but fall back on a named index if there is no column
+with the name ``a``.
+
+.. ipython:: python
+
+ df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc'))
+ df.index.name = 'a'
+ df
+ df.query('a < b and b < c')
+
+If instead you don't want to or cannot name your index, you can use the name
+``index`` in your query expression:
+
+.. ipython:: python
+ :suppress:
+
+ old_index = index
+ del index
+
+.. ipython:: python
+
+ df = DataFrame(randint(n, size=(n, 2)), columns=list('bc'))
+ df
+ df.query('index < b < c')
+
+.. ipython:: python
+ :suppress:
+
+ index = old_index
+ del old_index
+
+
+:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can also use the levels of a ``DataFrame`` with a
+:class:`~pandas.MultiIndex` as if they were columns in the frame:
+
+.. ipython:: python
+
+ import pandas.util.testing as tm
+
+ n = 10
+ colors = tm.choice(['red', 'green'], size=n)
+ foods = tm.choice(['eggs', 'ham'], size=n)
+ colors
+ foods
+
+ index = MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
+ df = DataFrame(randn(n, 2), index=index)
+ df
+ df.query('color == "red"')
+
+If the levels of the ``MultiIndex`` are unnamed, you can refer to them using
+special names:
+
+
+.. ipython:: python
+
+ df.index.names = [None, None]
+ df
+ df.query('ilevel_0 == "red"')
+
+
+The convention is ``ilevel_0``, which means "index level 0" for the 0th level
+of the ``index``.
+
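+For instance, you can filter on the unnamed second level as well (a minimal
+sketch continuing the example above):
+
+.. code-block:: python
+
+   # ``ilevel_1`` refers to the unnamed second (food) level
+   df.query('ilevel_0 == "red" and ilevel_1 == "ham"')
+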
+
+:meth:`~pandas.DataFrame.query` Use Cases
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A use case for :meth:`~pandas.DataFrame.query` is when you have a collection of
+:class:`~pandas.DataFrame` objects that have a subset of column names (or index
+levels/names) in common. You can pass the same query to both frames *without*
+having to specify which frame you're interested in querying:
+
+.. ipython:: python
+
+ df = DataFrame(rand(n, 3), columns=list('abc'))
+ df
+ df2 = DataFrame(rand(n + 2, 3), columns=df.columns)
+ df2
+ expr = '0.0 <= a <= c <= 0.5'
+ map(lambda frame: frame.query(expr), [df, df2])
+
+:meth:`~pandas.DataFrame.query` Python versus pandas Syntax Comparison
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Full numpy-like syntax
+
+.. ipython:: python
+
+ df = DataFrame(randint(n, size=(n, 3)), columns=list('abc'))
+ df
+ df.query('(a < b) & (b < c)')
+ df[(df.a < df.b) & (df.b < df.c)]
+
+Slightly nicer by removing the parentheses (comparison operators bind tighter
+than ``&``/``|``)
+
+.. ipython:: python
+
+ df.query('a < b & b < c')
+
+Use English instead of symbols
+
+.. ipython:: python
+
+ df.query('a < b and b < c')
+
+Pretty close to how you might write it on paper
+
+.. ipython:: python
+
+ df.query('a < b < c')
+
+The ``in`` and ``not in`` operators
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~pandas.DataFrame.query` also supports special use of Python's ``in`` and
+``not in`` comparison operators, providing a succinct syntax for calling the
+``isin`` method of a ``Series`` or ``DataFrame``.
+
+.. ipython:: python
+ :suppress:
+
+ try:
+     old_d = d
+     del d
+ except NameError:
+     pass
+
+.. ipython:: python
+
+ # get all rows where columns "a" and "b" have overlapping values
+ df = DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
+ 'c': randint(5, size=12), 'd': randint(9, size=12)})
+ df
+ df.query('a in b')
+
+ # How you'd do it in pure Python
+ df[df.a.isin(df.b)]
+
+ df.query('a not in b')
+
+ # pure Python
+ df[~df.a.isin(df.b)]
+
+
+You can combine this with other expressions for very succinct queries:
+
+
+.. ipython:: python
+
+ # rows where cols a and b have overlapping values and col c's values are less than col d's
+ df.query('a in b and c < d')
+
+ # pure Python
+ df[df.a.isin(df.b) & (df.c < df.d)]
+
+
+.. note::
+
+ Note that ``in`` and ``not in`` are evaluated in Python, since ``numexpr``
+ has no equivalent of this operation. However, **only the** ``in``/``not in``
+ **expression itself** is evaluated in vanilla Python. For example, in the
+ expression
+
+ .. code-block:: python
+
+    df.query('a in b + c + d')
+
+ ``(b + c + d)`` is evaluated by ``numexpr`` and *then* the ``in``
+ operation is evaluated in plain Python. In general, any operations that can
+ be evaluated using ``numexpr`` will be.
+
+Special use of the ``==`` operator with ``list`` objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Comparing a ``list`` of values to a column using ``==``/``!=`` works similarly
+to ``in``/``not in``
+
+.. ipython:: python
+
+ df.query('b == ["a", "b", "c"]')
+
+ # pure Python
+ df[df.b.isin(["a", "b", "c"])]
+
+ df.query('c == [1, 2]')
+
+ df.query('c != [1, 2]')
+
+ # using in/not in
+ df.query('[1, 2] in c')
+
+ df.query('[1, 2] not in c')
+
+ # pure Python
+ df[df.c.isin([1, 2])]
+
+
+Boolean Operators
+~~~~~~~~~~~~~~~~~
+
+You can negate boolean expressions with the word ``not`` or the ``~`` operator.
+
+.. ipython:: python
+
+ df = DataFrame(rand(n, 3), columns=list('abc'))
+ df['bools'] = rand(len(df)) > 0.5
+ df.query('~bools')
+ df.query('not bools')
+ df.query('not bools') == df[~df.bools]
+
+Of course, expressions can be arbitrarily complex too
+
+.. ipython:: python
+
+ # short query syntax
+ shorter = df.query('a < b < c and (not bools) or bools > 2')
+
+ # equivalent in pure Python
+ longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)]
+
+ shorter
+ longer
+
+ shorter == longer
+
+.. ipython:: python
+ :suppress:
+
+ try:
+     d = old_d
+     del old_d
+ except NameError:
+     pass
+
+
+Performance of :meth:`~pandas.DataFrame.query`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for
+large frames
+
+.. image:: _static/query-perf.png
+
+.. note::
+
+ You will only see the performance benefits of using the ``numexpr`` engine
+ with ``DataFrame.query()`` if your frame has more than approximately 50,000
+ rows.
+
+ .. image:: _static/query-perf-small.png
+
+This plot was created using a ``DataFrame`` with 3 columns each containing
+floating point values generated using ``numpy.random.randn()``.
+
+.. ipython:: python
+ :suppress:
+
+ df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
+ df2 = df.copy()
+
Take Methods
-~~~~~~~~~~~~
+------------
.. _indexing.take:
@@ -740,7 +1050,7 @@ faster than fancy indexing.
timeit ser.take(indexer)
Duplicate Data
-~~~~~~~~~~~~~~
+--------------
.. _indexing.duplicate:
@@ -766,8 +1076,8 @@ should be taken instead.
.. _indexing.dictionarylike:
-Dictionary-like ``get`` method
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Dictionary-like :meth:`~pandas.DataFrame.get` method
+----------------------------------------------------
Each of Series, DataFrame, and Panel have a ``get`` method which can return a
default value.
@@ -865,8 +1175,8 @@ labels or even boolean vectors:
Slicing with labels is closely related to the ``truncate`` method which does
precisely ``.ix[start:stop]`` but returns a copy (for legacy reasons).
-The ``select`` method
-~~~~~~~~~~~~~~~~~~~~~
+The :meth:`~pandas.DataFrame.select` Method
+-------------------------------------------
Another way to extract slices from an object is with the ``select`` method of
Series, DataFrame, and Panel. This method should be used only when there is no
@@ -877,8 +1187,8 @@ more direct way. ``select`` takes a function which operates on labels along
df.select(lambda x: x == 'A', axis=1)
-The ``lookup`` method
-~~~~~~~~~~~~~~~~~~~~~
+The :meth:`~pandas.DataFrame.lookup` Method
+-------------------------------------------
Sometimes you want to extract a set of values given a sequence of row labels
and column labels, and the ``lookup`` method allows for this and returns a
@@ -890,7 +1200,7 @@ numpy array. For instance,
dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D'])
Setting values in mixed-type DataFrame
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------------------
.. _indexing.mixed_type_setting:
@@ -909,7 +1219,7 @@ scalar values, though setting arbitrary vectors is not yet supported:
.. _indexing.view_versus_copy:
Returning a view versus a copy
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------
The rules about when a view on the data is returned are entirely dependent on
NumPy. Whenever an array of labels or a boolean vector are involved in the
@@ -970,7 +1280,7 @@ When assigning values to subsets of your data, thus, make sure to either use the
pandas access methods or explicitly handle the assignment creating a copy.
Fallback indexing
-~~~~~~~~~~~~~~~~~~~~
+-----------------
.. _indexing.fallback:
@@ -1006,6 +1316,71 @@ convert to an integer index:
df_new[(df_new['index'] >= 1.0) & (df_new['index'] < 2)]
+.. _indexing.class:
+
+Index objects
+-------------
+
+The pandas :class:`~pandas.Index` class and its subclasses can be viewed as
+implementing an *ordered multiset*. Duplicates are allowed. However, if you try
+to convert an :class:`~pandas.Index` object with duplicate entries into a
+``set``, an exception will be raised.
+
+:class:`~pandas.Index` also provides the infrastructure necessary for
+lookups, data alignment, and reindexing. The easiest way to create an
+:class:`~pandas.Index` directly is to pass a ``list`` or other sequence to
+:class:`~pandas.Index`:
+
+.. ipython:: python
+
+ index = Index(['e', 'd', 'a', 'b'])
+ index
+ 'd' in index
+
+You can also pass a ``name`` to be stored in the index:
+
+
+.. ipython:: python
+
+ index = Index(['e', 'd', 'a', 'b'], name='something')
+ index.name
+
+Starting with pandas 0.5, the name, if set, will be shown in the console
+display:
+
+.. ipython:: python
+
+ index = Index(list(range(5)), name='rows')
+ columns = Index(['A', 'B', 'C'], name='cols')
+ df = DataFrame(np.random.randn(5, 3), index=index, columns=columns)
+ df
+ df['A']
+
+
+Set operations on Index objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. _indexing.set_ops:
+
+The three main operations are ``union (|)``, ``intersection (&)``, and ``diff
+(-)``. These can be directly called as instance methods or used via overloaded
+operators:
+
+.. ipython:: python
+
+ a = Index(['c', 'b', 'a'])
+ b = Index(['c', 'e', 'd'])
+ a.union(b)
+ a | b
+ a & b
+ a - b
+
+The ``isin`` method of Index objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+One additional operation is the ``isin`` method that works analogously to the
+``Series.isin`` method found :ref:`here <indexing.boolean>`.
+
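+A minimal sketch:
+
+.. code-block:: python
+
+   idx = Index(['a', 'b', 'c'])
+   idx.isin(['a', 'c', 'e'])  # -> array([ True, False,  True])
+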
.. _indexing.hierarchical:
Hierarchical indexing (MultiIndex)
@@ -1206,7 +1581,7 @@ mailing list.
.. _indexing.xs:
Cross-section with hierarchical index
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The ``xs`` method of ``DataFrame`` additionally takes a level argument to make
selecting data at a particular level of a MultiIndex easier.
@@ -1238,8 +1613,8 @@ instance:
print df2_aligned
-The need for sortedness
-~~~~~~~~~~~~~~~~~~~~~~~
+The need for sortedness with :class:`~pandas.MultiIndex`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**Caveat emptor**: the present implementation of ``MultiIndex`` requires that
the labels be sorted for some of the slicing / indexing routines to work
@@ -1311,8 +1686,8 @@ However:
...
KeyError: Key length (3) was greater than MultiIndex lexsort depth (2)
-Swapping levels with ``swaplevel``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Swapping levels with :meth:`~pandas.MultiIndex.swaplevel`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The ``swaplevel`` function can switch the order of two levels:
@@ -1323,8 +1698,8 @@ The ``swaplevel`` function can switch the order of two levels:
.. _indexing.reorderlevels:
-Reordering levels with ``reorder_levels``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Reordering levels with :meth:`~pandas.MultiIndex.reorder_levels`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The ``reorder_levels`` function generalizes the ``swaplevel`` function,
allowing you to permute the hierarchical index levels in one step:
@@ -1354,68 +1729,9 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the
constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
if you compute the levels and labels yourself, please be careful.
-.. _indexing.class:
-
-Index objects
--------------
-
-The pandas Index class and its subclasses can be viewed as implementing an
-*ordered set* in addition to providing the support infrastructure necessary for
-lookups, data alignment, and reindexing. The easiest way to create one directly
-is to pass a list or other sequence to ``Index``:
-
-.. ipython:: python
-
- index = Index(['e', 'd', 'a', 'b'])
- index
- 'd' in index
-
-You can also pass a ``name`` to be stored in the index:
-
-
-.. ipython:: python
-
- index = Index(['e', 'd', 'a', 'b'], name='something')
- index.name
-
-Starting with pandas 0.5, the name, if set, will be shown in the console
-display:
-
-.. ipython:: python
-
- index = Index(list(range(5)), name='rows')
- columns = Index(['A', 'B', 'C'], name='cols')
- df = DataFrame(np.random.randn(5, 3), index=index, columns=columns)
- df
- df['A']
-
-
-Set operations on Index objects
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. _indexing.set_ops:
-
-The three main operations are ``union (|)``, ``intersection (&)``, and ``diff
-(-)``. These can be directly called as instance methods or used via overloaded
-operators:
-
-.. ipython:: python
-
- a = Index(['c', 'b', 'a'])
- b = Index(['c', 'e', 'd'])
- a.union(b)
- a | b
- a & b
- a - b
-
-``isin`` method of Index objects
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-One additional operation is the ``isin`` method that works analogously to the
-``Series.isin`` method found :ref:`here `.
Setting index metadata (``name(s)``, ``levels``, ``labels``)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------------------------------------
.. _indexing.set_metadata:
@@ -1444,7 +1760,7 @@ add an index after you've already done so. There are a couple of different
ways.
Add an index using DataFrame columns
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------------
.. _indexing.set_index:
@@ -1487,7 +1803,7 @@ the index in-place (without creating a new object):
data
Remove / reset the index, ``reset_index``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------------------
As a convenience, there is a new function on DataFrame called ``reset_index``
which transfers the index values into the DataFrame's columns and sets a simple
@@ -1518,7 +1834,7 @@ discards the index, instead of putting index values in the DataFrame's columns.
deprecated.
Adding an ad hoc index
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
If you create an index yourself, you can just assign it to the ``index`` field:
@@ -1531,9 +1847,9 @@ Indexing internal details
.. note::
- The following is largely relevant for those actually working on the pandas
- codebase. And the source code is still the best place to look at the
- specifics of how things are implemented.
+ The following is largely relevant for those actually working on the pandas
+ codebase. The source code is still the best place to look at the specifics
+ of how things are implemented.
In pandas there are a few objects implemented which can serve as valid
containers for the axis labels:
@@ -1545,6 +1861,8 @@ containers for the axis labels:
- ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer
data, such as time stamps
- ``MultiIndex``: the standard hierarchical index object
+ - ``PeriodIndex``: An Index object with Period elements
+ - ``DatetimeIndex``: An Index object with Timestamp elements
- ``date_range``: fixed frequency date range generated from a time rule or
DateOffset. An ndarray of Python datetime objects
diff --git a/doc/source/io.rst b/doc/source/io.rst
index c29af29d2e63f..e30eb030afb88 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1962,7 +1962,7 @@ storing/selecting from homogeneous index DataFrames.
store.select('df_mi')
# the levels are automatically included as data columns
- store.select('df_mi', Term('foo=bar'))
+ store.select('df_mi', 'foo=bar')
.. _io.hdf5-query:
@@ -1970,49 +1970,102 @@ storing/selecting from homogeneous index DataFrames.
Querying a Table
~~~~~~~~~~~~~~~~
+.. warning::
+
+ The query capabilities have changed substantially starting in ``0.13.0``.
+ Queries from prior versions are accepted (with a ``DeprecationWarning``
+ printed) if they are not string-like.
+
``select`` and ``delete`` operations have an optional criterion that can
be specified to select/delete only a subset of the data. This allows one
to have a very large on-disk table and retrieve only a portion of the
data.
-A query is specified using the ``Term`` class under the hood.
+A query is specified using the ``Term`` class under the hood, as a boolean expression.
- - 'index' and 'columns' are supported indexers of a DataFrame
- - 'major_axis', 'minor_axis', and 'items' are supported indexers of
+ - ``index`` and ``columns`` are supported indexers of a DataFrame
+ - ``major_axis``, ``minor_axis``, and ``items`` are supported indexers of
the Panel
+ - if ``data_columns`` are specified, these can be used as additional indexers
+
+Valid comparison operators are:
+
+ - ``=, ==, !=, >, >=, <, <=``
+
+Valid boolean expressions are combined with:
+
+ - ``|`` : or
+ - ``&`` : and
+ - ``(`` and ``)`` : for grouping
+
+These rules are similar to how boolean expressions are used in pandas for indexing.
+
+.. note::
+
+ - ``=`` will be automatically expanded to the comparison operator ``==``
+ - ``~`` is the not operator, but can only be used in very limited
+ circumstances
+ - If a list/tuple of expressions is passed they will be combined via ``&``
+
+The following are valid expressions:
+
+ - ``'index>=date'``
+ - ``"columns=['A', 'D']"``
+ - ``"columns in ['A', 'D']"``
+ - ``'columns=A'``
+ - ``'columns==A'``
+ - ``"~(columns=['A','B'])"``
+ - ``'index>df.index[3] & string="bar"'``
+ - ``'(index>df.index[3] & index<=df.index[6]) | string="bar"'``
+ - ``"ts>=Timestamp('2012-02-01')"``
+ - ``"major_axis>=20130101"``
+
+The ``indexers`` are on the left-hand side of the sub-expression:
-Valid terms can be created from ``dict, list, tuple, or
-string``. Objects can be embeded as values. Allowed operations are: ``<,
-<=, >, >=, =, !=``. ``=`` will be inferred as an implicit set operation
-(e.g. if 2 or more values are provided). The following are all valid
-terms.
+ - ``columns``, ``major_axis``, ``ts``
- - ``dict(field = 'index', op = '>', value = '20121114')``
- - ``('index', '>', '20121114')``
- - ``'index > 20121114'``
- - ``('index', '>', datetime(2012, 11, 14))``
- - ``('index', ['20121114', '20121115'])``
- - ``('major_axis', '=', Timestamp('2012/11/14'))``
- - ``('minor_axis', ['A', 'B'])``
+The right-hand side of the sub-expression (after a comparison operator) can be:
-Queries are built up using a list of ``Terms`` (currently only
-**anding** of terms is supported). An example query for a panel might be
-specified as follows. ``['major_axis>20000102', ('minor_axis', '=',
-['A', 'B']) ]``. This is roughly translated to: `major_axis must be
-greater than the date 20000102 and the minor_axis must be A or B`
+ - functions that will be evaluated, e.g. ``Timestamp('2012-02-01')``
+ - strings, e.g. ``"bar"``
+ - date-like, e.g. ``20130101``, or ``"20130101"``
+ - lists, e.g. ``"['A','B']"``
+ - variables that are defined in the local namespace, e.g. ``date``
+
+Here are some examples:
+
+.. ipython:: python
+
+ dfq = DataFrame(randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10))
+ store.append('dfq',dfq,format='table',data_columns=True)
+
+Use boolean expressions, with in-line function evaluation.
+
+.. ipython:: python
+
+ store.select('dfq',"index>Timestamp('20130104') & columns=['A', 'B']")
+
+Use an inline column reference
+
+.. ipython:: python
+
+ store.select('dfq',where="A>0 or C>0")
+
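+A variable defined in the local namespace can also be referenced directly (a
+minimal sketch; the name ``cutoff`` is illustrative):
+
+.. code-block:: python
+
+   cutoff = Timestamp('20130104')
+   store.select('dfq', 'index > cutoff')
+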
+Works with a Panel as well.
.. ipython:: python
store.append('wp',wp)
store
- store.select('wp', [ Term('major_axis>20000102'), Term('minor_axis', '=', ['A', 'B']) ])
+ store.select('wp', "major_axis>Timestamp('20000102') & minor_axis=['A', 'B']")
-The ``columns`` keyword can be supplied to select a list of columns to be returned,
-this is equivalent to passing a ``Term('columns', list_of_columns_to_filter)``:
+The ``columns`` keyword can be supplied to select a list of columns to be
+returned, this is equivalent to passing a
+``'columns=list_of_columns_to_filter'``:
.. ipython:: python
- store.select('df', columns=['A', 'B'])
+ store.select('df', "columns=['A', 'B']")
``start`` and ``stop`` parameters can be specified to limit the total search
space. These are in terms of the total number of rows in a table.
@@ -2023,10 +2076,18 @@ space. These are in terms of the total number of rows in a table.
wp.to_frame()
# limiting the search
- store.select('wp',[ Term('major_axis>20000102'),
- Term('minor_axis', '=', ['A','B']) ],
+ store.select('wp',"major_axis>20000102 & minor_axis=['A','B']",
start=0, stop=10)
+.. note::
+
+ ``select`` will raise a ``ValueError`` if the query expression has an unknown
+ variable reference. Usually this means that you are trying to select on a column
+ that is **not** a data_column.
+
+ ``select`` will raise a ``SyntaxError`` if the query expression is not valid.
+
+
.. _io.hdf5-timedelta:
**Using timedelta64[ns]**
@@ -2048,7 +2109,7 @@ specified in the format: ``<float>(<unit>)``, where float may be signed (and fra
dftd['C'] = dftd['A']-dftd['B']
dftd
store.append('dftd',dftd,data_columns=True)
- store.select('dftd',Term("C","<","-3.5D"))
+ store.select('dftd',"C<'-3.5D'")
Indexing
~~~~~~~~
@@ -2057,10 +2118,13 @@ You can create/modify an index for a table with ``create_table_index``
after data is already in the table (after an ``append/put``
operation). Creating a table index is **highly** encouraged. This will
speed your queries a great deal when you use a ``select`` with the
-indexed dimension as the ``where``. **Indexes are automagically created
-(starting 0.10.1)** on the indexables and any data columns you
-specify. This behavior can be turned off by passing ``index=False`` to
-``append``.
+indexed dimension as the ``where``.
+
+.. note::
+
+ Indexes are automagically created (starting ``0.10.1``) on the indexables
+ and any data columns you specify. This behavior can be turned off by passing
+ ``index=False`` to ``append``.
.. ipython:: python
@@ -2117,7 +2181,7 @@ create a new table!)
Iterator
~~~~~~~~
-Starting in 0.11, you can pass, ``iterator=True`` or ``chunksize=number_in_a_chunk``
+Starting in ``0.11.0``, you can pass ``iterator=True`` or ``chunksize=number_in_a_chunk``
to ``select`` and ``select_as_multiple`` to return an iterator on the results.
The default is 50,000 rows returned in a chunk.
@@ -2151,7 +2215,7 @@ Advanced Queries
To retrieve a single indexable or data column, use the
method ``select_column``. This will, for example, enable you to get the index
very quickly. These return a ``Series`` of the result, indexed by the row number.
-These do not currently accept the ``where`` selector (coming soon)
+These do not currently accept the ``where`` selector.
.. ipython:: python
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 0ed1f39d72cb5..b8a817a00403c 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -294,7 +294,15 @@ See :ref:`Internal Refactoring`
Experimental Features
~~~~~~~~~~~~~~~~~~~~~
-.. _release:bug_fixes-0.13.0:
+- The new :func:`~pandas.eval` function implements expression evaluation using
+ ``numexpr`` behind the scenes. This results in large speedups for complicated
+ expressions involving large DataFrames/Series.
+- :class:`~pandas.DataFrame` has a new :meth:`~pandas.DataFrame.eval` method
+ that evaluates an expression in the context of the ``DataFrame``.
+- A :meth:`~pandas.DataFrame.query` method has been added that allows
+ you to select elements of a ``DataFrame`` using a natural query syntax nearly
+ identical to Python syntax.
+
Bug Fixes
~~~~~~~~~
diff --git a/doc/source/v0.10.0.txt b/doc/source/v0.10.0.txt
index d0c0ecc148239..0c86add1225ad 100644
--- a/doc/source/v0.10.0.txt
+++ b/doc/source/v0.10.0.txt
@@ -262,7 +262,7 @@ Updated PyTables Support
[ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ])
# removing data from tables
- store.remove('wp', [ 'major_axis', '>', wp.major_axis[3] ])
+ store.remove('wp', Term('major_axis>20000103'))
store.select('wp')
# deleting a store
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
index c56af23e85eae..694281b813c3b 100644
--- a/doc/source/v0.13.0.txt
+++ b/doc/source/v0.13.0.txt
@@ -187,6 +187,96 @@ Indexing API Changes
p
p.loc[:,:,'C']
+HDFStore API Changes
+~~~~~~~~~~~~~~~~~~~~
+
+ - Query Format Changes. A much more string-like query format is now supported.
+
+ .. ipython:: python
+
+    path = 'test_query.h5'
+    dfq = DataFrame(randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10))
+    dfq.to_hdf(path,'dfq',format='table',data_columns=True)
+
+ Use boolean expressions, with in-line function evaluation.
+
+ .. ipython:: python
+
+    read_hdf(path,'dfq',where="index>Timestamp('20130104') & columns=['A', 'B']")
+
+ Use an inline column reference
+
+ .. ipython:: python
+
+    read_hdf(path,'dfq',where="A>0 or C>0")
+
+ See :ref:`the docs <io.hdf5-query>`.
+
+ - Significant table writing performance improvements
+ - handle a passed ``Series`` in table format (:issue:`4330`)
+ - added an ``is_open`` property to indicate if the underlying file handle is open;
+ a closed store will now report 'CLOSED' when viewing the store (rather than raising an error)
+ (:issue:`4409`)
+ - closing a ``HDFStore`` now will close that instance of the ``HDFStore``
+ but will only close the actual file if the reference count (by ``PyTables``) w.r.t. all of the open handles
+ is 0. Essentially you have a local instance of ``HDFStore`` referenced by a variable. Once you
+ close it, it will report closed. Other references (to the same file) will continue to operate
+ until they themselves are closed. Performing an action on a closed file will raise
+ ``ClosedFileError``
+
+ .. ipython:: python
+
+    path = 'test.h5'
+    df = DataFrame(randn(10,2))
+    store1 = HDFStore(path)
+    store2 = HDFStore(path)
+    store1.append('df',df)
+    store2.append('df2',df)
+
+    store1
+    store2
+    store1.close()
+    store2
+    store2.close()
+    store2
+
+ .. ipython:: python
+    :suppress:
+
+    import os
+    os.remove(path)
+
+ - removed the ``_quiet`` attribute, replaced by a ``DuplicateWarning`` if retrieving
+ duplicate rows from a table (:issue:`4367`)
+ - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
+ be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
+ - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`).
+ See :ref:`here` for an example.
+
+ - the ``format`` keyword now replaces the ``table`` keyword; allowed values are ``fixed(f)`` or ``table(t)``,
+ with the same defaults as prior to 0.13.0, e.g. ``put`` implies ``'fixed'`` or ``'f'`` (Fixed) format
+ and ``append`` implies ``'table'`` or ``'t'`` (Table) format
+
+ .. ipython:: python
+
+    path = 'test.h5'
+    df = DataFrame(randn(10,2))
+    df.to_hdf(path,'df_table',format='table')
+    df.to_hdf(path,'df_table2',append=True)
+    df.to_hdf(path,'df_fixed')
+    with get_store(path) as store:
+        print store
+
+ .. ipython:: python
+    :suppress:
+
+    import os
+    os.remove('test.h5')
+    os.remove('test_query.h5')
+
+ - added the keyword ``dropna=True`` to ``append`` to control whether ALL nan rows
+ are written to the store (the default is ``True``: ALL nan rows are NOT written),
+ also settable via the option ``io.hdf.dropna_table`` (:issue:`4625`)
+
Enhancements
~~~~~~~~~~~~
@@ -271,6 +361,90 @@ Enhancements
is evaluated, respectively. See scipy docs.
- DataFrame constructor now accepts a numpy masked record array (:issue:`3478`)
+
+.. _whatsnew_0130.experimental:
+
+Experimental
+~~~~~~~~~~~~
+
+- :func:`~pandas.eval`:
+
+ - The new :func:`~pandas.eval` function implements expression evaluation using
+ ``numexpr`` behind the scenes. This results in large speedups for
+ complicated expressions involving large DataFrames/Series. For example,
+
+ .. ipython:: python
+
+    nrows, ncols = 20000, 100
+    df1, df2, df3, df4 = [DataFrame(randn(nrows, ncols))
+                          for _ in xrange(4)]
+
+ .. ipython:: python
+
+    %timeit pd.eval('df1 + df2 + df3 + df4')
+
+ For more details, see the :ref:`enhancing performance documentation on eval
+ <enhancingperf.eval>`
+
+- :meth:`~pandas.DataFrame.eval`
+
+ - Similar to :func:`~pandas.eval`, :class:`~pandas.DataFrame` has a new
+ :meth:`~pandas.DataFrame.eval` method that evaluates an expression in the context
+ of the ``DataFrame``. For example,
+
+ .. ipython:: python
+ :suppress:
+
+ try:
+ del a
+ except NameError:
+ pass
+
+ try:
+ del b
+ except NameError:
+ pass
+
+ .. ipython:: python
+
+ df = DataFrame(randn(10, 2), columns=['a', 'b'])
+ df.eval('a + b')
+
+
+- :meth:`~pandas.DataFrame.query`
+
+ - In 0.13 a :meth:`~pandas.DataFrame.query` method has been added that allows
+ you to select elements of a ``DataFrame`` using a natural query syntax
+ nearly identical to Python syntax. For example,
+
+ .. ipython:: python
+ :suppress:
+
+ try:
+ del a
+ except NameError:
+ pass
+
+ try:
+ del b
+ except NameError:
+ pass
+
+ try:
+ del c
+ except NameError:
+ pass
+
+ .. ipython:: python
+
+ n = 20
+ df = DataFrame(randint(n, size=(n, 3)), columns=['a', 'b', 'c'])
+ df.query('a < b < c')
+
+ selects all the rows of ``df`` where ``a < b < c`` evaluates to ``True``.
+   For more details see the :ref:`indexing documentation on query
+   <indexing.query>`.
+
.. _whatsnew_0130.refactoring:
Internal Refactoring
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 03681d3fa5a3f..c4c012d6c5095 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -42,6 +42,7 @@
from pandas.stats.api import *
from pandas.tseries.api import *
from pandas.io.api import *
+from pandas.computation.api import *
from pandas.util.testing import debug
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 12c929cd59820..10e1464739203 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -46,11 +46,13 @@
from StringIO import StringIO
BytesIO = StringIO
import cPickle
+ import httplib
except ImportError:
import builtins
from io import StringIO, BytesIO
cStringIO = StringIO
import pickle as cPickle
+ import http.client as httplib
if PY3:
diff --git a/pandas/computation/__init__.py b/pandas/computation/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/computation/align.py b/pandas/computation/align.py
new file mode 100644
index 0000000000000..60975bdc8a5b4
--- /dev/null
+++ b/pandas/computation/align.py
@@ -0,0 +1,247 @@
+"""Core eval alignment algorithms
+"""
+
+import warnings
+from functools import partial, wraps
+from pandas.compat import zip, range
+
+import numpy as np
+
+import pandas as pd
+from pandas import compat
+import pandas.core.common as com
+from pandas.computation.ops import Constant  # used by _filter_terms below
+
+
+def _align_core_single_unary_op(term):
+ if isinstance(term.value, np.ndarray):
+ typ = partial(np.asanyarray, dtype=term.value.dtype)
+ else:
+ typ = type(term.value)
+ ret = typ,
+
+ if not hasattr(term.value, 'axes'):
+ ret += None,
+ else:
+ ret += _zip_axes_from_type(typ, term.value.axes),
+ return ret
+
+
+def _zip_axes_from_type(typ, new_axes):
+ axes = {}
+ for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES):
+ axes[ax_name] = new_axes[ax_ind]
+ return axes
+
+
+def _maybe_promote_shape(values, naxes):
+ # test to see if we have an array else leave since must be a number
+ if not isinstance(values, np.ndarray):
+ return values
+
+ ndims = values.ndim
+ if ndims > naxes:
+ raise AssertionError('cannot have more dims than axes, '
+ '{0} > {1}'.format(ndims, naxes))
+ if ndims == naxes:
+ return values
+
+ ndim, nax = range(ndims), range(naxes)
+
+ axes_slice = [slice(None)] * naxes
+
+ # set difference of numaxes and ndims
+ slices = list(set(nax) - set(ndim))
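+    # e.g. promoting values of shape (n,) to naxes=2 yields shape (n, 1)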
+
+    # at this point ndims < naxes (the equal case returned above), so there
+    # must be new axes to add
+    if not slices:
+        raise AssertionError('slices should NOT be empty if ndim != naxes '
+                             '{0}'.format(slices))
+
+ for sl in slices:
+ axes_slice[sl] = np.newaxis
+
+ return values[tuple(axes_slice)]
+
+
+def _any_pandas_objects(terms):
+ """Check a sequence of terms for instances of PandasObject."""
+ return any(isinstance(term.value, pd.core.generic.PandasObject)
+ for term in terms)
+
+
+def _filter_special_cases(f):
+ @wraps(f)
+ def wrapper(terms):
+ # single unary operand
+ if len(terms) == 1:
+ return _align_core_single_unary_op(terms[0])
+
+ term_values = (term.value for term in terms)
+ # only scalars or indexes
+ if all(isinstance(term.value, pd.Index) or term.isscalar for term in
+ terms):
+ return np.result_type(*term_values), None
+
+ # single element ndarrays
+ all_has_size = all(hasattr(term.value, 'size') for term in terms)
+ if all_has_size and all(term.value.size == 1 for term in terms):
+ return np.result_type(*term_values), None
+
+ # no pandas objects
+ if not _any_pandas_objects(terms):
+ return np.result_type(*term_values), None
+
+ return f(terms)
+ return wrapper
+
+
+@_filter_special_cases
+def _align_core(terms):
+ term_index = [i for i, term in enumerate(terms) if hasattr(term.value,
+ 'axes')]
+ term_dims = [terms[i].value.ndim for i in term_index]
+ ndims = pd.Series(dict(zip(term_index, term_dims)))
+
+ # initial axes are the axes of the largest-axis'd term
+ biggest = terms[ndims.idxmax()].value
+ typ = biggest._constructor
+ axes = biggest.axes
+ naxes = len(axes)
+
+ for term in (terms[i] for i in term_index):
+ for axis, items in enumerate(term.value.axes):
+ if isinstance(term.value, pd.Series) and naxes > 1:
+ ax, itm = naxes - 1, term.value.index
+ else:
+ ax, itm = axis, items
+ axes[ax] = axes[ax].join(itm, how='outer')
+
+ for i, ndim in compat.iteritems(ndims):
+ for axis, items in zip(range(ndim), axes):
+ ti = terms[i].value
+
+ if hasattr(ti, 'reindex_axis'):
+ transpose = isinstance(ti, pd.Series) and naxes > 1
+ reindexer = axes[naxes - 1] if transpose else items
+
+ term_axis_size = len(ti.axes[axis])
+ reindexer_size = len(reindexer)
+
+                ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
+ if ordm >= 1 and reindexer_size >= 10000:
+ warnings.warn("Alignment difference on axis {0} is larger"
+ " than an order of magnitude on term {1!r}, "
+ "by more than {2:.4g}; performance may suffer"
+ "".format(axis, term.name, ordm),
+ category=pd.io.common.PerformanceWarning)
+
+ if transpose:
+ f = partial(ti.reindex, index=reindexer, copy=False)
+ else:
+ f = partial(ti.reindex_axis, reindexer, axis=axis,
+ copy=False)
+
+ if pd.lib.is_bool_array(ti.values):
+ r = f(fill_value=True)
+ else:
+ r = f()
+
+ terms[i].update(r)
+
+ res = _maybe_promote_shape(terms[i].value.T if transpose else
+ terms[i].value, naxes)
+ res = res.T if transpose else res
+
+ try:
+ v = res.values
+ except AttributeError:
+ v = res
+ terms[i].update(v)
+
+ return typ, _zip_axes_from_type(typ, axes)
+
+
+def _filter_terms(flat):
+ # numeric literals
+ literals = frozenset(filter(lambda x: isinstance(x, Constant), flat))
+
+ # these are strings which are variable names
+ names = frozenset(flat) - literals
+
+ # literals are not names and names are not literals, so intersection should
+ # be empty
+ if literals & names:
+ raise ValueError('literals cannot be names and names cannot be '
+ 'literals')
+ return names, literals
+
+
+def _align(terms):
+ """Align a set of terms"""
+ try:
+ # flatten the parse tree (a nested list, really)
+ terms = list(com.flatten(terms))
+ except TypeError:
+ # can't iterate so it must just be a constant or single variable
+ if isinstance(terms.value, pd.core.generic.NDFrame):
+ typ = type(terms.value)
+ return typ, _zip_axes_from_type(typ, terms.value.axes)
+ return np.result_type(terms.type), None
+
+ # if all resolved variables are numeric scalars
+ if all(term.isscalar for term in terms):
+ return np.result_type(*(term.value for term in terms)).type, None
+
+ # perform the main alignment
+ typ, axes = _align_core(terms)
+ return typ, axes
+
+
+def _reconstruct_object(typ, obj, axes, dtype):
+ """Reconstruct an object given its type, raw value, and possibly empty
+ (None) axes.
+
+ Parameters
+ ----------
+ typ : object
+ A type
+ obj : object
+ The value to use in the type constructor
+    axes : dict
+        The axes to use to construct the resulting pandas object
+    dtype : numpy.dtype or type
+        The dtype used when reconstructing the resulting object
+ Returns
+ -------
+ ret : typ
+ An object of type ``typ`` with the value `obj` and possible axes
+ `axes`.
+ """
+ try:
+ typ = typ.type
+ except AttributeError:
+ pass
+
+ try:
+ res_t = np.result_type(obj.dtype, dtype)
+ except AttributeError:
+ res_t = dtype
+
+ if (not isinstance(typ, partial) and
+ issubclass(typ, pd.core.generic.PandasObject)):
+ return typ(obj, dtype=res_t, **axes)
+
+ # special case for pathological things like ~True/~False
+ if hasattr(res_t, 'type') and typ == np.bool_ and res_t != np.bool_:
+ ret_value = res_t.type(obj)
+ else:
+ ret_value = typ(obj).astype(res_t)
+
+ try:
+ ret = ret_value.item()
+ except ValueError:
+ ret = ret_value
+ return ret
diff --git a/pandas/computation/api.py b/pandas/computation/api.py
new file mode 100644
index 0000000000000..db8269a497768
--- /dev/null
+++ b/pandas/computation/api.py
@@ -0,0 +1,2 @@
+from pandas.computation.eval import eval
+from pandas.computation.expr import Expr
diff --git a/pandas/computation/common.py b/pandas/computation/common.py
new file mode 100644
index 0000000000000..9af2197a4fd69
--- /dev/null
+++ b/pandas/computation/common.py
@@ -0,0 +1,13 @@
+import numpy as np
+import pandas as pd
+
+
+def _ensure_decoded(s):
+ """ if we have bytes, decode them to unicode """
+ if isinstance(s, (np.bytes_, bytes)):
+ s = s.decode(pd.get_option('display.encoding'))
+ return s
+
+
+class NameResolutionError(NameError):
+ pass
diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py
new file mode 100644
index 0000000000000..88efc9eeab5d5
--- /dev/null
+++ b/pandas/computation/engines.py
@@ -0,0 +1,125 @@
+"""Engine classes for :func:`~pandas.eval`
+"""
+
+import abc
+
+from pandas import compat
+from pandas.core import common as com
+from pandas.computation.align import _align, _reconstruct_object
+from pandas.computation.ops import UndefinedVariableError
+
+
+class AbstractEngine(object):
+ """Object serving as a base class for all engines."""
+
+ __metaclass__ = abc.ABCMeta
+
+ has_neg_frac = False
+
+ def __init__(self, expr):
+ self.expr = expr
+ self.aligned_axes = None
+ self.result_type = None
+
+ def convert(self):
+ """Convert an expression for evaluation.
+
+ Defaults to return the expression as a string.
+ """
+ return com.pprint_thing(self.expr)
+
+ def pre_evaluate(self):
+ self.expr.check_name_clashes()
+
+ def evaluate(self):
+ """Run the engine on the expression
+
+ This method performs alignment which is necessary no matter what engine
+ is being used, thus its implementation is in the base class.
+
+ Returns
+ -------
+ obj : object
+ The result of the passed expression.
+ """
+ if not self._is_aligned:
+ self.result_type, self.aligned_axes = _align(self.expr.terms)
+
+ # make sure no names in resolvers and locals/globals clash
+ self.pre_evaluate()
+ res = self._evaluate()
+ return _reconstruct_object(self.result_type, res, self.aligned_axes,
+ self.expr.terms.return_type)
+
+ @property
+ def _is_aligned(self):
+ return self.aligned_axes is not None and self.result_type is not None
+
+ @abc.abstractmethod
+ def _evaluate(self):
+        """Return an evaluated expression. The environment used for
+        evaluation is taken from ``self.expr.env``.
+
+        Notes
+        -----
+        Must be implemented by subclasses.
+        """
+ pass
+
+
+class NumExprEngine(AbstractEngine):
+ """NumExpr engine class"""
+ has_neg_frac = True
+
+ def __init__(self, expr):
+ super(NumExprEngine, self).__init__(expr)
+
+ def convert(self):
+ return str(super(NumExprEngine, self).convert())
+
+ def _evaluate(self):
+ import numexpr as ne
+
+ # add the resolvers to locals
+ self.expr.add_resolvers_to_locals()
+
+ # convert the expression to a valid numexpr expression
+ s = self.convert()
+
+ try:
+ return ne.evaluate(s, local_dict=self.expr.env.locals,
+ global_dict=self.expr.env.globals,
+ truediv=self.expr.truediv)
+ except KeyError as e:
+ # python 3 compat kludge
+ try:
+ msg = e.message
+ except AttributeError:
+ msg = compat.text_type(e)
+ raise UndefinedVariableError(msg)
+
+
+class PythonEngine(AbstractEngine):
+ """Evaluate an expression in Python space.
+
+ Mostly for testing purposes.
+ """
+ has_neg_frac = False
+
+ def __init__(self, expr):
+ super(PythonEngine, self).__init__(expr)
+
+ def evaluate(self):
+ self.pre_evaluate()
+ return self.expr()
+
+ def _evaluate(self):
+ pass
+
+
+_engines = {'numexpr': NumExprEngine, 'python': PythonEngine}
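+
+# a sketch of how pandas.computation.eval.eval() uses this mapping:
+#     eng_inst = _engines[engine](parsed_expr)
+#     result = eng_inst.evaluate()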
diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py
new file mode 100644
index 0000000000000..36b1e2bc96090
--- /dev/null
+++ b/pandas/computation/eval.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python
+
+"""Top level ``eval`` module.
+"""
+
+import numbers
+import numpy as np
+
+from pandas.core import common as com
+from pandas.compat import string_types
+from pandas.computation.expr import Expr, _parsers, _ensure_scope
+from pandas.computation.engines import _engines
+
+
+def _check_engine(engine):
+ """Make sure a valid engine is passed.
+
+ Parameters
+ ----------
+ engine : str
+
+ Raises
+ ------
+ KeyError
+ * If an invalid engine is passed
+ ImportError
+ * If numexpr was requested but doesn't exist
+ """
+ if engine not in _engines:
+ raise KeyError('Invalid engine {0!r} passed, valid engines are'
+ ' {1}'.format(engine, list(_engines.keys())))
+
+ # TODO: validate this in a more general way (thinking of future engines
+ # that won't necessarily be import-able)
+ # Could potentially be done on engine instantiation
+ if engine == 'numexpr':
+ try:
+ import numexpr
+ except ImportError:
+ raise ImportError("'numexpr' not found. Cannot use "
+ "engine='numexpr' if 'numexpr' is not installed")
+
+
+def _check_parser(parser):
+ """Make sure a valid parser is passed.
+
+ Parameters
+ ----------
+ parser : str
+
+ Raises
+ ------
+ KeyError
+ * If an invalid parser is passed
+ """
+ if parser not in _parsers:
+ raise KeyError('Invalid parser {0!r} passed, valid parsers are'
+ ' {1}'.format(parser, _parsers.keys()))
+
+
+def _check_resolvers(resolvers):
+ if resolvers is not None:
+ for resolver in resolvers:
+ if not hasattr(resolver, '__getitem__'):
+ name = type(resolver).__name__
+ raise AttributeError('Resolver of type {0!r} must implement '
+ 'the __getitem__ method'.format(name))
+
+
+def _check_expression(expr):
+ """Make sure an expression is not an empty string
+
+ Parameters
+ ----------
+ expr : object
+ An object that can be converted to a string
+
+ Raises
+ ------
+ ValueError
+ * If expr is an empty string
+ """
+ if not expr:
+ raise ValueError("expr cannot be an empty string")
+
+
+def _convert_expression(expr):
+ """Convert an object to an expression.
+
+    This function converts an object to an expression (a unicode string) and
+ checks to make sure it isn't empty after conversion. This is used to
+ convert operators to their string representation for recursive calls to
+ :func:`~pandas.eval`.
+
+ Parameters
+ ----------
+ expr : object
+ The object to be converted to a string.
+
+ Returns
+ -------
+ s : unicode
+ The string representation of an object.
+
+ Raises
+ ------
+ ValueError
+ * If the expression is empty.
+ """
+ s = com.pprint_thing(expr)
+ _check_expression(s)
+ return s
+
+
+def eval(expr, parser='pandas', engine='numexpr', truediv=True,
+ local_dict=None, global_dict=None, resolvers=None, level=2):
+ """Evaluate a Python expression as a string using various backends.
+
+ The following arithmetic operations are supported: ``+``, ``-``, ``*``,
+ ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
+ boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
+ Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
+ :keyword:`or`, and :keyword:`not` with the same semantics as the
+ corresponding bitwise operators. :class:`~pandas.Series` and
+ :class:`~pandas.DataFrame` objects are supported and behave as they would
+ with plain ol' Python evaluation.
+
+ Parameters
+ ----------
+ expr : str or unicode
+        The expression to evaluate. This string cannot contain any Python
+        `statements
+        <http://docs.python.org/2/reference/simple_stmts.html>`__,
+        only Python `expressions
+        <http://docs.python.org/2/reference/expressions.html>`__.
+ parser : string, default 'pandas', {'pandas', 'python'}
+ The parser to use to construct the syntax tree from the expression. The
+        default of ``'pandas'`` parses code slightly differently from standard
+ Python. Alternatively, you can parse an expression using the
+ ``'python'`` parser to retain strict Python semantics. See the
+        :ref:`enhancing performance <enhancingperf.eval>` documentation for
+ more details.
+ engine : string, default 'numexpr', {'python', 'numexpr'}
+
+ The engine used to evaluate the expression. Supported engines are
+
+ - ``'numexpr'``: This default engine evaluates pandas objects using
+ numexpr for large speed ups in complex expressions
+ with large frames.
+ - ``'python'``: Performs operations as if you had ``eval``'d in top
+ level python. This engine is generally not that useful.
+
+ More backends may be available in the future.
+
+ truediv : bool, optional
+ Whether to use true division, like in Python >= 3
+ local_dict : dict or None, optional
+ A dictionary of local variables, taken from locals() by default.
+ global_dict : dict or None, optional
+ A dictionary of global variables, taken from globals() by default.
+ resolvers : list of dict-like or None, optional
+ A list of objects implementing the ``__getitem__`` special method that
+ you can use to inject an additional collection of namespaces to use for
+ variable lookup. For example, this is used in the
+ :meth:`~pandas.DataFrame.query` method to inject the
+ :attr:`~pandas.DataFrame.index` and :attr:`~pandas.DataFrame.columns`
+ variables that refer to their respective :class:`~pandas.DataFrame`
+ instance attributes.
+ level : int, optional
+ The number of prior stack frames to traverse and add to the current
+ scope. Most users will **not** need to change this parameter.
+
+ Returns
+ -------
+ ndarray, numeric scalar, DataFrame, Series
+
+ Notes
+ -----
+    The ``dtype`` of any objects involved in an arithmetic ``%`` operation is
+    recursively cast to ``float64``.
+
+    See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
+ more details.
+
+ See Also
+ --------
+ pandas.DataFrame.query
+ pandas.DataFrame.eval
+ """
+ expr = _convert_expression(expr)
+ _check_engine(engine)
+ _check_parser(parser)
+ _check_resolvers(resolvers)
+
+ # get our (possibly passed-in) scope
+ env = _ensure_scope(global_dict=global_dict, local_dict=local_dict,
+ resolvers=resolvers, level=level)
+
+ parsed_expr = Expr(expr, engine=engine, parser=parser, env=env,
+ truediv=truediv)
+
+ # construct the engine and evaluate the parsed expression
+ eng = _engines[engine]
+ eng_inst = eng(parsed_expr)
+ ret = eng_inst.evaluate()
+ return ret
diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py
new file mode 100644
index 0000000000000..ff9adc26b8201
--- /dev/null
+++ b/pandas/computation/expr.py
@@ -0,0 +1,763 @@
+""":func:`~pandas.eval` parsers
+"""
+
+import ast
+import operator
+import sys
+import inspect
+import tokenize
+import datetime
+import struct
+
+from functools import partial
+
+import pandas as pd
+from pandas import compat
+from pandas.compat import StringIO, zip, reduce, string_types
+from pandas.core.base import StringMixin
+from pandas.core import common as com
+from pandas.computation.common import NameResolutionError
+from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms,
+ _arith_ops_syms, _unary_ops_syms, is_term)
+from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG
+from pandas.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div
+
+
+def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None,
+ **kwargs):
+ """Ensure that we are grabbing the correct scope."""
+ return Scope(gbls=global_dict, lcls=local_dict, level=level,
+ resolvers=resolvers)
+
+
+def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys):
+ """Make sure that variables in resolvers don't overlap with locals or
+ globals.
+ """
+ res_locals = list(com.intersection(resolver_keys, local_keys))
+ if res_locals:
+ msg = "resolvers and locals overlap on names {0}".format(res_locals)
+ raise NameResolutionError(msg)
+
+ res_globals = list(com.intersection(resolver_keys, global_keys))
+ if res_globals:
+ msg = "resolvers and globals overlap on names {0}".format(res_globals)
+ raise NameResolutionError(msg)
+
+
+def _replacer(x, pad_size):
+ """Replace a number with its padded hexadecimal representation. Used to tag
+ temporary variables with their calling scope's id.
+ """
+ # get the hex repr of the binary char and remove 0x and pad by pad_size
+ # zeros
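+    # e.g. _replacer(255, 2) -> 'ff' and _replacer(7, 2) -> '07'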
+ try:
+ hexin = ord(x)
+ except TypeError:
+ # bytes literals masquerade as ints when iterating in py3
+ hexin = x
+
+ return hex(hexin).replace('0x', '').rjust(pad_size, '0')
+
+
+def _raw_hex_id(obj, pad_size=2):
+ """Return the padded hexadecimal id of ``obj``."""
+ # interpret as a pointer since that's what really what id returns
+ packed = struct.pack('@P', id(obj))
+
+ return ''.join(_replacer(x, pad_size) for x in packed)
+
+
+class Scope(StringMixin):
+ """Object to hold scope, with a few bells to deal with some custom syntax
+ added by pandas.
+
+ Parameters
+ ----------
+ gbls : dict or None, optional, default None
+ lcls : dict or Scope or None, optional, default None
+ level : int, optional, default 1
+ resolvers : list-like or None, optional, default None
+
+ Attributes
+ ----------
+ globals : dict
+ locals : dict
+ level : int
+ resolvers : tuple
+ resolver_keys : frozenset
+ """
+ __slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers',
+ 'resolver_keys', '_resolver', 'level', 'ntemps')
+
+ def __init__(self, gbls=None, lcls=None, level=1, resolvers=None):
+ self.level = level
+ self.resolvers = tuple(resolvers or [])
+ self.globals = dict()
+ self.locals = dict()
+ self.ntemps = 1 # number of temporary variables in this scope
+
+ if isinstance(lcls, Scope):
+ ld, lcls = lcls, dict()
+ self.locals.update(ld.locals.copy())
+ self.globals.update(ld.globals.copy())
+ self.resolvers += ld.resolvers
+ self.update(ld.level)
+
+ frame = sys._getframe(level)
+ try:
+ self.globals.update(gbls or frame.f_globals)
+ self.locals.update(lcls or frame.f_locals)
+ finally:
+ del frame
+
+ # add some useful defaults
+ self.globals['Timestamp'] = pd.lib.Timestamp
+ self.globals['datetime'] = datetime
+
+ # SUCH a hack
+ self.globals['True'] = True
+ self.globals['False'] = False
+
+ res_keys = (list(o.keys()) for o in self.resolvers)
+ self.resolver_keys = frozenset(reduce(operator.add, res_keys, []))
+ self._global_resolvers = self.resolvers + (self.locals, self.globals)
+ self._resolver = None
+
+ self.resolver_dict = {}
+ for o in self.resolvers:
+ self.resolver_dict.update(dict(o))
+
+ def __unicode__(self):
+        return com.pprint_thing("locals: {0}\nglobals: {1}\nresolvers: "
+                                "{2}".format(list(self.locals.keys()),
+                                             list(self.globals.keys()),
+                                             list(self.resolver_keys)))
+
+ def __getitem__(self, key):
+ return self.resolve(key, globally=False)
+
+ def resolve(self, key, globally=False):
+ resolvers = self.locals, self.globals
+ if globally:
+ resolvers = self._global_resolvers
+
+ for resolver in resolvers:
+ try:
+ return resolver[key]
+ except KeyError:
+ pass
+
+ def update(self, level=None):
+ """Update the current scope by going back `level` levels.
+
+ Parameters
+ ----------
+ level : int or None, optional, default None
+ """
+ # we are always 2 levels below the caller
+ # plus the caller may be below the env level
+        # in which case we need additional levels
+ sl = 2
+ if level is not None:
+ sl += level
+
+ # add sl frames to the scope starting with the
+        # most distant and overwriting with more current
+ # makes sure that we can capture variable scope
+ frame = inspect.currentframe()
+ try:
+ frames = []
+ while sl >= 0:
+ frame = frame.f_back
+ sl -= 1
+ frames.append(frame)
+ for f in frames[::-1]:
+ self.locals.update(f.f_locals)
+ self.globals.update(f.f_globals)
+ finally:
+ del frame, frames
+
+ def add_tmp(self, value, where='locals'):
+ """Add a temporary variable to the scope.
+
+ Parameters
+ ----------
+ value : object
+ An arbitrary object to be assigned to a temporary variable.
+ where : basestring, optional, default 'locals', {'locals', 'globals'}
+ What scope to add the value to.
+
+ Returns
+ -------
+ name : basestring
+ The name of the temporary variable created.
+ """
+ d = getattr(self, where, None)
+
+ if d is None:
+ raise AttributeError("Cannot add value to non-existent scope "
+ "{0!r}".format(where))
+ if not isinstance(d, dict):
+ raise TypeError("Cannot add value to object of type {0!r}, "
+ "scope must be a dictionary"
+ "".format(type(d).__name__))
+ name = 'tmp_var_{0}_{1}_{2}'.format(type(value).__name__, self.ntemps,
+ _raw_hex_id(self))
+ d[name] = value
+
+ # only increment if the variable gets put in the scope
+ self.ntemps += 1
+ return name
+
+ def remove_tmp(self, name, where='locals'):
+ d = getattr(self, where, None)
+ if d is None:
+ raise AttributeError("Cannot remove value from non-existent scope "
+ "{0!r}".format(where))
+ if not isinstance(d, dict):
+ raise TypeError("Cannot remove value from object of type {0!r}, "
+ "scope must be a dictionary"
+ "".format(type(d).__name__))
+ del d[name]
+ self.ntemps -= 1
+
+
+def _rewrite_assign(source):
+ """Rewrite the assignment operator for PyTables expression that want to use
+ ``=`` as a substitute for ``==``.
+ """
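+    # e.g. the PyTables-style "index = 5" is rewritten as "index == 5"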
+ res = []
+ g = tokenize.generate_tokens(StringIO(source).readline)
+ for toknum, tokval, _, _, _ in g:
+ res.append((toknum, '==' if tokval == '=' else tokval))
+ return tokenize.untokenize(res)
+
+
+def _replace_booleans(source):
+ """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
+ precedence is changed to boolean precedence.
+ """
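+    # e.g. "(a & b) | c" becomes "(a  and  b)  or  c"; the doubled spaces
+    # are harmless to the Python parser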
+ return source.replace('|', ' or ').replace('&', ' and ')
+
+
+def _replace_locals(source, local_symbol='@'):
+ """Replace local variables with a syntacticall valid name."""
+ return source.replace(local_symbol, _LOCAL_TAG)
+
+
+def _preparse(source):
+ """Compose assignment and boolean replacement."""
+ return _replace_booleans(_rewrite_assign(source))
+
+
+def _is_type(t):
+ """Factory for a type checking function of type ``t`` or tuple of types."""
+ return lambda x: isinstance(x.value, t)
+
+
+_is_list = _is_type(list)
+_is_str = _is_type(string_types)
+
+
+# partition all AST nodes
+_all_nodes = frozenset(filter(lambda x: isinstance(x, type) and
+ issubclass(x, ast.AST),
+ (getattr(ast, node) for node in dir(ast))))
+
+
+def _filter_nodes(superclass, all_nodes=_all_nodes):
+ """Filter out AST nodes that are subclasses of ``superclass``."""
+ node_names = (node.__name__ for node in all_nodes
+ if issubclass(node, superclass))
+ return frozenset(node_names)
+
+
+_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes))
+_mod_nodes = _filter_nodes(ast.mod)
+_stmt_nodes = _filter_nodes(ast.stmt)
+_expr_nodes = _filter_nodes(ast.expr)
+_expr_context_nodes = _filter_nodes(ast.expr_context)
+_slice_nodes = _filter_nodes(ast.slice)
+_boolop_nodes = _filter_nodes(ast.boolop)
+_operator_nodes = _filter_nodes(ast.operator)
+_unary_op_nodes = _filter_nodes(ast.unaryop)
+_cmp_op_nodes = _filter_nodes(ast.cmpop)
+_comprehension_nodes = _filter_nodes(ast.comprehension)
+_handler_nodes = _filter_nodes(ast.excepthandler)
+_arguments_nodes = _filter_nodes(ast.arguments)
+_keyword_nodes = _filter_nodes(ast.keyword)
+_alias_nodes = _filter_nodes(ast.alias)
+
+
+# nodes that we don't support directly but are needed for parsing
+_hacked_nodes = frozenset(['Assign', 'Module', 'Expr'])
+
+
+_unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp',
+ 'DictComp', 'SetComp', 'Repr', 'Lambda',
+ 'Set', 'AST', 'Is', 'IsNot'])
+
+# these nodes are low priority or won't ever be supported (e.g., AST)
+_unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes |
+ _arguments_nodes | _keyword_nodes | _alias_nodes |
+ _expr_context_nodes | _unsupported_expr_nodes) -
+ _hacked_nodes)
+
+# in some cases we rewrite assignment to be an equality comparison, and we
+# don't want ``stmt`` and friends in there, so keep only the classes whose
+# names are capitalized
+_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
+_msg = 'cannot both support and not support {0}'.format(_unsupported_nodes &
+ _base_supported_nodes)
+assert not _unsupported_nodes & _base_supported_nodes, _msg
+
+
+def _node_not_implemented(node_name, cls):
+ """Return a function that raises a NotImplementedError with a passed node
+ name.
+ """
+ def f(self, *args, **kwargs):
+ raise NotImplementedError("{0!r} nodes are not "
+ "implemented".format(node_name))
+ return f
+
+
+def disallow(nodes):
+ """Decorator to disallow certain nodes from parsing. Raises a
+ NotImplementedError instead.
+
+ Returns
+ -------
+ disallowed : callable
+ """
+ def disallowed(cls):
+ cls.unsupported_nodes = ()
+ for node in nodes:
+ new_method = _node_not_implemented(node, cls)
+ name = 'visit_{0}'.format(node)
+ cls.unsupported_nodes += (name,)
+ setattr(cls, name, new_method)
+ return cls
+ return disallowed
+
+
+def _op_maker(op_class, op_symbol):
+ """Return a function to create an op class with its symbol already passed.
+
+ Returns
+ -------
+ f : callable
+ """
+ def f(self, node, *args, **kwargs):
+ """Return a partial function with an Op subclass with an operator
+ already passed.
+
+ Returns
+ -------
+ f : callable
+ """
+ return partial(op_class, op_symbol, *args, **kwargs)
+ return f
+
+
+_op_classes = {'binary': BinOp, 'unary': UnaryOp}
+
+
+def add_ops(op_classes):
+ """Decorator to add default implementation of ops."""
+ def f(cls):
+ for op_attr_name, op_class in compat.iteritems(op_classes):
+ ops = getattr(cls, '{0}_ops'.format(op_attr_name))
+ ops_map = getattr(cls, '{0}_op_nodes_map'.format(op_attr_name))
+ for op in ops:
+ op_node = ops_map[op]
+ if op_node is not None:
+ made_op = _op_maker(op_class, op)
+ setattr(cls, 'visit_{0}'.format(op_node), made_op)
+ return cls
+ return f
+
+
+@disallow(_unsupported_nodes)
+@add_ops(_op_classes)
+class BaseExprVisitor(ast.NodeVisitor):
+ """Custom ast walker. Parsers of other engines should subclass this class
+ if necessary.
+
+ Parameters
+ ----------
+ env : Scope
+ engine : str
+ parser : str
+ preparser : callable
+ """
+ const_type = Constant
+ term_type = Term
+
+ binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms
+ binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'In', 'NotIn',
+ 'BitAnd', 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult',
+ None, 'Pow', 'FloorDiv', 'Mod')
+ binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))
+
+ unary_ops = _unary_ops_syms
+ unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not'
+ unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))
+
+ rewrite_map = {
+ ast.Eq: ast.In,
+ ast.NotEq: ast.NotIn,
+ ast.In: ast.In,
+ ast.NotIn: ast.NotIn
+ }
+
+ def __init__(self, env, engine, parser, preparser=_preparse):
+ self.env = env
+ self.engine = engine
+ self.parser = parser
+ self.preparser = preparser
+
+ def visit(self, node, **kwargs):
+ if isinstance(node, string_types):
+ clean = self.preparser(node)
+ node = ast.fix_missing_locations(ast.parse(clean))
+ elif not isinstance(node, ast.AST):
+ raise TypeError("Cannot visit objects of type {0!r}"
+ "".format(node.__class__.__name__))
+
+ method = 'visit_' + node.__class__.__name__
+ visitor = getattr(self, method)
+ return visitor(node, **kwargs)
+
+ def visit_Module(self, node, **kwargs):
+ if len(node.body) != 1:
+ raise SyntaxError('only a single expression is allowed')
+ expr = node.body[0]
+ return self.visit(expr, **kwargs)
+
+ def visit_Expr(self, node, **kwargs):
+ return self.visit(node.value, **kwargs)
+
+ def _rewrite_membership_op(self, node, left, right):
+ # the kind of the operator (is actually an instance)
+ op_instance = node.op
+ op_type = type(op_instance)
+
+ # must be two terms and the comparison operator must be ==/!=/in/not in
+ if is_term(left) and is_term(right) and op_type in self.rewrite_map:
+
+ left_list, right_list = map(_is_list, (left, right))
+ left_str, right_str = map(_is_str, (left, right))
+
+ # if there are any strings or lists in the expression
+ if left_list or right_list or left_str or right_str:
+ op_instance = self.rewrite_map[op_type]()
+
+ # pop the string variable out of locals and replace it with a list
+ # of one string, kind of a hack
+ if right_str:
+ self.env.remove_tmp(right.name)
+ name = self.env.add_tmp([right.value])
+ right = self.term_type(name, self.env)
+
+ if left_str:
+ self.env.remove_tmp(left.name)
+ name = self.env.add_tmp([left.value])
+ left = self.term_type(name, self.env)
+
+ op = self.visit(op_instance)
+ return op, op_instance, left, right
+
+ def _possibly_transform_eq_ne(self, node, left=None, right=None):
+ if left is None:
+ left = self.visit(node.left, side='left')
+ if right is None:
+ right = self.visit(node.right, side='right')
+ op, op_class, left, right = self._rewrite_membership_op(node, left,
+ right)
+ return op, op_class, left, right
+
+ def _possibly_eval(self, binop, eval_in_python):
+ # eval `in` and `not in` (for now) in "partial" python space
+ # things that can be evaluated in "eval" space will be turned into
+ # temporary variables. for example,
+ # [1,2] in a + 2 * b
+ # in that case a + 2 * b will be evaluated using numexpr, and the "in"
+ # call will be evaluated using isin (in python space)
+ return binop.evaluate(self.env, self.engine, self.parser,
+ self.term_type, eval_in_python)
+
+ def _possibly_evaluate_binop(self, op, op_class, lhs, rhs,
+ eval_in_python=('in', 'not in'),
+ maybe_eval_in_python=('==', '!=')):
+ res = op(lhs, rhs)
+
+ # "in"/"not in" ops are always evaluated in python
+ if res.op in eval_in_python:
+ return self._possibly_eval(res, eval_in_python)
+        elif ((lhs.return_type == object or rhs.return_type == object) and
+              self.engine != 'pytables'):
+ # evaluate "==" and "!=" in python if either of our operands has an
+ # object return type
+ return self._possibly_eval(res, eval_in_python +
+ maybe_eval_in_python)
+ return res
+
+ def visit_BinOp(self, node, **kwargs):
+ op, op_class, left, right = self._possibly_transform_eq_ne(node)
+ return self._possibly_evaluate_binop(op, op_class, left, right)
+
+ def visit_Div(self, node, **kwargs):
+ return lambda lhs, rhs: Div(lhs, rhs,
+ truediv=self.env.locals['truediv'])
+
+ def visit_UnaryOp(self, node, **kwargs):
+ op = self.visit(node.op)
+ operand = self.visit(node.operand)
+ return op(operand)
+
+ def visit_Name(self, node, **kwargs):
+ return self.term_type(node.id, self.env, **kwargs)
+
+ def visit_Num(self, node, **kwargs):
+ return self.const_type(node.n, self.env)
+
+ def visit_Str(self, node, **kwargs):
+ name = self.env.add_tmp(node.s)
+ return self.term_type(name, self.env)
+
+ def visit_List(self, node, **kwargs):
+ name = self.env.add_tmp([self.visit(e).value for e in node.elts])
+ return self.term_type(name, self.env)
+
+ visit_Tuple = visit_List
+
+ def visit_Index(self, node, **kwargs):
+ """ df.index[4] """
+ return self.visit(node.value)
+
+ def visit_Subscript(self, node, **kwargs):
+ value = self.visit(node.value)
+ slobj = self.visit(node.slice)
+ result = pd.eval(slobj, local_dict=self.env, engine=self.engine,
+ parser=self.parser)
+ try:
+ # a Term instance
+ v = value.value[result]
+ except AttributeError:
+ # an Op instance
+ lhs = pd.eval(value, local_dict=self.env, engine=self.engine,
+ parser=self.parser)
+ v = lhs[result]
+ name = self.env.add_tmp(v)
+ return self.term_type(name, env=self.env)
+
+ def visit_Slice(self, node, **kwargs):
+ """ df.index[slice(4,6)] """
+ lower = node.lower
+ if lower is not None:
+ lower = self.visit(lower).value
+ upper = node.upper
+ if upper is not None:
+ upper = self.visit(upper).value
+ step = node.step
+ if step is not None:
+ step = self.visit(step).value
+
+ return slice(lower, upper, step)
+
+ def visit_Assign(self, node, **kwargs):
+ cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0],
+ comparators=[node.value])
+ return self.visit(cmpr)
+
+ def visit_Attribute(self, node, **kwargs):
+ attr = node.attr
+ value = node.value
+
+ ctx = node.ctx
+ if isinstance(ctx, ast.Load):
+ # resolve the value
+ resolved = self.visit(value).value
+ try:
+ v = getattr(resolved, attr)
+ name = self.env.add_tmp(v)
+ return self.term_type(name, self.env)
+ except AttributeError:
+                # something like datetime.datetime where scope is overridden
+ if isinstance(value, ast.Name) and value.id == attr:
+ return resolved
+
+ raise ValueError("Invalid Attribute context {0}".format(ctx.__name__))
+
+ def visit_Call(self, node, **kwargs):
+
+ # this can happen with: datetime.datetime
+ if isinstance(node.func, ast.Attribute):
+ res = self.visit_Attribute(node.func)
+ elif not isinstance(node.func, ast.Name):
+ raise TypeError("Only named functions are supported")
+ else:
+ res = self.visit(node.func)
+
+ if res is None:
+ raise ValueError("Invalid function call {0}".format(node.func.id))
+ if hasattr(res, 'value'):
+ res = res.value
+
+ args = [self.visit(targ).value for targ in node.args]
+ if node.starargs is not None:
+ args = args + self.visit(node.starargs).value
+
+ keywords = {}
+ for key in node.keywords:
+ if not isinstance(key, ast.keyword):
+ raise ValueError("keyword error in function call "
+ "'{0}'".format(node.func.id))
+ keywords[key.arg] = self.visit(key.value).value
+ if node.kwargs is not None:
+ keywords.update(self.visit(node.kwargs).value)
+
+ return self.const_type(res(*args, **keywords), self.env)
+
+ def translate_In(self, op):
+ return op
+
+ def visit_Compare(self, node, **kwargs):
+ ops = node.ops
+ comps = node.comparators
+
+ # base case: we have something like a CMP b
+ if len(comps) == 1:
+ op = self.translate_In(ops[0])
+ binop = ast.BinOp(op=op, left=node.left, right=comps[0])
+ return self.visit(binop)
+
+ # recursive case: we have a chained comparison, a CMP b CMP c, etc.
+ left = node.left
+ values = []
+ for op, comp in zip(ops, comps):
+ new_node = self.visit(ast.Compare(comparators=[comp], left=left,
+ ops=[self.translate_In(op)]))
+ left = comp
+ values.append(new_node)
+ return self.visit(ast.BoolOp(op=ast.And(), values=values))
+
+ def _try_visit_binop(self, bop):
+ if isinstance(bop, (Op, Term)):
+ return bop
+ return self.visit(bop)
+
+ def visit_BoolOp(self, node, **kwargs):
+ def visitor(x, y):
+ lhs = self._try_visit_binop(x)
+ rhs = self._try_visit_binop(y)
+
+ op, op_class, lhs, rhs = self._possibly_transform_eq_ne(node, lhs,
+ rhs)
+ return self._possibly_evaluate_binop(op, node.op, lhs, rhs)
+
+ operands = node.values
+ return reduce(visitor, operands)
+
+
+_python_not_supported = frozenset(['Assign', 'Dict', 'Call', 'BoolOp',
+ 'In', 'NotIn'])
+_numexpr_supported_calls = frozenset(_reductions + _mathops)
+
+
+@disallow((_unsupported_nodes | _python_not_supported) -
+ (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn',
+ 'Tuple'])))
+class PandasExprVisitor(BaseExprVisitor):
+ def __init__(self, env, engine, parser,
+ preparser=lambda x: _replace_locals(_replace_booleans(x))):
+ super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)
+
+
+@disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not']))
+class PythonExprVisitor(BaseExprVisitor):
+ def __init__(self, env, engine, parser, preparser=lambda x: x):
+ super(PythonExprVisitor, self).__init__(env, engine, parser,
+ preparser=preparser)
+
+
+class Expr(StringMixin):
+ """Object encapsulating an expression.
+
+ Parameters
+ ----------
+ expr : str
+ engine : str, optional, default 'numexpr'
+ parser : str, optional, default 'pandas'
+ env : Scope, optional, default None
+ truediv : bool, optional, default True
+ level : int, optional, default 2
+ """
+ def __init__(self, expr, engine='numexpr', parser='pandas', env=None,
+ truediv=True, level=2):
+ self.expr = expr
+ self.env = _ensure_scope(level=level, local_dict=env)
+ self.engine = engine
+ self.parser = parser
+ self._visitor = _parsers[parser](self.env, self.engine, self.parser)
+ self.terms = self.parse()
+ self.truediv = truediv
+
+ def __call__(self):
+ self.env.locals['truediv'] = self.truediv
+ return self.terms(self.env)
+
+ def __unicode__(self):
+ return com.pprint_thing(self.terms)
+
+ def __len__(self):
+ return len(self.expr)
+
+ def parse(self):
+ """Parse an expression"""
+ return self._visitor.visit(self.expr)
+
+ def align(self):
+ """align a set of Terms"""
+ return self.terms.align(self.env)
+
+ @property
+ def names(self):
+ """Get the names in an expression"""
+ if is_term(self.terms):
+ return frozenset([self.terms.name])
+ return frozenset(term.name for term in com.flatten(self.terms))
+
+ def check_name_clashes(self):
+ env = self.env
+ names = self.names
+ res_keys = frozenset(env.resolver_dict.keys()) & names
+ lcl_keys = frozenset(env.locals.keys()) & names
+ gbl_keys = frozenset(env.globals.keys()) & names
+ _check_disjoint_resolver_names(res_keys, lcl_keys, gbl_keys)
+
+ def add_resolvers_to_locals(self):
+ """Add the extra scope (resolvers) to local scope
+
+ Notes
+ -----
+ This should be done after parsing and pre-evaluation, otherwise
+ unnecessary name clashes will occur.
+ """
+ self.env.locals.update(self.env.resolver_dict)
+
+
+def isexpr(s, check_names=True):
+ """Strict checking for a valid expression."""
+ try:
+ Expr(s, env=_ensure_scope() if check_names else None)
+ except SyntaxError:
+ return False
+ except NameError:
+ return not check_names
+ return True
+
+
+_parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor}
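+
+# a sketch of the lookup performed in Expr.__init__:
+#     visitor = _parsers[parser](env, engine, parser)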
diff --git a/pandas/core/expressions.py b/pandas/computation/expressions.py
similarity index 67%
rename from pandas/core/expressions.py
rename to pandas/computation/expressions.py
index b1bd104ce48a5..45c9a2d5259cb 100644
--- a/pandas/core/expressions.py
+++ b/pandas/computation/expressions.py
@@ -5,6 +5,7 @@
Offer fast expression evaluation thru numexpr
"""
+
import numpy as np
from pandas.core.common import _values_from_object
@@ -15,17 +16,19 @@
_NUMEXPR_INSTALLED = False
_USE_NUMEXPR = _NUMEXPR_INSTALLED
-_evaluate = None
-_where = None
+_evaluate = None
+_where = None
# the set of dtypes that we will allow pass to numexpr
-_ALLOWED_DTYPES = dict(evaluate = set(['int64','int32','float64','float32','bool']),
- where = set(['int64','float64','bool']))
+_ALLOWED_DTYPES = dict(
+ evaluate=set(['int64', 'int32', 'float64', 'float32', 'bool']),
+ where=set(['int64', 'float64', 'bool']))
# the minimum prod shape that we will use numexpr
-_MIN_ELEMENTS = 10000
+_MIN_ELEMENTS = 10000
+
-def set_use_numexpr(v = True):
+def set_use_numexpr(v=True):
# set/unset to use numexpr
global _USE_NUMEXPR
if _NUMEXPR_INSTALLED:
@@ -35,26 +38,25 @@ def set_use_numexpr(v = True):
global _evaluate, _where
if not _USE_NUMEXPR:
_evaluate = _evaluate_standard
- _where = _where_standard
+ _where = _where_standard
else:
_evaluate = _evaluate_numexpr
- _where = _where_numexpr
+ _where = _where_numexpr
-def set_numexpr_threads(n = None):
+
+def set_numexpr_threads(n=None):
# if we are using numexpr, set the threads to n
# otherwise reset
- try:
- if _NUMEXPR_INSTALLED and _USE_NUMEXPR:
- if n is None:
- n = ne.detect_number_of_cores()
- ne.set_num_threads(n)
- except:
- pass
+ if _NUMEXPR_INSTALLED and _USE_NUMEXPR:
+ if n is None:
+ n = ne.detect_number_of_cores()
+ ne.set_num_threads(n)
def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs):
""" standard evaluation """
- return op(a,b)
+ return op(a, b)
+
def _can_use_numexpr(op, op_str, a, b, dtype_check):
""" return a boolean if we WILL be using numexpr """
@@ -65,13 +67,13 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check):
# check for dtype compatiblity
dtypes = set()
- for o in [ a, b ]:
- if hasattr(o,'get_dtype_counts'):
+ for o in [a, b]:
+ if hasattr(o, 'get_dtype_counts'):
s = o.get_dtype_counts()
if len(s) > 1:
return False
dtypes |= set(s.index)
- elif isinstance(o,np.ndarray):
+ elif isinstance(o, np.ndarray):
dtypes |= set([o.dtype.name])
# allowed are a superset
@@ -80,52 +82,54 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check):
return False
-def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs):
+
+def _evaluate_numexpr(op, op_str, a, b, raise_on_error=False, **eval_kwargs):
result = None
if _can_use_numexpr(op, op_str, a, b, 'evaluate'):
try:
a_value, b_value = a, b
- if hasattr(a_value,'values'):
+ if hasattr(a_value, 'values'):
a_value = a_value.values
- if hasattr(b_value,'values'):
+ if hasattr(b_value, 'values'):
b_value = b_value.values
result = ne.evaluate('a_value %s b_value' % op_str,
- local_dict={ 'a_value' : a_value,
- 'b_value' : b_value },
+ local_dict={'a_value': a_value,
+ 'b_value': b_value},
casting='safe', **eval_kwargs)
except (ValueError) as detail:
if 'unknown type object' in str(detail):
pass
except (Exception) as detail:
if raise_on_error:
- raise TypeError(str(detail))
+ raise
if result is None:
- result = _evaluate_standard(op,op_str,a,b,raise_on_error)
+ result = _evaluate_standard(op, op_str, a, b, raise_on_error)
return result
def _where_standard(cond, a, b, raise_on_error=True):
- return np.where(_values_from_object(cond), _values_from_object(a), _values_from_object(b))
+ return np.where(_values_from_object(cond), _values_from_object(a),
+ _values_from_object(b))
-def _where_numexpr(cond, a, b, raise_on_error = False):
+def _where_numexpr(cond, a, b, raise_on_error=False):
result = None
if _can_use_numexpr(None, 'where', a, b, 'where'):
try:
cond_value, a_value, b_value = cond, a, b
- if hasattr(cond_value,'values'):
+ if hasattr(cond_value, 'values'):
cond_value = cond_value.values
- if hasattr(a_value,'values'):
+ if hasattr(a_value, 'values'):
a_value = a_value.values
- if hasattr(b_value,'values'):
+ if hasattr(b_value, 'values'):
b_value = b_value.values
- result = ne.evaluate('where(cond_value,a_value,b_value)',
- local_dict={ 'cond_value' : cond_value,
- 'a_value' : a_value,
- 'b_value' : b_value },
+ result = ne.evaluate('where(cond_value, a_value, b_value)',
+ local_dict={'cond_value': cond_value,
+ 'a_value': a_value,
+ 'b_value': b_value},
casting='safe')
except (ValueError) as detail:
if 'unknown type object' in str(detail):
@@ -135,7 +139,7 @@ def _where_numexpr(cond, a, b, raise_on_error = False):
raise TypeError(str(detail))
if result is None:
- result = _where_standard(cond,a,b,raise_on_error)
+ result = _where_standard(cond, a, b, raise_on_error)
return result
@@ -143,7 +147,9 @@ def _where_numexpr(cond, a, b, raise_on_error = False):
# turn myself on
set_use_numexpr(True)
-def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kwargs):
+
+def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True,
+ **eval_kwargs):
""" evaluate and return the expression of the op on a and b
Parameters
@@ -153,15 +159,18 @@ def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kw
op_str: the string version of the op
a : left operand
b : right operand
- raise_on_error : pass the error to the higher level if indicated (default is False),
- otherwise evaluate the op with and return the results
+ raise_on_error : pass the error to the higher level if indicated
+ (default is False), otherwise evaluate the op with and
+ return the results
use_numexpr : whether to try to use numexpr (default True)
"""
if use_numexpr:
- return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, **eval_kwargs)
+ return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error,
+ **eval_kwargs)
return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error)
+
def where(cond, a, b, raise_on_error=False, use_numexpr=True):
""" evaluate the where condition cond on a and b
@@ -171,8 +180,9 @@ def where(cond, a, b, raise_on_error=False, use_numexpr=True):
cond : a boolean array
a : return if cond is True
b : return if cond is False
- raise_on_error : pass the error to the higher level if indicated (default is False),
- otherwise evaluate the op with and return the results
+ raise_on_error : pass the error to the higher level if indicated
+ (default is False), otherwise evaluate the op with and
+ return the results
use_numexpr : whether to try to use numexpr (default True)
"""
diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py
new file mode 100644
index 0000000000000..debc79e33968c
--- /dev/null
+++ b/pandas/computation/ops.py
@@ -0,0 +1,510 @@
+"""Operator classes for eval.
+"""
+
+import re
+import operator as op
+from functools import partial
+from itertools import product, islice, chain
+
+import numpy as np
+
+import pandas as pd
+from pandas.compat import PY3, string_types, text_type
+import pandas.core.common as com
+from pandas.core.base import StringMixin
+from pandas.computation.common import _ensure_decoded
+
+
+_reductions = 'sum', 'prod'
+_mathops = ('sin', 'cos', 'exp', 'log', 'expm1', 'log1p', 'pow', 'div', 'sqrt',
+ 'inv', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', 'arctan',
+ 'arccosh', 'arcsinh', 'arctanh', 'arctan2', 'abs')
+
+
+_LOCAL_TAG = '__pd_eval_local_'
+_TAG_RE = re.compile('^{0}'.format(_LOCAL_TAG))
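+# e.g. the '@x' local-variable syntax is preparsed to '__pd_eval_local_x'
+# (see _replace_locals in pandas/computation/expr.py); _TAG_RE detects and
+# strips that tag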
+
+
+class UndefinedVariableError(NameError):
+ """NameError subclass for local variables."""
+ def __init__(self, *args):
+ msg = 'name {0!r} is not defined'
+ subbed = _TAG_RE.sub('', args[0])
+ if subbed != args[0]:
+ subbed = '@' + subbed
+ msg = 'local variable {0!r} is not defined'
+ super(UndefinedVariableError, self).__init__(msg.format(subbed))
+
+
+def _possibly_update_key(d, value, old_key, new_key=None):
+ if new_key is None:
+ new_key = old_key
+
+ try:
+ del d[old_key]
+ except KeyError:
+ return False
+ else:
+ d[new_key] = value
+ return True
+
+
+class Term(StringMixin):
+ def __new__(cls, name, env, side=None, encoding=None):
+ klass = Constant if not isinstance(name, string_types) else cls
+ supr_new = super(Term, klass).__new__
+ if PY3:
+ return supr_new(klass)
+ return supr_new(klass, name, env, side=side, encoding=encoding)
+
+ def __init__(self, name, env, side=None, encoding=None):
+ self._name = name
+ self.env = env
+ self.side = side
+ self.local = _TAG_RE.search(text_type(name)) is not None
+ self._value = self._resolve_name()
+ self.encoding = encoding
+
+ @property
+ def local_name(self):
+ return _TAG_RE.sub('', self.name)
+
+ def __unicode__(self):
+ return com.pprint_thing(self.name)
+
+ def __call__(self, *args, **kwargs):
+ return self.value
+
+ def evaluate(self, *args, **kwargs):
+ return self
+
+ def _resolve_name(self):
+ env = self.env
+ key = self.name
+ res = env.resolve(self.local_name, globally=not self.local)
+ self.update(res)
+
+ if res is None:
+ if not isinstance(key, string_types):
+ return key
+ raise UndefinedVariableError(key)
+
+ if hasattr(res, 'ndim') and res.ndim > 2:
+ raise NotImplementedError("N-dimensional objects, where N > 2, are"
+ " not supported with eval")
+ return res
+
+ def update(self, value):
+ """
+ search order for local (i.e., @variable) variables:
+
+ scope, key_variable
+ [('locals', 'local_name'),
+ ('globals', 'local_name'),
+ ('locals', 'key'),
+ ('globals', 'key')]
+ """
+ env = self.env
+ key = self.name
+
+ # if it's a variable name (otherwise a constant)
+ if isinstance(key, string_types):
+ if self.local:
+                # get its name WITHOUT the local tag (defined above)
+ local_name = self.local_name
+
+ # search for the local in the above specified order
+ scope_pairs = product([env.locals, env.globals],
+ [local_name, key])
+
+                # equivalent to a[::2] + a[1::2], but with iterators
+ scope_iter = chain(islice(scope_pairs, None, None, 2),
+ islice(scope_pairs, 1, None, 2))
+ for d, k in scope_iter:
+ if _possibly_update_key(d, value, k, key):
+ break
+ else:
+ raise UndefinedVariableError(key)
+ else:
+ # otherwise we look in resolvers -> locals -> globals
+ for r in (env.resolver_dict, env.locals, env.globals):
+ if _possibly_update_key(r, value, key):
+ break
+ else:
+ raise UndefinedVariableError(key)
+
+ self.value = value
+
+ @property
+ def isscalar(self):
+ return np.isscalar(self._value)
+
+ @property
+ def type(self):
+ try:
+ # potentially very slow for large, mixed dtype frames
+ return self._value.values.dtype
+ except AttributeError:
+ try:
+ # ndarray
+ return self._value.dtype
+ except AttributeError:
+ # scalar
+ return type(self._value)
+
+ return_type = type
+
+ @property
+ def raw(self):
+ return com.pprint_thing('{0}(name={1!r}, type={2})'
+ ''.format(self.__class__.__name__, self.name,
+ self.type))
+
+ @property
+ def kind(self):
+ try:
+ return self.type.__name__
+ except AttributeError:
+ return self.type.type.__name__
+
+ @property
+ def value(self):
+ kind = self.kind.lower()
+ if kind == 'datetime64':
+ try:
+ return self._value.asi8
+ except AttributeError:
+ return self._value.view('i8')
+ elif kind == 'datetime':
+ return pd.Timestamp(self._value)
+ elif kind == 'timestamp':
+ return self._value.asm8.view('i8')
+ return self._value
+
+ @value.setter
+ def value(self, new_value):
+ self._value = new_value
+
+ @property
+ def name(self):
+ return self._name
+
+ @name.setter
+ def name(self, new_name):
+ self._name = new_name
+
+ @property
+ def ndim(self):
+ try:
+ return self._value.ndim
+ except AttributeError:
+ return 0
+
+
+class Constant(Term):
+ def __init__(self, value, env, side=None, encoding=None):
+ super(Constant, self).__init__(value, env, side=side,
+ encoding=encoding)
+
+ def _resolve_name(self):
+ return self._name
+
+ @property
+ def name(self):
+ return self.value
+
+
+_bool_op_map = {'not': '~', 'and': '&', 'or': '|'}
+
+
+class Op(StringMixin):
+ """Hold an operator of unknown arity
+ """
+ def __init__(self, op, operands, *args, **kwargs):
+ self.op = _bool_op_map.get(op, op)
+ self.operands = operands
+ self.encoding = kwargs.get('encoding', None)
+
+ def __iter__(self):
+ return iter(self.operands)
+
+ def __unicode__(self):
+ """Print a generic n-ary operator and its operands using infix
+ notation"""
+ # recurse over the operands
+ parened = ('({0})'.format(com.pprint_thing(opr))
+ for opr in self.operands)
+ return com.pprint_thing(' {0} '.format(self.op).join(parened))
+
+ @property
+ def return_type(self):
+ # clobber types to bool if the op is a boolean operator
+ if self.op in (_cmp_ops_syms + _bool_ops_syms):
+ return np.bool_
+ return np.result_type(*(term.type for term in com.flatten(self)))
+
+ @property
+ def isscalar(self):
+ return all(operand.isscalar for operand in self.operands)
+
+
+def _in(x, y):
+ """Compute the vectorized membership of ``x in y`` if possible, otherwise
+ use Python.
+ """
+ try:
+ return x.isin(y)
+ except AttributeError:
+ if com.is_list_like(x):
+ try:
+ return y.isin(x)
+ except AttributeError:
+ pass
+ return x in y
+
+
+def _not_in(x, y):
+ """Compute the vectorized membership of ``x not in y`` if possible,
+ otherwise use Python.
+ """
+ try:
+ return ~x.isin(y)
+ except AttributeError:
+ if com.is_list_like(x):
+ try:
+ return ~y.isin(x)
+ except AttributeError:
+ pass
+ return x not in y
+
+
+_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', 'in', 'not in'
+_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, _in, _not_in
+_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs))
+
+_bool_ops_syms = '&', '|', 'and', 'or'
+_bool_ops_funcs = op.and_, op.or_, op.and_, op.or_
+_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs))
+
+_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%'
+_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv if PY3 else op.div,
+ op.pow, op.floordiv, op.mod)
+_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs))
+
+_special_case_arith_ops_syms = '**', '//', '%'
+_special_case_arith_ops_funcs = op.pow, op.floordiv, op.mod
+_special_case_arith_ops_dict = dict(zip(_special_case_arith_ops_syms,
+ _special_case_arith_ops_funcs))
+
+_binary_ops_dict = {}
+
+for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict):
+ _binary_ops_dict.update(d)
+
+
+def _cast_inplace(terms, dtype):
+ """Cast an expression inplace.
+
+ Parameters
+ ----------
+ terms : Op
+        The expression that should be cast.
+ dtype : str or numpy.dtype
+ The dtype to cast to.
+ """
+ dt = np.dtype(dtype)
+ for term in terms:
+ try:
+ new_value = term.value.astype(dt)
+ except AttributeError:
+ new_value = dt.type(term.value)
+ term.update(new_value)
+
+
+def is_term(obj):
+ return isinstance(obj, Term)
+
+
+class BinOp(Op):
+ """Hold a binary operator and its operands
+
+ Parameters
+ ----------
+ op : str
+ left : Term or Op
+ right : Term or Op
+ """
+ def __init__(self, op, lhs, rhs, **kwargs):
+ super(BinOp, self).__init__(op, (lhs, rhs))
+ self.lhs = lhs
+ self.rhs = rhs
+
+ self._disallow_scalar_only_bool_ops()
+
+ self.convert_values()
+
+ try:
+ self.func = _binary_ops_dict[op]
+ except KeyError:
+ # has to be made a list for python3
+ keys = list(_binary_ops_dict.keys())
+ raise ValueError('Invalid binary operator {0!r}, valid'
+ ' operators are {1}'.format(op, keys))
+
+ def __call__(self, env):
+ """Recursively evaluate an expression in Python space.
+
+ Parameters
+ ----------
+ env : Scope
+
+ Returns
+ -------
+ object
+ The result of an evaluated expression.
+ """
+ # handle truediv
+ if self.op == '/' and env.locals['truediv']:
+ self.func = op.truediv
+
+ # recurse over the left/right nodes
+ left = self.lhs(env)
+ right = self.rhs(env)
+
+ return self.func(left, right)
+
+ def evaluate(self, env, engine, parser, term_type, eval_in_python):
+ """Evaluate a binary operation *before* being passed to the engine.
+
+ Parameters
+ ----------
+ env : Scope
+ engine : str
+ parser : str
+ term_type : type
+ eval_in_python : list
+
+ Returns
+ -------
+ term_type
+ The "pre-evaluated" expression as an instance of ``term_type``
+ """
+ if engine == 'python':
+ res = self(env)
+ else:
+ # recurse over the left/right nodes
+ left = self.lhs.evaluate(env, engine=engine, parser=parser,
+ term_type=term_type,
+ eval_in_python=eval_in_python)
+ right = self.rhs.evaluate(env, engine=engine, parser=parser,
+ term_type=term_type,
+ eval_in_python=eval_in_python)
+
+ # base cases
+ if self.op in eval_in_python:
+ res = self.func(left.value, right.value)
+ else:
+ res = pd.eval(self, local_dict=env, engine=engine,
+ parser=parser)
+
+ name = env.add_tmp(res)
+ return term_type(name, env=env)
+
+ def convert_values(self):
+ """Convert datetimes to a comparable value in an expression.
+ """
+ def stringify(value):
+ if self.encoding is not None:
+ encoder = partial(com.pprint_thing_encoded,
+ encoding=self.encoding)
+ else:
+ encoder = com.pprint_thing
+ return encoder(value)
+
+ lhs, rhs = self.lhs, self.rhs
+
+ if (is_term(lhs) and lhs.kind.startswith('datetime') and is_term(rhs)
+ and rhs.isscalar):
+ v = rhs.value
+ if isinstance(v, (int, float)):
+ v = stringify(v)
+ v = _ensure_decoded(v)
+ v = pd.Timestamp(v)
+ if v.tz is not None:
+ v = v.tz_convert('UTC')
+ self.rhs.update(v)
+
+ if (is_term(rhs) and rhs.kind.startswith('datetime') and
+ is_term(lhs) and lhs.isscalar):
+ v = lhs.value
+ if isinstance(v, (int, float)):
+ v = stringify(v)
+ v = _ensure_decoded(v)
+ v = pd.Timestamp(v)
+ if v.tz is not None:
+ v = v.tz_convert('UTC')
+ self.lhs.update(v)
+
+ def _disallow_scalar_only_bool_ops(self):
+ if ((self.lhs.isscalar or self.rhs.isscalar) and
+ self.op in _bool_ops_dict and
+ (not (issubclass(self.rhs.return_type, (bool, np.bool_)) and
+ issubclass(self.lhs.return_type, (bool, np.bool_))))):
+ raise NotImplementedError("cannot evaluate scalar only bool ops")
+
+
+class Div(BinOp):
+ """Div operator to special case casting.
+
+ Parameters
+ ----------
+ lhs, rhs : Term or Op
+ The Terms or Ops in the ``/`` expression.
+ truediv : bool
+ Whether or not to use true division. With Python 3 this happens
+ regardless of the value of ``truediv``.
+ """
+ def __init__(self, lhs, rhs, truediv=True, *args, **kwargs):
+ super(Div, self).__init__('/', lhs, rhs, *args, **kwargs)
+
+ if truediv or PY3:
+ _cast_inplace(com.flatten(self), np.float_)
+
+
+_unary_ops_syms = '+', '-', '~', 'not'
+_unary_ops_funcs = op.pos, op.neg, op.invert, op.invert
+_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs))
+
+
+class UnaryOp(Op):
+ """Hold a unary operator and its operands
+
+ Parameters
+ ----------
+ op : str
+ The token used to represent the operator.
+ operand : Term or Op
+ The Term or Op operand to the operator.
+
+ Raises
+ ------
+ ValueError
+ * If no function associated with the passed operator token is found.
+ """
+ def __init__(self, op, operand):
+ super(UnaryOp, self).__init__(op, (operand,))
+ self.operand = operand
+
+ try:
+ self.func = _unary_ops_dict[op]
+ except KeyError:
+ raise ValueError('Invalid unary operator {0!r}, valid operators '
+ 'are {1}'.format(op, _unary_ops_syms))
+
+ def __call__(self, env):
+ operand = self.operand(env)
+ return self.func(operand)
+
+ def __unicode__(self):
+ return com.pprint_thing('{0}({1})'.format(self.op, self.operand))
diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py
new file mode 100644
index 0000000000000..9ffae5edd93bc
--- /dev/null
+++ b/pandas/computation/pytables.py
@@ -0,0 +1,573 @@
+""" manage PyTables query interface via Expressions """
+
+import ast
+import time
+import warnings
+from functools import partial
+from datetime import datetime
+
+import pandas as pd
+from pandas.compat import u, string_types, PY3
+from pandas.core.base import StringMixin
+import pandas.core.common as com
+from pandas.computation import expr, ops
+from pandas.computation.ops import is_term
+from pandas.computation.expr import BaseExprVisitor
+from pandas.computation.common import _ensure_decoded
+from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type
+
+
+class Scope(expr.Scope):
+ __slots__ = 'globals', 'locals', 'queryables'
+
+ def __init__(self, gbls=None, lcls=None, queryables=None, level=1):
+ super(Scope, self).__init__(gbls=gbls, lcls=lcls, level=level)
+ self.queryables = queryables or dict()
+
+
+class Term(ops.Term):
+ def __new__(cls, name, env, side=None, encoding=None):
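+ # a non-string "name" is a literal constant (e.g. the 5 in 'index>5'),
+ # so build a Constant instead; the base __new__ rejects extra
+ # arguments on Python 3, hence the two call forms below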
+ klass = Constant if not isinstance(name, string_types) else cls
+ supr_new = StringMixin.__new__
+ if PY3:
+ return supr_new(klass)
+ return supr_new(klass, name, env, side=side, encoding=encoding)
+
+ def __init__(self, name, env, side=None, encoding=None):
+ super(Term, self).__init__(name, env, side=side, encoding=encoding)
+
+ def _resolve_name(self):
+ # must be a queryable
+ if self.side == 'left':
+ if self.name not in self.env.queryables:
+ raise NameError('name {0!r} is not defined'.format(self.name))
+ return self.name
+
+ # resolve the rhs (and allow it to be None)
+ return self.env.locals.get(self.name,
+ self.env.globals.get(self.name, self.name))
+
+ @property
+ def value(self):
+ return self._value
+
+
+class Constant(Term):
+ def __init__(self, value, env, side=None, encoding=None):
+ super(Constant, self).__init__(value, env, side=side,
+ encoding=encoding)
+
+ def _resolve_name(self):
+ return self._name
+
+ @property
+ def name(self):
+ return self._value
+
+
+class BinOp(ops.BinOp):
+
+ _max_selectors = 31
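+ # cap on the number of '=='/'!=' selector values inlined into one
+ # generated condition; larger sets fall back to a post-read filter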
+
+ def __init__(self, op, lhs, rhs, queryables, encoding):
+ super(BinOp, self).__init__(op, lhs, rhs)
+ self.queryables = queryables
+ self.encoding = encoding
+ self.filter = None
+ self.condition = None
+
+ def _disallow_scalar_only_bool_ops(self):
+ pass
+
+ def prune(self, klass):
+
+ def pr(left, right):
+ """ create and return a new specilized BinOp from myself """
+
+ if left is None:
+ return right
+ elif right is None:
+ return left
+
+ k = klass
+ if isinstance(left, ConditionBinOp):
+ if (isinstance(left, ConditionBinOp) and
+ isinstance(right, ConditionBinOp)):
+ k = JointConditionBinOp
+ elif isinstance(left, k):
+ return left
+ elif isinstance(right, k):
+ return right
+
+ elif isinstance(left, FilterBinOp):
+ if (isinstance(left, FilterBinOp) and
+ isinstance(right, FilterBinOp)):
+ k = JointFilterBinOp
+ elif isinstance(left, k):
+ return left
+ elif isinstance(right, k):
+ return right
+
+ return k(self.op, left, right, queryables=self.queryables,
+ encoding=self.encoding).evaluate()
+
+ left, right = self.lhs, self.rhs
+
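+ # recurse into non-terminal operands, then combine the pruned (or
+ # terminal) sides into a new specialized BinOp via pr()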
+ if is_term(left) and is_term(right):
+ res = pr(left.value, right.value)
+ elif not is_term(left) and is_term(right):
+ res = pr(left.prune(klass), right.value)
+ elif is_term(left) and not is_term(right):
+ res = pr(left.value, right.prune(klass))
+ elif not (is_term(left) or is_term(right)):
+ res = pr(left.prune(klass), right.prune(klass))
+
+ return res
+
+ def conform(self, rhs):
+ """ inplace conform rhs """
+ if not com.is_list_like(rhs):
+ rhs = [rhs]
+ if hasattr(rhs, 'ravel'):
+ rhs = rhs.ravel()
+ return rhs
+
+ @property
+ def is_valid(self):
+ """ return True if this is a valid field """
+ return self.lhs in self.queryables
+
+ @property
+ def is_in_table(self):
+ """ return True if this is a valid column name for generation (e.g. an
+ actual column in the table) """
+ return self.queryables.get(self.lhs) is not None
+
+ @property
+ def kind(self):
+ """ the kind of my field """
+ return self.queryables.get(self.lhs)
+
+ def generate(self, v):
+ """ create and return the op string for this TermValue """
+ val = v.tostring(self.encoding)
+ return "(%s %s %s)" % (self.lhs, self.op, val)
+
+ def convert_value(self, v):
+ """ convert the expression that is in the term to something that is
+ accepted by pytables """
+
+ def stringify(value):
+ if self.encoding is not None:
+ encoder = partial(com.pprint_thing_encoded,
+ encoding=self.encoding)
+ else:
+ encoder = com.pprint_thing
+ return encoder(value)
+
+ kind = _ensure_decoded(self.kind)
+ if kind == u('datetime64') or kind == u('datetime'):
+ if isinstance(v, (int, float)):
+ v = stringify(v)
+ v = _ensure_decoded(v)
+ v = pd.Timestamp(v)
+ if v.tz is not None:
+ v = v.tz_convert('UTC')
+ return TermValue(v, v.value, kind)
+ elif (isinstance(v, datetime) or hasattr(v, 'timetuple') or
+ kind == u('date')):
+ v = time.mktime(v.timetuple())
+ return TermValue(v, pd.Timestamp(v), kind)
+ elif kind == u('timedelta64') or kind == u('timedelta'):
+ v = _coerce_scalar_to_timedelta_type(v, unit='s').item()
+ return TermValue(int(v), v, kind)
+ elif kind == u('integer'):
+ v = int(float(v))
+ return TermValue(v, v, kind)
+ elif kind == u('float'):
+ v = float(v)
+ return TermValue(v, v, kind)
+ elif kind == u('bool'):
+ if isinstance(v, string_types):
+ v = v.strip().lower() not in [u('false'), u('f'), u('no'),
+ u('n'), u('none'), u('0'),
+ u('[]'), u('{}'), u('')]
+ else:
+ v = bool(v)
+ return TermValue(v, v, kind)
+ elif not isinstance(v, string_types):
+ v = stringify(v)
+ return TermValue(v, stringify(v), u('string'))
+
+ # string quoting
+ return TermValue(v, stringify(v), u('string'))
+
+ def convert_values(self):
+ pass
+
+
+class FilterBinOp(BinOp):
+
+ def __unicode__(self):
+ return com.pprint_thing("[Filter : [{0}] -> "
+ "[{1}]".format(self.filter[0], self.filter[1]))
+
+ def invert(self):
+ """ invert the filter """
+ if self.filter is not None:
+ f = list(self.filter)
+ f[1] = self.generate_filter_op(invert=True)
+ self.filter = tuple(f)
+ return self
+
+ def format(self):
+ """ return the actual filter format """
+ return [self.filter]
+
+ def evaluate(self):
+
+ if not isinstance(self.lhs, string_types):
+ return self
+
+ if not self.is_valid:
+ raise ValueError("query term is not valid [%s]" % self)
+
+ rhs = self.conform(self.rhs)
+ values = [TermValue(v, v, self.kind) for v in rhs]
+
+ if self.is_in_table:
+
+ # if too many values to create the expression, use a filter instead
+ if self.op in ['==', '!='] and len(values) > self._max_selectors:
+
+ filter_op = self.generate_filter_op()
+ self.filter = (
+ self.lhs,
+ filter_op,
+ pd.Index([v.value for v in values]))
+
+ return self
+ return None
+
+ # equality conditions
+ if self.op in ['==', '!=']:
+
+ filter_op = self.generate_filter_op()
+ self.filter = (
+ self.lhs,
+ filter_op,
+ pd.Index([v.value for v in values]))
+
+ else:
+ raise TypeError(
+ "passing a filterable condition to a non-table indexer [%s]" %
+ self)
+
+ return self
+
+ def generate_filter_op(self, invert=False):
+ if (self.op == '!=' and not invert) or (self.op == '==' and invert):
+ return lambda axis, vals: ~axis.isin(vals)
+ else:
+ return lambda axis, vals: axis.isin(vals)
+
+
+class JointFilterBinOp(FilterBinOp):
+
+ def format(self):
+ raise NotImplementedError("unable to collapse Joint Filters")
+
+ def evaluate(self):
+ return self
+
+
+class ConditionBinOp(BinOp):
+
+ def __unicode__(self):
+ return com.pprint_thing("[Condition : [{0}]]".format(self.condition))
+
+ def invert(self):
+ """ invert the condition """
+ # conditions are raw numexpr strings; there is no reliable way to
+ # negate one after it has been generated
+ raise NotImplementedError("cannot use an invert condition when "
+ "passing to numexpr")
+
+ def format(self):
+ """ return the actual ne format """
+ return self.condition
+
+ def evaluate(self):
+
+ if not isinstance(self.lhs, string_types):
+ return self
+
+ if not self.is_valid:
+ raise ValueError("query term is not valid [%s]" % self)
+
+ # convert values if we are in the table
+ if not self.is_in_table:
+ return None
+
+ rhs = self.conform(self.rhs)
+ values = [self.convert_value(v) for v in rhs]
+
+ # equality conditions
+ if self.op in ['==', '!=']:
+
+ # too many values to create the expression?
+ if len(values) <= self._max_selectors:
+ vs = [self.generate(v) for v in values]
+ self.condition = "(%s)" % ' | '.join(vs)
+
+ # use a filter after reading
+ else:
+ return None
+ else:
+ self.condition = self.generate(values[0])
+
+ return self
+
+
+class JointConditionBinOp(ConditionBinOp):
+
+ def evaluate(self):
+ self.condition = "(%s %s %s)" % (
+ self.lhs.condition,
+ self.op,
+ self.rhs.condition)
+ return self
+
+
+class UnaryOp(ops.UnaryOp):
+
+ def prune(self, klass):
+
+ if self.op != '~':
+ raise NotImplementedError("UnaryOp only support invert type ops")
+
+ operand = self.operand
+ operand = operand.prune(klass)
+
+ if operand is not None:
+ if issubclass(klass, ConditionBinOp):
+ if operand.condition is not None:
+ return operand.invert()
+ elif issubclass(klass, FilterBinOp):
+ if operand.filter is not None:
+ return operand.invert()
+
+ return None
+
+
+_op_classes = {'unary': UnaryOp}
+
+
+class ExprVisitor(BaseExprVisitor):
+ const_type = Constant
+ term_type = Term
+
+ def __init__(self, env, engine, parser, **kwargs):
+ super(ExprVisitor, self).__init__(env, engine, parser)
+ for bin_op in self.binary_ops:
+ setattr(self, 'visit_{0}'.format(self.binary_op_nodes_map[bin_op]),
+ lambda node, bin_op=bin_op: partial(BinOp, bin_op,
+ **kwargs))
+
+ def visit_UnaryOp(self, node, **kwargs):
+ if isinstance(node.op, (ast.Not, ast.Invert)):
+ return UnaryOp('~', self.visit(node.operand))
+ elif isinstance(node.op, ast.USub):
+ return self.const_type(-self.visit(node.operand).value, self.env)
+ elif isinstance(node.op, ast.UAdd):
+ raise NotImplementedError('Unary addition not supported')
+
+ def visit_USub(self, node, **kwargs):
+ return self.const_type(-self.visit(node.operand).value, self.env)
+
+ def visit_Index(self, node, **kwargs):
+ return self.visit(node.value).value
+
+ def visit_Subscript(self, node, **kwargs):
+ value = self.visit(node.value)
+ slobj = self.visit(node.slice)
+ try:
+ return self.const_type(value[slobj], self.env)
+ except TypeError:
+ raise ValueError("cannot subscript {0!r} with "
+ "{1!r}".format(value, slobj))
+
+ def visit_Attribute(self, node, **kwargs):
+ attr = node.attr
+ value = node.value
+
+ ctx = node.ctx.__class__
+ if ctx == ast.Load:
+ # resolve the value
+ resolved = self.visit(value).value
+ try:
+ return getattr(resolved, attr)
+ except AttributeError:
+
+ # something like datetime.datetime where scope is overridden
+ if isinstance(value, ast.Name) and value.id == attr:
+ return resolved
+
+ raise ValueError("Invalid Attribute context {0}".format(ctx.__name__))
+
+ def translate_In(self, op):
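+ # pytables has no native 'in'; membership is rewritten as equality
+ # and later expanded into an OR of '==' terms by ConditionBinOp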
+ return ast.Eq() if isinstance(op, ast.In) else op
+
+ def _rewrite_membership_op(self, node, left, right):
+ return self.visit(node.op), node.op, left, right
+
+
+class Expr(expr.Expr):
+
+ """ hold a pytables like expression, comprised of possibly multiple 'terms'
+
+ Parameters
+ ----------
+ where : string term expression, Expr, or list-like of Exprs
+ queryables : a "kinds" map (dict of column name -> kind), or None if
+ the column is non-indexable
+ encoding : an encoding that will encode the query terms
+
+ Returns
+ -------
+ an Expr object
+
+ Examples
+ --------
+
+ 'index>=date'
+ "columns=['A', 'D']"
+ 'columns=A'
+ 'columns==A'
+ "~(columns=['A','B'])"
+ 'index>df.index[3] & string="bar"'
+ '(index>df.index[3] & index<=df.index[6]) | string="bar"'
+ "ts>=Timestamp('2012-02-01')"
+ "major_axis>=20130101"
+ """
+
+ def __init__(self, where, op=None, value=None, queryables=None,
+ encoding=None, scope_level=None):
+
+ # try to be back compat
+ where = self.parse_back_compat(where, op, value)
+
+ self.encoding = encoding
+ self.condition = None
+ self.filter = None
+ self.terms = None
+ self._visitor = None
+
+ # capture the environment if needed
+ lcls = dict()
+ if isinstance(where, Expr):
+
+ lcls.update(where.env.locals)
+ where = where.expr
+
+ elif isinstance(where, (list, tuple)):
+
+ where = list(where)
+ for idx, w in enumerate(where):
+ if isinstance(w, Expr):
+ lcls.update(w.env.locals)
+ else:
+ where[idx] = self.parse_back_compat(w)
+
+ where = ' & '.join(["(%s)" % w for w in where])
+
+ self.expr = where
+ self.env = Scope(lcls=lcls)
+ self.env.update(scope_level)
+
+ if queryables is not None and isinstance(self.expr, string_types):
+ self.env.queryables.update(queryables)
+ self._visitor = ExprVisitor(self.env, queryables=queryables,
+ parser='pytables', engine='pytables',
+ encoding=encoding)
+ self.terms = self.parse()
+
+ def parse_back_compat(self, w, op=None, value=None):
+ """ allow backward compatibility for passed arguments """
+
+ if isinstance(w, dict):
+ w, op, value = w.get('field'), w.get('op'), w.get('value')
+ if not isinstance(w, string_types):
+ raise TypeError(
+ "where must be passed as a string if op/value are passed")
+ warnings.warn("passing a dict to Expr is deprecated, "
+ "pass the where as a single string",
+ DeprecationWarning)
+
+ if op is not None:
+ if not isinstance(w, string_types):
+ raise TypeError(
+ "where must be passed as a string if op/value are passed")
+
+ if isinstance(op, Expr):
+ raise TypeError("invalid op passed, must be a string")
+ w = "{0}{1}".format(w, op)
+ if value is not None:
+ if isinstance(value, Expr):
+ raise TypeError("invalid value passed, must be a string")
+ w = "{0}{1}".format(w, value)
+
+ warnings.warn("passing multiple values to Expr is deprecated, "
+ "pass the where as a single string",
+ DeprecationWarning)
+
+ return w
+
+ def __unicode__(self):
+ if self.terms is not None:
+ return com.pprint_thing(self.terms)
+ return com.pprint_thing(self.expr)
+
+ def evaluate(self):
+ """ create and return the numexpr condition and filter """
+
+ try:
+ self.condition = self.terms.prune(ConditionBinOp)
+ except AttributeError:
+ raise ValueError("cannot process expression [{0}], [{1}] is not "
+ "a valid condition".format(self.expr, self))
+ try:
+ self.filter = self.terms.prune(FilterBinOp)
+ except AttributeError:
+ raise ValueError("cannot process expression [{0}], [{1}] is not "
+ "a valid filter".format(self.expr, self))
+
+ return self.condition, self.filter
+
+
+class TermValue(object):
+
+ """ hold a term value the we use to construct a condition/filter """
+
+ def __init__(self, value, converted, kind):
+ self.value = value
+ self.converted = converted
+ self.kind = kind
+
+ def tostring(self, encoding):
+ """ quote the string if not encoded
+ else encode and return """
+ if self.kind == u('string'):
+ if encoding is not None:
+ return self.converted
+ return '"%s"' % self.converted
+ return self.converted
+
+
+def maybe_expression(s):
+ """ loose checking if s is a pytables-acceptable expression """
+ if not isinstance(s, string_types):
+ return False
+ ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ('=',)
+
+ # make sure we have an op at least
+ return any(op in s for op in ops)
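+
+
+# illustrative usage:
+# maybe_expression('index >= 5') -> True
+# maybe_expression('foo') -> False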
diff --git a/pandas/computation/tests/__init__.py b/pandas/computation/tests/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py
new file mode 100755
index 0000000000000..d5bcf85d4de03
--- /dev/null
+++ b/pandas/computation/tests/test_eval.py
@@ -0,0 +1,1473 @@
+#!/usr/bin/env python
+
+import unittest
+import functools
+import numbers
+from itertools import product
+import ast
+
+import nose
+from nose.tools import assert_raises, assert_true, assert_false, assert_equal
+
+from numpy.random import randn, rand, randint
+import numpy as np
+from numpy.testing import assert_array_equal, assert_allclose
+from numpy.testing.decorators import slow
+
+import pandas as pd
+from pandas.core import common as com
+from pandas import DataFrame, Series, Panel, date_range
+from pandas.util.testing import makeCustomDataframe as mkdf
+
+from pandas.computation import pytables
+from pandas.computation.expressions import _USE_NUMEXPR
+from pandas.computation.engines import _engines
+from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor
+from pandas.computation.ops import (_binary_ops_dict, _unary_ops_dict,
+ _special_case_arith_ops_syms,
+ _arith_ops_syms, _bool_ops_syms)
+import pandas.computation.expr as expr
+import pandas.util.testing as tm
+from pandas.util.testing import (assert_frame_equal, randbool,
+ assertRaisesRegexp,
+ assert_produces_warning, assert_series_equal)
+from pandas.compat import PY3, u
+
+_series_frame_incompatible = _bool_ops_syms
+_scalar_skip = 'in', 'not in'
+
+def skip_if_no_ne(engine='numexpr'):
+ if not _USE_NUMEXPR and engine == 'numexpr':
+ raise nose.SkipTest("numexpr engine not installed or disabled")
+
+
+def engine_has_neg_frac(engine):
+ return _engines[engine].has_neg_frac
+
+
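+# compute the expected value of a single binary op in plain Python; for
+# engines that return nan on a negative base raised to a fractional
+# power, map Python's ValueError to nan so expectations line up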
+def _eval_single_bin(lhs, cmp1, rhs, engine):
+ c = _binary_ops_dict[cmp1]
+ if engine_has_neg_frac(engine):
+ try:
+ return c(lhs, rhs)
+ except ValueError as e:
+ try:
+ msg = e.message
+ except AttributeError:
+ msg = e
+ msg = u(msg)
+ if msg == u('negative number cannot be raised to a fractional'
+ ' power'):
+ return np.nan
+ raise
+ return c(lhs, rhs)
+
+
+def _series_and_2d_ndarray(lhs, rhs):
+ return ((isinstance(lhs, Series) and
+ isinstance(rhs, np.ndarray) and rhs.ndim > 1)
+ or (isinstance(rhs, Series) and
+ isinstance(lhs, np.ndarray) and lhs.ndim > 1))
+
+
+def _series_and_frame(lhs, rhs):
+ return ((isinstance(lhs, Series) and isinstance(rhs, DataFrame))
+ or (isinstance(rhs, Series) and isinstance(lhs, DataFrame)))
+
+
+def _bool_and_frame(lhs, rhs):
+ return isinstance(lhs, bool) and isinstance(rhs, pd.core.generic.NDFrame)
+
+
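+# decorator for the check_* methods below: operand pairs that can never
+# work (a Series against a 2-D ndarray, or scalar-only boolean ops) are
+# asserted to raise instead of being passed through to the check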
+def skip_incompatible_operand(f):
+ @functools.wraps(f)
+ def wrapper(self, lhs, arith1, rhs, *args, **kwargs):
+ if _series_and_2d_ndarray(lhs, rhs):
+ self.assertRaises(Exception, pd.eval, 'lhs {0} rhs'.format(arith1),
+ local_dict={'lhs': lhs, 'rhs': rhs},
+ engine=self.engine, parser=self.parser)
+ elif (np.isscalar(lhs) and np.isscalar(rhs) and arith1 in
+ _bool_ops_syms):
+ with tm.assertRaises(NotImplementedError):
+ pd.eval('lhs {0} rhs'.format(arith1), engine=self.engine,
+ parser=self.parser)
+ else:
+ f(self, lhs, arith1, rhs, *args, **kwargs)
+ return wrapper
+
+
+def _is_py3_complex_incompat(result, expected):
+ return (PY3 and isinstance(expected, (complex, np.complexfloating)) and
+ np.isnan(result))
+
+
+_good_arith_ops = com.difference(_arith_ops_syms, _special_case_arith_ops_syms)
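+# arithmetic ops whose behavior needs no special casing; '**', '//' and
+# '%' are exercised by dedicated tests below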
+
+
+class TestEvalNumexprPandas(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ skip_if_no_ne()
+ import numexpr as ne
+ cls.ne = ne
+ cls.engine = 'numexpr'
+ cls.parser = 'pandas'
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.engine, cls.parser
+ if hasattr(cls, 'ne'):
+ del cls.ne
+
+ def setup_data(self):
+ nan_df1 = DataFrame(rand(10, 5))
+ nan_df1[nan_df1 > 0.5] = np.nan
+ nan_df2 = DataFrame(rand(10, 5))
+ nan_df2[nan_df2 > 0.5] = np.nan
+
+ self.pandas_lhses = (DataFrame(randn(10, 5)), Series(randn(5)),
+ Series([1, 2, np.nan, np.nan, 5]), nan_df1)
+ self.pandas_rhses = (DataFrame(randn(10, 5)), Series(randn(5)),
+ Series([1, 2, np.nan, np.nan, 5]), nan_df2)
+ self.scalar_lhses = randn(), np.float64(randn()), np.nan
+ self.scalar_rhses = randn(), np.float64(randn()), np.nan
+
+ self.lhses = self.pandas_lhses + self.scalar_lhses
+ self.rhses = self.pandas_rhses + self.scalar_rhses
+
+ def setup_ops(self):
+ self.cmp_ops = expr._cmp_ops_syms
+ self.cmp2_ops = self.cmp_ops[::-1]
+ self.bin_ops = expr._bool_ops_syms
+ self.special_case_ops = _special_case_arith_ops_syms
+ self.arith_ops = _good_arith_ops
+ self.unary_ops = '-', '~', 'not '
+
+ def setUp(self):
+ self.setup_ops()
+ self.setup_data()
+ self.current_engines = [eng for eng in _engines
+ if eng != self.engine]
+
+ def tearDown(self):
+ del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses
+ del self.pandas_rhses, self.pandas_lhses, self.current_engines
+
+ @slow
+ def test_complex_cmp_ops(self):
+ for lhs, cmp1, rhs, binop, cmp2 in product(self.lhses, self.cmp_ops,
+ self.rhses, self.bin_ops,
+ self.cmp2_ops):
+ self.check_complex_cmp_op(lhs, cmp1, rhs, binop, cmp2)
+
+ def test_simple_cmp_ops(self):
+ bool_lhses = (DataFrame(randbool(size=(10, 5))),
+ Series(randbool((5,))), randbool())
+ bool_rhses = (DataFrame(randbool(size=(10, 5))),
+ Series(randbool((5,))), randbool())
+ for lhs, rhs, cmp_op in product(bool_lhses, bool_rhses, self.cmp_ops):
+ self.check_simple_cmp_op(lhs, cmp_op, rhs)
+
+ @slow
+ def test_binary_arith_ops(self):
+ for lhs, op, rhs in product(self.lhses, self.arith_ops, self.rhses):
+ self.check_binary_arith_op(lhs, op, rhs)
+
+ def test_modulus(self):
+ for lhs, rhs in product(self.lhses, self.rhses):
+ self.check_modulus(lhs, '%', rhs)
+
+ def test_floor_division(self):
+ for lhs, rhs in product(self.lhses, self.rhses):
+ self.check_floor_division(lhs, '//', rhs)
+
+ @slow
+ def test_pow(self):
+ for lhs, rhs in product(self.lhses, self.rhses):
+ self.check_pow(lhs, '**', rhs)
+
+ @slow
+ def test_single_invert_op(self):
+ for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses):
+ self.check_single_invert_op(lhs, op, rhs)
+
+ @slow
+ def test_compound_invert_op(self):
+ for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses):
+ self.check_compound_invert_op(lhs, op, rhs)
+
+ @slow
+ def test_chained_cmp_op(self):
+ mids = self.lhses
+ cmp_ops = tuple(set(self.cmp_ops) - set(['==', '!=', '<=', '>=']))
+ for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, cmp_ops,
+ mids, cmp_ops, self.rhses):
+ self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs)
+
+ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2):
+ skip_these = 'in', 'not in'
+ ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1,
+ binop=binop,
+ cmp2=cmp2)
+ scalar_with_in_notin = (np.isscalar(rhs) and (cmp1 in skip_these or
+ cmp2 in skip_these))
+ if scalar_with_in_notin:
+ self.assertRaises(TypeError, pd.eval, ex, engine=self.engine,
+ parser=self.parser, local_dict={'lhs': lhs,
+ 'rhs': rhs})
+ elif (_series_and_frame(lhs, rhs) and (cmp1 in
+ _series_frame_incompatible or
+ cmp2 in _series_frame_incompatible)):
+ self.assertRaises(TypeError, pd.eval, ex,
+ local_dict={'lhs': lhs, 'rhs': rhs},
+ engine=self.engine, parser=self.parser)
+ elif _bool_and_frame(lhs, rhs):
+ self.assertRaises(TypeError, _eval_single_bin, lhs, '&',
+ rhs, self.engine)
+ self.assertRaises(TypeError, pd.eval, ex,
+ local_dict={'lhs': lhs, 'rhs': rhs},
+ engine=self.engine, parser=self.parser)
+ elif (np.isscalar(lhs) and np.isnan(lhs) and
+ not np.isscalar(rhs) and (cmp1 in skip_these or cmp2 in
+ skip_these)):
+ with tm.assertRaises(TypeError):
+ _eval_single_bin(lhs, binop, rhs, self.engine)
+ else:
+ lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine)
+ rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine)
+ if (isinstance(lhs_new, Series) and isinstance(rhs_new, DataFrame)
+ and binop in _series_frame_incompatible):
+ pass
+ # TODO: the code below should be added back when left and right
+ # hand side bool ops are fixed.
+
+ #try:
+ #self.assertRaises(Exception, pd.eval, ex,
+ #local_dict={'lhs': lhs, 'rhs': rhs},
+ #engine=self.engine, parser=self.parser)
+ #except AssertionError:
+ #import ipdb; ipdb.set_trace()
+ #raise
+ elif (np.isscalar(lhs_new) and np.isnan(lhs_new) and
+ not np.isscalar(rhs_new) and binop in skip_these):
+ with tm.assertRaises(TypeError):
+ _eval_single_bin(lhs_new, binop, rhs_new, self.engine)
+ elif _bool_and_frame(lhs_new, rhs_new):
+ with tm.assertRaises(TypeError):
+ _eval_single_bin(lhs_new, binop, rhs_new, self.engine)
+ with tm.assertRaises(TypeError):
+ pd.eval('lhs_new {0} rhs_new'.format(binop),
+ engine=self.engine, parser=self.parser)
+ else:
+ expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ assert_array_equal(result, expected)
+
+ @skip_incompatible_operand
+ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):
+ skip_these = 'in', 'not in'
+
+ def check_operands(left, right, cmp_op):
+ if (np.isscalar(left) and np.isnan(left) and not np.isscalar(right)
+ and cmp_op in skip_these):
+ ex = 'left {0} right'.format(cmp_op)
+ with tm.assertRaises(ValueError):
+ pd.eval(ex, engine=self.engine, parser=self.parser)
+ return
+ if (np.isscalar(left) and np.isscalar(right) and
+ cmp_op in _bool_ops_syms):
+ ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2)
+ ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2)
+ ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2)
+ for ex in (ex1, ex2, ex3):
+ with assertRaises(NotImplementedError):
+ pd.eval(ex, engine=self.engine, parser=self.parser)
+ return
+ if (np.isscalar(right) and not np.isscalar(left) and cmp_op in
+ skip_these):
+ self.assertRaises(Exception, _eval_single_bin, left, cmp_op,
+ right, self.engine)
+ elif _series_and_2d_ndarray(right, left):
+ self.assertRaises(Exception, _eval_single_bin, right, cmp_op,
+ left, self.engine)
+ elif (np.isscalar(right) and np.isscalar(left) and cmp_op in
+ skip_these):
+ self.assertRaises(Exception, _eval_single_bin, right, cmp_op,
+ left, self.engine)
+ else:
+ new = _eval_single_bin(left, cmp_op, right, self.engine)
+ return new
+ return
+
+ lhs_new = check_operands(lhs, mid, cmp1)
+ rhs_new = check_operands(mid, rhs, cmp2)
+
+ if lhs_new is not None and rhs_new is not None:
+ # these are not compatible operands
+ if isinstance(lhs_new, Series) and isinstance(rhs_new, DataFrame):
+ self.assertRaises(TypeError, _eval_single_bin, lhs_new, '&',
+ rhs_new, self.engine)
+ elif (_bool_and_frame(lhs_new, rhs_new)):
+ self.assertRaises(TypeError, _eval_single_bin, lhs_new, '&',
+ rhs_new, self.engine)
+ elif _series_and_2d_ndarray(lhs_new, rhs_new):
+ # TODO: once #4319 is fixed add this test back in
+ #self.assertRaises(Exception, _eval_single_bin, lhs_new, '&',
+ #rhs_new, self.engine)
+ pass
+ else:
+ ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2)
+ ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2)
+ ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2)
+ expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine)
+
+ for ex in (ex1, ex2, ex3):
+ result = pd.eval(ex, engine=self.engine,
+ parser=self.parser)
+ assert_array_equal(result, expected)
+
+ @skip_incompatible_operand
+ def check_simple_cmp_op(self, lhs, cmp1, rhs):
+ ex = 'lhs {0} rhs'.format(cmp1)
+ if cmp1 in ('in', 'not in') and not com.is_list_like(rhs):
+ self.assertRaises(TypeError, pd.eval, ex, engine=self.engine,
+ parser=self.parser, local_dict={'lhs': lhs,
+ 'rhs': rhs})
+ else:
+ expected = _eval_single_bin(lhs, cmp1, rhs, self.engine)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ assert_array_equal(result, expected)
+
+ @skip_incompatible_operand
+ def check_binary_arith_op(self, lhs, arith1, rhs):
+ ex = 'lhs {0} rhs'.format(arith1)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ expected = _eval_single_bin(lhs, arith1, rhs, self.engine)
+ assert_array_equal(result, expected)
+ ex = 'lhs {0} rhs {0} rhs'.format(arith1)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ nlhs = _eval_single_bin(lhs, arith1, rhs,
+ self.engine)
+ self.check_alignment(result, nlhs, rhs, arith1)
+
+ def check_alignment(self, result, nlhs, ghs, op):
+ try:
+ nlhs, ghs = nlhs.align(ghs)
+ except (ValueError, TypeError, AttributeError):
+ # ValueError: series frame or frame series align
+ # TypeError, AttributeError: series or frame with scalar align
+ pass
+ else:
+ expected = self.ne.evaluate('nlhs {0} ghs'.format(op))
+ assert_array_equal(result, expected)
+
+ # the following 3 tests require special casing
+
+ @skip_incompatible_operand
+ def check_modulus(self, lhs, arith1, rhs):
+ ex = 'lhs {0} rhs'.format(arith1)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ expected = lhs % rhs
+ assert_allclose(result, expected)
+ expected = self.ne.evaluate('expected {0} rhs'.format(arith1))
+ assert_allclose(result, expected)
+
+ @skip_incompatible_operand
+ def check_floor_division(self, lhs, arith1, rhs):
+ ex = 'lhs {0} rhs'.format(arith1)
+
+ if self.engine == 'python':
+ res = pd.eval(ex, engine=self.engine, parser=self.parser)
+ expected = lhs // rhs
+ assert_array_equal(res, expected)
+ else:
+ self.assertRaises(TypeError, pd.eval, ex, local_dict={'lhs': lhs,
+ 'rhs': rhs},
+ engine=self.engine, parser=self.parser)
+
+ def get_expected_pow_result(self, lhs, rhs):
+ try:
+ expected = _eval_single_bin(lhs, '**', rhs, self.engine)
+ except ValueError as e:
+ msg = 'negative number cannot be raised to a fractional power'
+ try:
+ emsg = e.message
+ except AttributeError:
+ emsg = e
+
+ emsg = u(emsg)
+
+ if emsg == msg:
+ if self.engine == 'python':
+ raise nose.SkipTest(emsg)
+ else:
+ expected = np.nan
+ else:
+ raise
+ return expected
+
+ @skip_incompatible_operand
+ def check_pow(self, lhs, arith1, rhs):
+ ex = 'lhs {0} rhs'.format(arith1)
+ expected = self.get_expected_pow_result(lhs, rhs)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+
+ if (np.isscalar(lhs) and np.isscalar(rhs) and
+ _is_py3_complex_incompat(result, expected)):
+ self.assertRaises(AssertionError, assert_array_equal, result,
+ expected)
+ else:
+ assert_array_equal(result, expected)
+
+ ex = '(lhs {0} rhs) {0} rhs'.format(arith1)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ expected = self.get_expected_pow_result(
+ self.get_expected_pow_result(lhs, rhs), rhs)
+ assert_array_equal(result, expected)
+
+ @skip_incompatible_operand
+ def check_single_invert_op(self, lhs, cmp1, rhs):
+ # simple
+ for el in (lhs, rhs):
+ try:
+ elb = el.astype(bool)
+ except AttributeError:
+ elb = np.array([bool(el)])
+ expected = ~elb
+ result = pd.eval('~elb', engine=self.engine, parser=self.parser)
+ assert_array_equal(expected, result)
+
+ for engine in self.current_engines:
+ skip_if_no_ne(engine)
+ assert_array_equal(result, pd.eval('~elb', engine=engine,
+ parser=self.parser))
+
+ @skip_incompatible_operand
+ def check_compound_invert_op(self, lhs, cmp1, rhs):
+ skip_these = 'in', 'not in'
+ ex = '~(lhs {0} rhs)'.format(cmp1)
+
+ if np.isscalar(rhs) and cmp1 in skip_these:
+ self.assertRaises(TypeError, pd.eval, ex, engine=self.engine,
+ parser=self.parser, local_dict={'lhs': lhs,
+ 'rhs': rhs})
+ elif (np.isscalar(lhs) and np.isnan(lhs) and not np.isscalar(rhs)
+ and cmp1 in skip_these):
+ with tm.assertRaises(ValueError):
+ pd.eval(ex, engine=self.engine, parser=self.parser)
+ else:
+ # compound
+ if np.isscalar(lhs) and np.isscalar(rhs):
+ lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs))
+ expected = _eval_single_bin(lhs, cmp1, rhs, self.engine)
+ if np.isscalar(expected):
+ expected = not expected
+ else:
+ expected = ~expected
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ assert_array_equal(expected, result)
+
+ # make sure the other engines work the same as this one
+ for engine in self.current_engines:
+ skip_if_no_ne(engine)
+ ev = pd.eval(ex, engine=engine, parser=self.parser)
+ assert_array_equal(ev, result)
+
+ def ex(self, op, var_name='lhs'):
+ return '{0}{1}'.format(op, var_name)
+
+ def test_frame_invert(self):
+ expr = self.ex('~')
+
+ ## ~ ##
+ # frame
+ ## float always raises
+ lhs = DataFrame(randn(5, 2))
+ if self.engine == 'numexpr':
+ with tm.assertRaises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ with tm.assertRaises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+
+ ## int raises on numexpr
+ lhs = DataFrame(randint(5, size=(5, 2)))
+ if self.engine == 'numexpr':
+ with tm.assertRaises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = ~lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ ## bool always works
+ lhs = DataFrame(rand(5, 2) > 0.5)
+ expect = ~lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ ## object raises
+ lhs = DataFrame({'b': ['a', 1, 2.0], 'c': rand(3) > 0.5})
+ if self.engine == 'numexpr':
+ with tm.assertRaises(ValueError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ with tm.assertRaises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+
+ def test_series_invert(self):
+ #### ~ ####
+ expr = self.ex('~')
+
+ # series
+ ## float raises
+ lhs = Series(randn(5))
+ if self.engine == 'numexpr':
+ with tm.assertRaises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ with tm.assertRaises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+
+ ## int raises on numexpr
+ lhs = Series(randint(5, size=5))
+ if self.engine == 'numexpr':
+ with tm.assertRaises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = ~lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ ## bool
+ lhs = Series(rand(5) > 0.5)
+ expect = ~lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # object
+ lhs = Series(['a', 1, 2.0])
+ if self.engine == 'numexpr':
+ with tm.assertRaises(ValueError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ with tm.assertRaises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+
+ def test_frame_negate(self):
+ expr = self.ex('-')
+
+ # float
+ lhs = DataFrame(randn(5, 2))
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ # int
+ lhs = DataFrame(randint(5, size=(5, 2)))
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ # bool doesn't work with numexpr but works elsewhere
+ lhs = DataFrame(rand(5, 2) > 0.5)
+ if self.engine == 'numexpr':
+ with tm.assertRaises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ def test_series_negate(self):
+ expr = self.ex('-')
+
+ # float
+ lhs = Series(randn(5))
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # int
+ lhs = Series(randint(5, size=5))
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # bool doesn't work with numexpr but works elsewhere
+ lhs = Series(rand(5) > 0.5)
+ if self.engine == 'numexpr':
+ with tm.assertRaises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ def test_frame_pos(self):
+ expr = self.ex('+')
+
+ # float
+ lhs = DataFrame(randn(5, 2))
+ if self.engine == 'python':
+ with tm.assertRaises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ # int
+ lhs = DataFrame(randint(5, size=(5, 2)))
+ if self.engine == 'python':
+ with tm.assertRaises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ # bool doesn't work with numexpr but works elsewhere
+ lhs = DataFrame(rand(5, 2) > 0.5)
+ if self.engine == 'python':
+ with tm.assertRaises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ def test_series_pos(self):
+ expr = self.ex('+')
+
+ # float
+ lhs = Series(randn(5))
+ if self.engine == 'python':
+ with tm.assertRaises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # int
+ lhs = Series(randint(5, size=5))
+ if self.engine == 'python':
+ with tm.assertRaises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # bool doesn't work with numexpr but works elsewhere
+ lhs = Series(rand(5) > 0.5)
+ if self.engine == 'python':
+ with tm.assertRaises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ def test_scalar_unary(self):
+ with tm.assertRaises(TypeError):
+ pd.eval('~1.0', engine=self.engine, parser=self.parser)
+
+ self.assertEqual(pd.eval('-1.0', parser=self.parser, engine=self.engine), -1.0)
+ self.assertEqual(pd.eval('+1.0', parser=self.parser, engine=self.engine), +1.0)
+
+ self.assertEqual(pd.eval('~1', parser=self.parser, engine=self.engine), ~1)
+ self.assertEqual(pd.eval('-1', parser=self.parser, engine=self.engine), -1)
+ self.assertEqual(pd.eval('+1', parser=self.parser, engine=self.engine), +1)
+
+ self.assertEqual(pd.eval('~True', parser=self.parser, engine=self.engine), ~True)
+ self.assertEqual(pd.eval('~False', parser=self.parser, engine=self.engine), ~False)
+ self.assertEqual(pd.eval('-True', parser=self.parser, engine=self.engine), -True)
+ self.assertEqual(pd.eval('-False', parser=self.parser, engine=self.engine), -False)
+ self.assertEqual(pd.eval('+True', parser=self.parser, engine=self.engine), +True)
+ self.assertEqual(pd.eval('+False', parser=self.parser, engine=self.engine), +False)
+
+ def test_disallow_scalar_bool_ops(self):
+ exprs = '1 or 2', '1 and 2'
+ exprs += 'a and b', 'a or b'
+ exprs += '1 or 2 and (3 + 2) > 3',
+ exprs += '2 * x > 2 or 1 and 2',
+ exprs += '2 * df > 3 and 1 or a',
+
+ x, a, b, df = np.random.randn(3), 1, 2, DataFrame(randn(3, 2))
+ for ex in exprs:
+ with tm.assertRaises(NotImplementedError):
+ pd.eval(ex, engine=self.engine, parser=self.parser)
+
+
+class TestEvalNumexprPython(TestEvalNumexprPandas):
+ @classmethod
+ def setUpClass(cls):
+ skip_if_no_ne()
+ import numexpr as ne
+ cls.ne = ne
+ cls.engine = 'numexpr'
+ cls.parser = 'python'
+
+ def setup_ops(self):
+ self.cmp_ops = list(filter(lambda x: x not in ('in', 'not in'),
+ expr._cmp_ops_syms))
+ self.cmp2_ops = self.cmp_ops[::-1]
+ self.bin_ops = [s for s in expr._bool_ops_syms
+ if s not in ('and', 'or')]
+ self.special_case_ops = _special_case_arith_ops_syms
+ self.arith_ops = _good_arith_ops
+ self.unary_ops = '+', '-', '~'
+
+ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):
+ ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2)
+ self.assertRaises(NotImplementedError, pd.eval, ex1,
+ local_dict={'lhs': lhs, 'mid': mid, 'rhs': rhs},
+ engine=self.engine, parser=self.parser)
+
+
+class TestEvalPythonPython(TestEvalNumexprPython):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = 'python'
+ cls.parser = 'python'
+
+ @skip_incompatible_operand
+ def check_modulus(self, lhs, arith1, rhs):
+ ex = 'lhs {0} rhs'.format(arith1)
+ result = pd.eval(ex, engine=self.engine)
+ expected = lhs % rhs
+ assert_allclose(result, expected)
+ expected = eval('expected {0} rhs'.format(arith1))
+ assert_allclose(result, expected)
+
+ def check_alignment(self, result, nlhs, ghs, op):
+ try:
+ nlhs, ghs = nlhs.align(ghs)
+ except (ValueError, TypeError, AttributeError):
+ # ValueError: series frame or frame series align
+ # TypeError, AttributeError: series or frame with scalar align
+ pass
+ else:
+ expected = eval('nlhs {0} ghs'.format(op))
+ assert_array_equal(result, expected)
+
+
+class TestEvalPythonPandas(TestEvalPythonPython):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = 'python'
+ cls.parser = 'pandas'
+
+ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):
+ TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2,
+ rhs)
+
+
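+# data generator handed to makeCustomDataframe: ignores its arguments
+# and returns a random scalar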
+f = lambda *args, **kwargs: np.random.randn()
+
+
+ENGINES_PARSERS = list(product(_engines, expr._parsers))
+
+
+#-------------------------------------
+# basic and complex alignment
+
+class TestAlignment(object):
+
+ index_types = 'i', 'f', 's', 'u', 'dt', # 'p'
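+ # mkdf index-type codes: integer, float, string, unicode, datetime;
+ # 'p' (period) is currently excluded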
+
+ def check_align_nested_unary_op(self, engine, parser):
+ skip_if_no_ne(engine)
+ s = 'df * ~2'
+ df = mkdf(5, 3, data_gen_f=f)
+ res = pd.eval(s, engine=engine, parser=parser)
+ assert_frame_equal(res, df * ~2)
+
+ def test_align_nested_unary_op(self):
+ for engine, parser in ENGINES_PARSERS:
+ yield self.check_align_nested_unary_op, engine, parser
+
+ def check_basic_frame_alignment(self, engine, parser):
+ skip_if_no_ne(engine)
+ args = product(self.index_types, repeat=2)
+ for r_idx_type, c_idx_type in args:
+ df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type)
+ df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type)
+ res = pd.eval('df + df2', engine=engine, parser=parser)
+ assert_frame_equal(res, df + df2)
+
+ @slow
+ def test_basic_frame_alignment(self):
+ for engine, parser in ENGINES_PARSERS:
+ yield self.check_basic_frame_alignment, engine, parser
+
+ def check_frame_comparison(self, engine, parser):
+ skip_if_no_ne(engine)
+ args = product(self.index_types, repeat=2)
+ for r_idx_type, c_idx_type in args:
+ df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type)
+ res = pd.eval('df < 2', engine=engine, parser=parser)
+ assert_frame_equal(res, df < 2)
+
+ df3 = DataFrame(randn(*df.shape), index=df.index,
+ columns=df.columns)
+ res = pd.eval('df < df3', engine=engine, parser=parser)
+ assert_frame_equal(res, df < df3)
+
+ @slow
+ def test_frame_comparison(self):
+ for engine, parser in ENGINES_PARSERS:
+ yield self.check_frame_comparison, engine, parser
+
+ def check_medium_complex_frame_alignment(self, engine, parser):
+ skip_if_no_ne(engine)
+ args = product(self.index_types, repeat=4)
+ for r1, c1, r2, c2 in args:
+ df = mkdf(5, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1)
+ df2 = mkdf(10, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
+ df3 = mkdf(15, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
+ res = pd.eval('df + df2 + df3', engine=engine, parser=parser)
+ assert_frame_equal(res, df + df2 + df3)
+
+ @slow
+ def test_medium_complex_frame_alignment(self):
+ for engine, parser in ENGINES_PARSERS:
+ yield self.check_medium_complex_frame_alignment, engine, parser
+
+ def check_basic_frame_series_alignment(self, engine, parser):
+ skip_if_no_ne(engine)
+ def testit(r_idx_type, c_idx_type, index_name):
+ df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type)
+ index = getattr(df, index_name)
+ s = Series(np.random.randn(5), index[:5])
+
+ res = pd.eval('df + s', engine=engine, parser=parser)
+ if r_idx_type == 'dt' or c_idx_type == 'dt':
+ if engine == 'numexpr':
+ expected = df.add(s)
+ else:
+ expected = df + s
+ else:
+ expected = df + s
+ assert_frame_equal(res, expected)
+
+ args = product(self.index_types, self.index_types, ('index',
+ 'columns'))
+ for r_idx_type, c_idx_type, index_name in args:
+ testit(r_idx_type, c_idx_type, index_name)
+
+ @slow
+ def test_basic_frame_series_alignment(self):
+ for engine, parser in ENGINES_PARSERS:
+ yield self.check_basic_frame_series_alignment, engine, parser
+
+ def check_basic_series_frame_alignment(self, engine, parser):
+ skip_if_no_ne(engine)
+ def testit(r_idx_type, c_idx_type, index_name):
+ df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type)
+ index = getattr(df, index_name)
+ s = Series(np.random.randn(5), index[:5])
+
+ res = pd.eval('s + df', engine=engine, parser=parser)
+ if r_idx_type == 'dt' or c_idx_type == 'dt':
+ if engine == 'numexpr':
+ expected = df.add(s)
+ else:
+ expected = s + df
+ else:
+ expected = s + df
+ assert_frame_equal(res, expected)
+
+ args = product(self.index_types, self.index_types, ('index',
+ 'columns'))
+ for r_idx_type, c_idx_type, index_name in args:
+ testit(r_idx_type, c_idx_type, index_name)
+
+ @slow
+ def test_basic_series_frame_alignment(self):
+ for engine, parser in ENGINES_PARSERS:
+ yield self.check_basic_series_frame_alignment, engine, parser
+
+ def check_series_frame_commutativity(self, engine, parser):
+ skip_if_no_ne(engine)
+ args = product(self.index_types, self.index_types, ('+', '*'),
+ ('index', 'columns'))
+ for r_idx_type, c_idx_type, op, index_name in args:
+ df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type)
+ index = getattr(df, index_name)
+ s = Series(np.random.randn(5), index[:5])
+
+ lhs = 's {0} df'.format(op)
+ rhs = 'df {0} s'.format(op)
+ a = pd.eval(lhs, engine=engine, parser=parser)
+ b = pd.eval(rhs, engine=engine, parser=parser)
+
+ if r_idx_type != 'dt' and c_idx_type != 'dt':
+ if engine == 'numexpr':
+ assert_frame_equal(a, b)
+
+ @slow
+ def test_series_frame_commutativity(self):
+ for engine, parser in ENGINES_PARSERS:
+ yield self.check_series_frame_commutativity, engine, parser
+
+ def check_complex_series_frame_alignment(self, engine, parser):
+ skip_if_no_ne(engine)
+ index_types = [self.index_types] * 4
+ args = product(('index', 'columns'), ('df', 'df2'), *index_types)
+ for index_name, obj, r1, r2, c1, c2 in args:
+ df = mkdf(10, 5, data_gen_f=f, r_idx_type=r1, c_idx_type=c1)
+ df2 = mkdf(20, 5, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
+ index = getattr(locals()[obj], index_name)
+ s = Series(np.random.randn(5), index[:5])
+
+ if r2 == 'dt' or c2 == 'dt':
+ if engine == 'numexpr':
+ expected2 = df2.add(s)
+ else:
+ expected2 = df2 + s
+ else:
+ expected2 = df2 + s
+
+ if r1 == 'dt' or c1 == 'dt':
+ if engine == 'numexpr':
+ expected = expected2.add(df)
+ else:
+ expected = expected2 + df
+ else:
+ expected = expected2 + df
+
+ res = pd.eval('df2 + s + df', engine=engine, parser=parser)
+ assert_equal(res.shape, expected.shape)
+ assert_frame_equal(res, expected)
+
+ @slow
+ def test_complex_series_frame_alignment(self):
+ for engine, parser in ENGINES_PARSERS:
+ yield self.check_complex_series_frame_alignment, engine, parser
+
+ def check_performance_warning_for_poor_alignment(self, engine, parser):
+ skip_if_no_ne(engine)
+ df = DataFrame(randn(1000, 10))
+ s = Series(randn(10000))
+ if engine == 'numexpr':
+ seen = pd.io.common.PerformanceWarning
+ else:
+ seen = False
+
+ with assert_produces_warning(seen):
+ pd.eval('df + s', engine=engine, parser=parser)
+
+ s = Series(randn(1000))
+ with assert_produces_warning(False):
+ pd.eval('df + s', engine=engine, parser=parser)
+
+ df = DataFrame(randn(10, 10000))
+ s = Series(randn(10000))
+ with assert_produces_warning(False):
+ pd.eval('df + s', engine=engine, parser=parser)
+
+ df = DataFrame(randn(10, 10))
+ s = Series(randn(10000))
+
+ is_python_engine = engine == 'python'
+
+ if not is_python_engine:
+ wrn = pd.io.common.PerformanceWarning
+ else:
+ wrn = False
+
+ with assert_produces_warning(wrn) as w:
+ pd.eval('df + s', engine=engine, parser=parser)
+
+ if not is_python_engine:
+ assert_equal(len(w), 1)
+ msg = str(w[0].message)
+ expected = ("Alignment difference on axis {0} is larger"
+ " than an order of magnitude on term {1!r}, "
+ "by more than {2:.4g}; performance may suffer"
+ "".format(1, 's', np.log10(s.size - df.shape[1])))
+ assert_equal(msg, expected)
+
+ def test_performance_warning_for_poor_alignment(self):
+ for engine, parser in ENGINES_PARSERS:
+ yield self.check_performance_warning_for_poor_alignment, engine, parser
+
+
+#------------------------------------
+# slightly more complex ops
+
+class TestOperationsNumExprPandas(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ skip_if_no_ne()
+ cls.engine = 'numexpr'
+ cls.parser = 'pandas'
+ cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.engine, cls.parser
+
+ def eval(self, *args, **kwargs):
+ kwargs['engine'] = self.engine
+ kwargs['parser'] = self.parser
+ return pd.eval(*args, **kwargs)
+
+ def test_simple_arith_ops(self):
+ ops = self.arith_ops
+
+ for op in filter(lambda x: x != '//', ops):
+ ex = '1 {0} 1'.format(op)
+ ex2 = 'x {0} 1'.format(op)
+ ex3 = '1 {0} (x + 1)'.format(op)
+
+ if op in ('in', 'not in'):
+ self.assertRaises(TypeError, pd.eval, ex,
+ engine=self.engine, parser=self.parser)
+ else:
+ expec = _eval_single_bin(1, op, 1, self.engine)
+ x = self.eval(ex, engine=self.engine, parser=self.parser)
+ assert_equal(x, expec)
+
+ expec = _eval_single_bin(x, op, 1, self.engine)
+ y = self.eval(ex2, local_dict={'x': x}, engine=self.engine,
+ parser=self.parser)
+ assert_equal(y, expec)
+
+ expec = _eval_single_bin(1, op, x + 1, self.engine)
+ y = self.eval(ex3, local_dict={'x': x},
+ engine=self.engine, parser=self.parser)
+ assert_equal(y, expec)
+
+ def test_simple_bool_ops(self):
+ for op, lhs, rhs in product(expr._bool_ops_syms, (True, False),
+ (True, False)):
+ ex = '{0} {1} {2}'.format(lhs, op, rhs)
+ res = self.eval(ex)
+ exp = eval(ex)
+ self.assertEqual(res, exp)
+
+ def test_bool_ops_with_constants(self):
+ for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'),
+ ('True', 'False')):
+ ex = '{0} {1} {2}'.format(lhs, op, rhs)
+ res = self.eval(ex)
+ exp = eval(ex)
+ self.assertEqual(res, exp)
+
+ def test_panel_fails(self):
+ x = Panel(randn(3, 4, 5))
+ y = Series(randn(10))
+ assert_raises(NotImplementedError, self.eval, 'x + y',
+ local_dict={'x': x, 'y': y})
+
+ def test_4d_ndarray_fails(self):
+ x = randn(3, 4, 5, 6)
+ y = Series(randn(10))
+ assert_raises(NotImplementedError, self.eval, 'x + y',
+ local_dict={'x': x, 'y': y})
+
+ def test_constant(self):
+ x = self.eval('1')
+ assert_equal(x, 1)
+
+ def test_single_variable(self):
+ df = DataFrame(randn(10, 2))
+ df2 = self.eval('df', local_dict={'df': df})
+ assert_frame_equal(df, df2)
+
+ def test_truediv(self):
+ s = np.array([1])
+ ex = 's / 1'
+ d = {'s': s}
+
+ if PY3:
+ res = self.eval(ex, truediv=False, local_dict=d)
+ assert_array_equal(res, np.array([1.0]))
+
+ res = self.eval(ex, truediv=True, local_dict=d)
+ assert_array_equal(res, np.array([1.0]))
+
+ res = self.eval('1 / 2', truediv=True)
+ expec = 0.5
+ self.assertEqual(res, expec)
+
+ res = self.eval('1 / 2', truediv=False)
+ expec = 0.5
+ self.assertEqual(res, expec)
+
+ res = self.eval('s / 2', truediv=False, local_dict={'s': s})
+ expec = 0.5
+ self.assertEqual(res, expec)
+
+ res = self.eval('s / 2', truediv=True, local_dict={'s': s})
+ expec = 0.5
+ self.assertEqual(res, expec)
+ else:
+ res = self.eval(ex, truediv=False, local_dict=d)
+ assert_array_equal(res, np.array([1]))
+
+ res = self.eval(ex, truediv=True, local_dict=d)
+ assert_array_equal(res, np.array([1.0]))
+
+ res = self.eval('1 / 2', truediv=True)
+ expec = 0.5
+ self.assertEqual(res, expec)
+
+ res = self.eval('1 / 2', truediv=False)
+ expec = 0
+ self.assertEqual(res, expec)
+
+ res = self.eval('s / 2', truediv=False, local_dict={'s': s})
+ expec = 0
+ self.assertEqual(res, expec)
+
+ res = self.eval('s / 2', truediv=True, local_dict={'s': s})
+ expec = 0.5
+ self.assertEqual(res, expec)
+
+ def test_failing_subscript_with_name_error(self):
+ df = DataFrame(np.random.randn(5, 3))
+ self.assertRaises(NameError, self.eval, 'df[x > 2] > 2',
+ local_dict={'df': df})
+
+ def test_lhs_expression_subscript(self):
+ df = DataFrame(np.random.randn(5, 3))
+ result = self.eval('(df + 1)[df > 2]', local_dict={'df': df})
+ expected = (df + 1)[df > 2]
+ assert_frame_equal(result, expected)
+
+ def test_attr_expression(self):
+ df = DataFrame(np.random.randn(5, 3), columns=list('abc'))
+ expr1 = 'df.a < df.b'
+ expec1 = df.a < df.b
+ expr2 = 'df.a + df.b + df.c'
+ expec2 = df.a + df.b + df.c
+ expr3 = 'df.a + df.b + df.c[df.b < 0]'
+ expec3 = df.a + df.b + df.c[df.b < 0]
+ exprs = expr1, expr2, expr3
+ expecs = expec1, expec2, expec3
+ for e, expec in zip(exprs, expecs):
+ assert_series_equal(expec, self.eval(e, local_dict={'df': df}))
+
+ def test_assignment_fails(self):
+ df = DataFrame(np.random.randn(5, 3), columns=list('abc'))
+ df2 = DataFrame(np.random.randn(5, 3))
+ expr1 = 'df = df2'
+ self.assertRaises(NotImplementedError, self.eval, expr1,
+ local_dict={'df': df, 'df2': df2})
+
+ def test_basic_period_index_boolean_expression(self):
+ df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i')
+
+ e = df < 2
+ r = self.eval('df < 2', local_dict={'df': df})
+ x = df < 2
+
+ assert_frame_equal(r, e)
+ assert_frame_equal(x, e)
+
+ def test_basic_period_index_subscript_expression(self):
+ df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i')
+ r = self.eval('df[df < 2 + 3]', local_dict={'df': df})
+ e = df[df < 2 + 3]
+ assert_frame_equal(r, e)
+
+ def test_nested_period_index_subscript_expression(self):
+ df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i')
+ r = self.eval('df[df[df < 2] < 2] + df * 2', local_dict={'df': df})
+ e = df[df[df < 2] < 2] + df * 2
+ assert_frame_equal(r, e)
+
+ def test_date_boolean(self):
+ df = DataFrame(randn(5, 3))
+ df['dates1'] = date_range('1/1/2012', periods=5)
+ res = self.eval('df.dates1 < 20130101', local_dict={'df': df},
+ engine=self.engine, parser=self.parser)
+ expec = df.dates1 < '20130101'
+ assert_series_equal(res, expec)
+
+ def test_simple_in_ops(self):
+ if self.parser != 'python':
+ res = pd.eval('1 in [1, 2]', engine=self.engine,
+ parser=self.parser)
+ self.assertTrue(res)
+
+ res = pd.eval('2 in (1, 2)', engine=self.engine,
+ parser=self.parser)
+ self.assertTrue(res)
+
+ res = pd.eval('3 in (1, 2)', engine=self.engine,
+ parser=self.parser)
+ self.assertFalse(res)
+
+ res = pd.eval('3 not in (1, 2)', engine=self.engine,
+ parser=self.parser)
+ self.assertTrue(res)
+
+ res = pd.eval('[3] not in (1, 2)', engine=self.engine,
+ parser=self.parser)
+ self.assertTrue(res)
+
+ res = pd.eval('[3] in ([3], 2)', engine=self.engine,
+ parser=self.parser)
+ self.assertTrue(res)
+
+ res = pd.eval('[[3]] in [[[3]], 2]', engine=self.engine,
+ parser=self.parser)
+ self.assertTrue(res)
+
+ res = pd.eval('(3,) in [(3,), 2]', engine=self.engine,
+ parser=self.parser)
+ self.assertTrue(res)
+
+ res = pd.eval('(3,) not in [(3,), 2]', engine=self.engine,
+ parser=self.parser)
+ self.assertFalse(res)
+
+ res = pd.eval('[(3,)] in [[(3,)], 2]', engine=self.engine,
+ parser=self.parser)
+ self.assertTrue(res)
+ else:
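+ # the 'python' parser does not implement 'in'/'not in', so each
+ # membership test should raise NotImplementedError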
+ with tm.assertRaises(NotImplementedError):
+ pd.eval('1 in [1, 2]', engine=self.engine, parser=self.parser)
+ with tm.assertRaises(NotImplementedError):
+ pd.eval('2 in (1, 2)', engine=self.engine, parser=self.parser)
+ with tm.assertRaises(NotImplementedError):
+ pd.eval('3 in (1, 2)', engine=self.engine, parser=self.parser)
+ with tm.assertRaises(NotImplementedError):
+ pd.eval('3 not in (1, 2)', engine=self.engine,
+ parser=self.parser)
+ with tm.assertRaises(NotImplementedError):
+ pd.eval('[(3,)] in (1, 2, [(3,)])', engine=self.engine,
+ parser=self.parser)
+ with tm.assertRaises(NotImplementedError):
+ pd.eval('[3] not in (1, 2, [[3]])', engine=self.engine,
+ parser=self.parser)
+
+
+class TestOperationsNumExprPython(TestOperationsNumExprPandas):
+ @classmethod
+ def setUpClass(cls):
+ if not _USE_NUMEXPR:
+ raise nose.SkipTest("numexpr engine not installed")
+ cls.engine = 'numexpr'
+ cls.parser = 'python'
+ cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
+ cls.arith_ops = filter(lambda x: x not in ('in', 'not in'),
+ cls.arith_ops)
+
+ def test_fails_and(self):
+ df = DataFrame(np.random.randn(5, 3))
+ self.assertRaises(NotImplementedError, pd.eval, 'df > 2 and df > 3',
+ local_dict={'df': df}, parser=self.parser,
+ engine=self.engine)
+
+ def test_fails_or(self):
+ df = DataFrame(np.random.randn(5, 3))
+ self.assertRaises(NotImplementedError, pd.eval, 'df > 2 or df > 3',
+ local_dict={'df': df}, parser=self.parser,
+ engine=self.engine)
+
+ def test_fails_not(self):
+ df = DataFrame(np.random.randn(5, 3))
+ self.assertRaises(NotImplementedError, pd.eval, 'not df > 2',
+ local_dict={'df': df}, parser=self.parser,
+ engine=self.engine)
+
+ def test_fails_ampersand(self):
+ df = DataFrame(np.random.randn(5, 3))
+ ex = '(df + 2)[df > 1] > 0 & (df > 0)'
+ with tm.assertRaises(NotImplementedError):
+ pd.eval(ex, parser=self.parser, engine=self.engine)
+
+ def test_fails_pipe(self):
+ df = DataFrame(np.random.randn(5, 3))
+ ex = '(df + 2)[df > 1] > 0 | (df > 0)'
+ with tm.assertRaises(NotImplementedError):
+ pd.eval(ex, parser=self.parser, engine=self.engine)
+
+ def test_bool_ops_with_constants(self):
+ for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'),
+ ('True', 'False')):
+ ex = '{0} {1} {2}'.format(lhs, op, rhs)
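+ # under the 'python' parser the 'and'/'or' keywords are not
+ # supported, so those combinations should raise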
+ if op in ('and', 'or'):
+ with tm.assertRaises(NotImplementedError):
+ self.eval(ex)
+ else:
+ res = self.eval(ex)
+ exp = eval(ex)
+ self.assertEqual(res, exp)
+
+ def test_simple_bool_ops(self):
+ for op, lhs, rhs in product(expr._bool_ops_syms, (True, False),
+ (True, False)):
+ ex = 'lhs {0} rhs'.format(op)
+ if op in ('and', 'or'):
+ with tm.assertRaises(NotImplementedError):
+ pd.eval(ex, engine=self.engine, parser=self.parser)
+ else:
+ res = pd.eval(ex, engine=self.engine, parser=self.parser)
+ exp = eval(ex)
+ self.assertEqual(res, exp)
+
+
+class TestOperationsPythonPython(TestOperationsNumExprPython):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = cls.parser = 'python'
+ cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
+ cls.arith_ops = filter(lambda x: x not in ('in', 'not in'),
+ cls.arith_ops)
+
+
+class TestOperationsPythonPandas(TestOperationsNumExprPandas):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = 'python'
+ cls.parser = 'pandas'
+ cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
+
+
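+# module-level value used by the scope tests below to check that
+# pd.eval can resolve names from the global scope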
+_var_s = randn(10)
+
+
+class TestScope(object):
+ def check_global_scope(self, e, engine, parser):
+ skip_if_no_ne(engine)
+ assert_array_equal(_var_s * 2, pd.eval(e, engine=engine,
+ parser=parser))
+
+ def test_global_scope(self):
+ e = '_var_s * 2'
+ for engine, parser in product(_engines, expr._parsers):
+ yield self.check_global_scope, e, engine, parser
+
+ def check_no_new_locals(self, engine, parser):
+ skip_if_no_ne(engine)
+ x = 1
+ lcls = locals().copy()
+ pd.eval('x + 1', local_dict=lcls, engine=engine, parser=parser)
+ lcls2 = locals().copy()
+ lcls2.pop('lcls')
+ assert_equal(lcls, lcls2)
+
+ def test_no_new_locals(self):
+ for engine, parser in product(_engines, expr._parsers):
+ yield self.check_no_new_locals, engine, parser
+
+ def check_no_new_globals(self, engine, parser):
+ skip_if_no_ne(engine)
+ x = 1
+ gbls = globals().copy()
+ pd.eval('x + 1', engine=engine, parser=parser)
+ gbls2 = globals().copy()
+ assert_equal(gbls, gbls2)
+
+ def test_no_new_globals(self):
+ for engine, parser in product(_engines, expr._parsers):
+ yield self.check_no_new_globals, engine, parser
+
+
+def test_invalid_engine():
+ skip_if_no_ne()
+ assertRaisesRegexp(KeyError, 'Invalid engine \'asdf\' passed',
+ pd.eval, 'x + y', local_dict={'x': 1, 'y': 2},
+ engine='asdf')
+
+
+def test_invalid_parser():
+ skip_if_no_ne()
+ assertRaisesRegexp(KeyError, 'Invalid parser \'asdf\' passed',
+ pd.eval, 'x + y', local_dict={'x': 1, 'y': 2},
+ parser='asdf')
+
+
+def check_is_expr_syntax(engine):
+ skip_if_no_ne(engine)
+ s = 1
+ valid1 = 's + 1'
+ valid2 = '__y + _xx'
+ assert_true(expr.isexpr(valid1, check_names=False))
+ assert_true(expr.isexpr(valid2, check_names=False))
+
+
+def check_is_expr_names(engine):
+ skip_if_no_ne(engine)
+ r, s = 1, 2
+ valid = 's + r'
+ invalid = '__y + __x'
+ assert_true(expr.isexpr(valid, check_names=True))
+ assert_false(expr.isexpr(invalid, check_names=True))
+
+
+def test_is_expr_syntax():
+ for engine in _engines:
+ yield check_is_expr_syntax, engine
+
+
+def test_is_expr_names():
+ for engine in _engines:
+ yield check_is_expr_names, engine
+
+
+_parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor,
+ 'pandas': PandasExprVisitor}
+
+
+def check_disallowed_nodes(engine, parser):
+ skip_if_no_ne(engine)
+ VisitorClass = _parsers[parser]
+ uns_ops = VisitorClass.unsupported_nodes
+ inst = VisitorClass('x + 1', engine, parser)
+
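+ # each name in unsupported_nodes refers to a visitor method that
+ # should raise NotImplementedError when called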
+ for ops in uns_ops:
+ assert_raises(NotImplementedError, getattr(inst, ops))
+
+
+def test_disallowed_nodes():
+ for engine, visitor in product(_parsers, repeat=2):
+ yield check_disallowed_nodes, engine, visitor
+
+
+def check_syntax_error_exprs(engine, parser):
+ skip_if_no_ne(engine)
+ e = 's +'
+ assert_raises(SyntaxError, pd.eval, e, engine=engine, parser=parser)
+
+
+def test_syntax_error_exprs():
+ for engine, parser in ENGINES_PARSERS:
+ yield check_syntax_error_exprs, engine, parser
+
+
+def check_name_error_exprs(engine, parser):
+ skip_if_no_ne(engine)
+ e = 's + t'
+ assert_raises(NameError, pd.eval, e, engine=engine, parser=parser)
+
+
+def test_name_error_exprs():
+ for engine, parser in ENGINES_PARSERS:
+ yield check_name_error_exprs, engine, parser
+
+
+if __name__ == '__main__':
+ nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+ exit=False)
diff --git a/pandas/core/base.py b/pandas/core/base.py
index a2f7f04053b9f..fb0d56113ede9 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -48,6 +48,7 @@ def __repr__(self):
"""
return str(self)
+
class PandasObject(StringMixin):
"""baseclass for various pandas objects"""
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 34aaa08b57171..d3fa10abc7681 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -3,17 +3,25 @@
"""
import re
+import collections
+import numbers
import codecs
import csv
import sys
+from datetime import timedelta
+
+from distutils.version import LooseVersion
+
from numpy.lib.format import read_array, write_array
import numpy as np
+
import pandas.algos as algos
import pandas.lib as lib
import pandas.tslib as tslib
from pandas import compat
-from pandas.compat import StringIO, BytesIO, range, long, u, zip, map
+from pandas.compat import (StringIO, BytesIO, range, long, u, zip, map,
+ string_types)
from datetime import timedelta
from pandas.core.config import get_option
@@ -27,14 +35,18 @@ class AmbiguousIndexError(PandasError, KeyError):
pass
_POSSIBLY_CAST_DTYPES = set([np.dtype(t)
- for t in ['M8[ns]', 'm8[ns]', 'O', 'int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64']])
+ for t in ['M8[ns]', 'm8[ns]', 'O', 'int8',
+ 'uint8', 'int16', 'uint16', 'int32',
+ 'uint32', 'int64', 'uint64']])
_NS_DTYPE = np.dtype('M8[ns]')
_TD_DTYPE = np.dtype('m8[ns]')
_INT64_DTYPE = np.dtype(np.int64)
_DATELIKE_DTYPES = set([np.dtype(t) for t in ['M8[ns]', 'm8[ns]']])
-# define abstract base classes to enable isinstance type checking on our objects
+
+# define abstract base classes to enable isinstance type checking on our
+# objects
def create_pandas_abc_type(name, attr, comp):
@classmethod
def _check(cls, inst):
@@ -44,15 +56,22 @@ def _check(cls, inst):
meta = type("ABCBase", (type,), dct)
return meta(name, tuple(), dct)
+
ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",))
ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",))
ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",))
-ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", ('sparse_series', 'sparse_time_series'))
-ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp", ('sparse_array', 'sparse_series'))
+ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp",
+ ('sparse_series',
+ 'sparse_time_series'))
+ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp",
+ ('sparse_array', 'sparse_series'))
+
class _ABCGeneric(type):
def __instancecheck__(cls, inst):
return hasattr(inst, "_data")
+
+
ABCGeneric = _ABCGeneric("ABCGeneric", tuple(), {})
def isnull(obj):
@@ -223,6 +242,35 @@ def notnull(obj):
return -res
+def _iterable_not_string(x):
+ return (isinstance(x, collections.Iterable) and
+ not isinstance(x, compat.string_types))
+
+
+def flatten(l):
+ """Flatten an arbitrarily nested sequence.
+
+ Parameters
+ ----------
+ l : sequence
+ The non-string sequence to flatten
+
+ Notes
+ -----
+ This doesn't consider strings to be sequences.
+
+ Returns
+ -------
+ flattened : generator
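+
+ Examples
+ --------
+ >>> list(flatten([1, [2, (3, 4)], 'ab']))
+ [1, 2, 3, 4, 'ab']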
+ """
+ for el in l:
+ if _iterable_not_string(el):
+ for s in flatten(el):
+ yield s
+ else:
+ yield el
+
+
def mask_missing(arr, values_to_mask):
"""
Return a masking array of same size/shape as arr
@@ -1657,7 +1705,7 @@ def is_bool(obj):
def is_integer(obj):
- return isinstance(obj, (int, long, np.integer))
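+ # numbers.Integral is the stdlib ABC for integral types (it also
+ # matches bool and Python 2's long); np.integer covers numpy scalars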
+ return isinstance(obj, (numbers.Integral, np.integer))
def is_float(obj):
@@ -1665,7 +1713,7 @@ def is_float(obj):
def is_complex(obj):
- return isinstance(obj, (complex, np.complexfloating))
+ return isinstance(obj, (numbers.Complex, np.complexfloating))
def is_iterator(obj):
@@ -1674,7 +1722,7 @@ def is_iterator(obj):
def is_number(obj):
- return isinstance(obj, (np.number, int, long, float, complex))
+ return isinstance(obj, (numbers.Number, np.number))
def is_integer_dtype(arr_or_dtype):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f56b6bc00cf15..86565a3a1d9e5 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -28,15 +28,16 @@
_coerce_to_dtypes, _DATELIKE_DTYPES, is_list_like)
from pandas.core.generic import NDFrame
from pandas.core.index import Index, MultiIndex, _ensure_index
-from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels,
- _convert_to_index_sliceable, _check_bool_indexer,
- _maybe_convert_indices)
+from pandas.core.indexing import (_maybe_droplevels,
+ _convert_to_index_sliceable,
+ _check_bool_indexer, _maybe_convert_indices)
from pandas.core.internals import (BlockManager,
create_block_manager_from_arrays,
create_block_manager_from_blocks)
from pandas.core.series import Series, _radd_compat
-import pandas.core.expressions as expressions
-from pandas.sparse.array import SparseArray
+import pandas.computation.expressions as expressions
+from pandas.computation.eval import eval as _eval
+from pandas.computation.expr import _ensure_scope
from pandas.compat.scipy import scoreatpercentile as _quantile
from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u,
OrderedDict, raise_with_traceback)
@@ -51,14 +52,12 @@
import pandas.core.datetools as datetools
import pandas.core.common as com
import pandas.core.format as fmt
-import pandas.core.generic as generic
import pandas.core.nanops as nanops
import pandas.lib as lib
-import pandas.tslib as tslib
import pandas.algos as _algos
-from pandas.core.config import get_option, set_option
+from pandas.core.config import get_option
#----------------------------------------------------------------------
# Docstring templates
@@ -1898,6 +1897,155 @@ def _getitem_frame(self, key):
raise ValueError('Must pass DataFrame with boolean values only')
return self.where(key)
+ def _get_index_resolvers(self, axis):
+ # index or columns
+ axis_index = getattr(self, axis)
+ d = dict()
+
+ for i, name in enumerate(axis_index.names):
+ if name is not None:
+ key = level = name
+ else:
+ # prefix with 'i' or 'c' depending on the input axis
+ # e.g., you must do ilevel_0 for the 0th level of an unnamed
+ # multiindex
+ level_string = '{prefix}level_{i}'.format(prefix=axis[0], i=i)
+ key = level_string
+ level = i
+
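+ # expose each level as a Series aligned on this axis so that it can
+ # be referred to by name (or ilevel_N / clevel_N) in an expression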
+ d[key] = Series(axis_index.get_level_values(level).values,
+ index=axis_index, name=level)
+
+ # put the index/columns itself in the dict
+ d[axis] = axis_index
+ return d
+
+ def query(self, expr, **kwargs):
+ """Query the columns of a frame with a boolean expression.
+
+ Parameters
+ ----------
+ expr : string
+ The query string to evaluate. The result of the evaluation of this
+ expression is first passed to :attr:`~pandas.DataFrame.loc` and if
+ that fails because of a multidimensional key (e.g., a DataFrame)
+ then the result will be passed to
+ :meth:`~pandas.DataFrame.__getitem__`.
+ kwargs : dict
+ See the documentation for :func:`~pandas.eval` for complete details
+ on the keyword arguments accepted by
+ :meth:`~pandas.DataFrame.query`.
+
+ Returns
+ -------
+ q : DataFrame or Series
+
+ Notes
+ -----
+ This method uses the top-level :func:`~pandas.eval` function to
+ evaluate the passed query.
+
+ The :meth:`~pandas.DataFrame.query` method uses a slightly
+ modified Python syntax by default. For example, the ``&`` and ``|``
+ (bitwise) operators have the precedence of their boolean cousins,
+ :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
+ however the semantics are different.
+
+ You can change the semantics of the expression by passing the keyword
+ argument ``parser='python'``. This enforces the same semantics as
+ evaluation in Python space. Likewise, you can pass ``engine='python'``
+ to evaluate an expression using Python itself as a backend. This is not
+ recommended as it is inefficient compared to using ``numexpr`` as the
+ engine.
+
+ The :attr:`~pandas.DataFrame.index` and
+ :attr:`~pandas.DataFrame.columns` attributes of the
+ :class:`~pandas.DataFrame` instance are placed in the namespace by
+ default, which allows you to treat both the index and columns of the
+ frame as a column in the frame.
+ The identifier ``index`` is used for this variable, and you can also
+ use the name of the index to identify it in a query.
+
+ For further details and examples see the ``query`` documentation in
+ :ref:`indexing <indexing.query>`.
+
+ See Also
+ --------
+ pandas.eval
+ DataFrame.eval
+
+ Examples
+ --------
+ >>> from numpy.random import randn
+ >>> from pandas import DataFrame
+ >>> df = DataFrame(randn(10, 2), columns=list('ab'))
+ >>> df.query('a > b')
+ >>> df[df.a > df.b] # same result as the previous expression
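+ >>> # under the default parser '&' has the precedence of 'and', so
+ >>> # the comparisons below need no extra parentheses
+ >>> df.query('a > b & a < 1')
+ >>> df[(df.a > df.b) & (df.a < 1)] # same result as the previous expression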
+ """
+ # need to go up at least 4 stack frames
+ # 4 expr.Scope
+ # 3 expr._ensure_scope
+ # 2 self.eval
+ # 1 self.query
+ # 0 self.query caller (implicit)
+ level = kwargs.setdefault('level', 4)
+ if level < 4:
+ raise ValueError("Going up fewer than 4 stack frames will not"
+ " capture the necessary variable scope for a "
+ "query expression")
+
+ res = self.eval(expr, **kwargs)
+
+ try:
+ return self.loc[res]
+ except ValueError:
+ # when res is multi-dimensional loc raises, but this is sometimes a
+ # valid query
+ return self[res]
+
+ def eval(self, expr, **kwargs):
+ """Evaluate an expression in the context of the calling DataFrame
+ instance.
+
+ Parameters
+ ----------
+ expr : string
+ The expression string to evaluate.
+ kwargs : dict
+ See the documentation for :func:`~pandas.eval` for complete details
+ on the keyword arguments accepted by
+ :meth:`~pandas.DataFrame.eval`.
+
+ Returns
+ -------
+ ret : ndarray, scalar, or pandas object
+
+ See Also
+ --------
+ pandas.DataFrame.query
+ pandas.eval
+
+ Notes
+ -----
+ For more details see the API documentation for :func:`~pandas.eval`.
+ For detailed examples see :ref:`enhancing performance with eval
+ `.
+
+ Examples
+ --------
+ >>> from numpy.random import randn
+ >>> from pandas import DataFrame
+ >>> df = DataFrame(randn(10, 2), columns=list('ab'))
+ >>> df.eval('a + b')
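+ >>> df.eval('a + b > 1') # comparisons work too, yielding a boolean Series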
+ """
+ resolvers = kwargs.pop('resolvers', None)
+ if resolvers is None:
+ index_resolvers = self._get_index_resolvers('index')
+ index_resolvers.update(self._get_index_resolvers('columns'))
+ resolvers = [self, index_resolvers]
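+ # the frame itself plus the index/column resolvers form the scope
+ # in which names in the expression are looked up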
+ kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs)
+ return _eval(expr, **kwargs)
+
def _slice(self, slobj, axis=0, raise_on_error=False):
axis = self._get_block_manager_axis(axis)
new_data = self._data.get_slice(
@@ -4599,6 +4747,7 @@ def combineMult(self, other):
DataFrame._setup_axes(
['index', 'columns'], info_axis=1, stat_axis=0, axes_are_reversed=True)
+
_EMPTY_SERIES = Series([])
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index c265d1590af95..11ce27b078b18 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -18,8 +18,7 @@
from pandas.sparse.array import _maybe_to_sparse, SparseArray
import pandas.lib as lib
import pandas.tslib as tslib
-import pandas.core.expressions as expressions
-from pandas.util.decorators import cache_readonly
+import pandas.computation.expressions as expressions
from pandas.tslib import Timestamp
from pandas import compat
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 893483f0f2636..beb398dfe6fd0 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -8,7 +8,6 @@
import operator
from distutils.version import LooseVersion
import types
-import warnings
from numpy import nan, ndarray
import numpy as np
@@ -18,8 +17,10 @@
_default_index, _maybe_promote, _maybe_upcast,
_asarray_tuplesafe, is_integer_dtype,
_NS_DTYPE, _TD_DTYPE,
- _infer_dtype_from_scalar, is_list_like, _values_from_object,
- _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform,
+ _infer_dtype_from_scalar, is_list_like,
+ _values_from_object,
+ _possibly_cast_to_datetime, _possibly_castable,
+ _possibly_convert_platform,
ABCSparseArray)
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
_ensure_index, _handle_legacy_indexes)
@@ -29,7 +30,6 @@
from pandas.core import generic
from pandas.core.internals import SingleBlockManager
from pandas.core.categorical import Categorical
-import pandas.core.expressions as expressions
from pandas.tseries.index import DatetimeIndex
from pandas.tseries.period import PeriodIndex, Period
from pandas.tseries.offsets import DateOffset
@@ -775,12 +775,9 @@ def put(self, *args, **kwargs):
def __len__(self):
return len(self._data)
- @property
- def size(self):
- return self.__len__()
-
def view(self, dtype=None):
- return self._constructor(self.values.view(dtype), index=self.index, name=self.name)
+ return self._constructor(self.values.view(dtype), index=self.index,
+ name=self.name)
def __array__(self, result=None):
""" the array interface, return my values """
@@ -790,7 +787,8 @@ def __array_wrap__(self, result):
"""
Gets called prior to a ufunc (and after)
"""
- return self._constructor(result, index=self.index, name=self.name, copy=False)
+ return self._constructor(result, index=self.index, name=self.name,
+ copy=False)
def __contains__(self, key):
return key in self.index
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index c8224f761ce17..b79408a1bf8d2 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2,12 +2,9 @@
High level interface to PyTables for reading and writing pandas data structures
to disk
"""
-from __future__ import print_function
# pylint: disable-msg=E1101,W0613,W0603
from datetime import datetime, date
-from pandas.compat import map, range, zip, lrange, lmap, u
-from pandas import compat
import time
import re
import copy
@@ -15,14 +12,13 @@
import warnings
import numpy as np
-import pandas
from pandas import (Series, TimeSeries, DataFrame, Panel, Panel4D, Index,
MultiIndex, Int64Index, Timestamp, _np_version_under1p7)
from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel
from pandas.sparse.array import BlockIndex, IntIndex
from pandas.tseries.api import PeriodIndex, DatetimeIndex
from pandas.core.base import StringMixin
-from pandas.core.common import adjoin, is_list_like, pprint_thing
+from pandas.core.common import adjoin, pprint_thing
from pandas.core.algorithms import match, unique
from pandas.core.categorical import Categorical
from pandas.core.common import _asarray_tuplesafe
@@ -33,8 +29,10 @@
import pandas.core.common as com
from pandas.tools.merge import concat
from pandas import compat
+from pandas.compat import u, PY3, range, lrange
from pandas.io.common import PerformanceWarning
from pandas.core.config import get_option
+from pandas.computation.pytables import Expr, maybe_expression
import pandas.lib as lib
import pandas.algos as algos
@@ -59,11 +57,31 @@ def _ensure_decoded(s):
def _ensure_encoding(encoding):
# set the encoding if we need
if encoding is None:
- if compat.PY3:
+ if PY3:
encoding = _default_encoding
return encoding
+Term = Expr
+
+
+def _ensure_term(where):
+ """
+ ensure that where is a Term or a list of Terms;
+ this makes sure that we capture the scope of any variables
+ that are passed
+ create the terms here with a scope_level=2 (we are 2 levels down)
+ """
+
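+ # for example, a string like "index > 5" becomes
+ # Term("index > 5", scope_level=2), while anything that is not an
+ # expression passes through unchanged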
+ # only consider list/tuple here as an ndarray is automatically a
+ # coordinate list
+ if isinstance(where, (list, tuple)):
+ where = [w if not maybe_expression(w) else Term(w, scope_level=2)
+ for w in where if w is not None]
+ elif maybe_expression(where):
+ where = Term(where, scope_level=2)
+ return where
+
+
class PossibleDataLossError(Exception):
pass
@@ -222,9 +240,12 @@ def get_store(path, **kwargs):
Examples
--------
+ >>> from pandas import DataFrame
+ >>> from numpy.random import randn
+ >>> bar = DataFrame(randn(10, 4))
>>> with get_store('test.h5') as store:
- >>> store['foo'] = bar # write to HDF5
- >>> bar = store['foo'] # retrieve
+ ... store['foo'] = bar # write to HDF5
+ ... bar = store['foo'] # retrieve
"""
store = None
try:
@@ -237,7 +258,8 @@ def get_store(path, **kwargs):
# interface to/from ###
-def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs):
+def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
+ append=None, **kwargs):
""" store this object, close it if we opened it """
if append:
f = lambda store: store.append(key, value, **kwargs)
@@ -245,7 +267,8 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, app
f = lambda store: store.put(key, value, **kwargs)
if isinstance(path_or_buf, compat.string_types):
- with get_store(path_or_buf, mode=mode, complevel=complevel, complib=complib) as store:
+ with get_store(path_or_buf, mode=mode, complevel=complevel,
+ complib=complib) as store:
f(store)
else:
f(path_or_buf)
@@ -332,6 +355,9 @@ class HDFStore(StringMixin):
Examples
--------
+ >>> from pandas import DataFrame
+ >>> from numpy.random import randn
+ >>> bar = DataFrame(randn(10, 4))
>>> store = HDFStore('test.h5')
>>> store['foo'] = bar # write to HDF5
>>> bar = store['foo'] # retrieve
@@ -341,9 +367,9 @@ class HDFStore(StringMixin):
def __init__(self, path, mode=None, complevel=None, complib=None,
fletcher32=False, **kwargs):
try:
- import tables as _
+ import tables
except ImportError: # pragma: no cover
- raise Exception('HDFStore requires PyTables')
+ raise ImportError('HDFStore requires PyTables')
self._path = path
if mode is None:
@@ -477,7 +503,7 @@ def open(self, mode='a'):
self._handle = h5_open(self._path, self._mode)
except IOError as e: # pragma: no cover
if 'can not be written' in str(e):
- print('Opening %s in read-only mode' % self._path)
+ print ('Opening %s in read-only mode' % self._path)
self._handle = h5_open(self._path, 'r')
else:
raise
@@ -523,7 +549,8 @@ def get(self, key):
raise KeyError('No object named %s in the file' % key)
return self._read_group(group)
- def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, auto_close=False, **kwargs):
+ def select(self, key, where=None, start=None, stop=None, columns=None,
+ iterator=False, chunksize=None, auto_close=False, **kwargs):
"""
Retrieve pandas object stored in file, optionally based on where
criteria
@@ -549,22 +576,28 @@ def select(self, key, where=None, start=None, stop=None, columns=None, iterator=
raise KeyError('No object named %s in the file' % key)
# create the storer and axes
+ where = _ensure_term(where)
s = self._create_storer(group)
s.infer_axes()
# what we are actually going to do for a chunk
def func(_start, _stop):
- return s.read(where=where, start=_start, stop=_stop, columns=columns, **kwargs)
+ return s.read(where=where, start=_start, stop=_stop,
+ columns=columns, **kwargs)
if iterator or chunksize is not None:
if not s.is_table:
raise TypeError(
"can only use an iterator or chunksize on a table")
- return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close)
+ return TableIterator(self, func, nrows=s.nrows, start=start,
+ stop=stop, chunksize=chunksize,
+ auto_close=auto_close)
- return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, auto_close=auto_close).get_values()
+ return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop,
+ auto_close=auto_close).get_values()
- def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs):
+ def select_as_coordinates(
+ self, key, where=None, start=None, stop=None, **kwargs):
"""
return the selection as an Index
@@ -575,6 +608,7 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs
start : integer (defaults to None), row number to start selection
stop : integer (defaults to None), row number to stop selection
"""
+ where = _ensure_term(where)
return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs)
def unique(self, key, column, **kwargs):
@@ -599,7 +633,9 @@ def select_column(self, key, column, **kwargs):
"""
return self.get_storer(key).read_column(column=column, **kwargs)
- def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs):
+ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
+ start=None, stop=None, iterator=False,
+ chunksize=None, auto_close=False, **kwargs):
""" Retrieve pandas objects from multiple tables
Parameters
@@ -618,16 +654,19 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star
"""
# default to single select
+ where = _ensure_term(where)
if isinstance(keys, (list, tuple)) and len(keys) == 1:
keys = keys[0]
if isinstance(keys, compat.string_types):
- return self.select(key=keys, where=where, columns=columns, start=start, stop=stop, iterator=iterator, chunksize=chunksize, **kwargs)
+ return self.select(key=keys, where=where, columns=columns,
+ start=start, stop=stop, iterator=iterator,
+ chunksize=chunksize, **kwargs)
if not isinstance(keys, (list, tuple)):
- raise Exception("keys must be a list/tuple")
+ raise TypeError("keys must be a list/tuple")
- if len(keys) == 0:
- raise Exception("keys must have a non-zero length")
+ if not len(keys):
+ raise ValueError("keys must have a non-zero length")
if selector is None:
selector = keys[0]
@@ -642,7 +681,8 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star
raise TypeError("Invalid table [%s]" % k)
if not t.is_table:
raise TypeError(
- "object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname)
+ "object [%s] is not a table, and cannot be used in all select as multiple" %
+ t.pathname)
if nrows is None:
nrows = t.nrows
@@ -655,7 +695,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star
c = self.select_as_coordinates(
selector, where, start=start, stop=stop)
nrows = len(c)
- except (Exception) as detail:
+ except Exception:
raise ValueError("invalid selector [%s]" % selector)
def func(_start, _stop):
@@ -720,6 +760,7 @@ def remove(self, key, where=None, start=None, stop=None):
raises KeyError if key is not a valid store
"""
+ where = _ensure_term(where)
try:
s = self.get_storer(key)
except:
@@ -777,8 +818,8 @@ def append(self, key, value, format=None, append=True, columns=None, dropna=None
data in the table, so be careful
"""
if columns is not None:
- raise Exception(
- "columns is not a supported keyword in append, try data_columns")
+ raise TypeError("columns is not a supported keyword in append, "
+ "try data_columns")
if dropna is None:
dropna = get_option("io.hdf.dropna_table")
@@ -809,8 +850,9 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, d
"""
if axes is not None:
- raise Exception(
- "axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead")
+ raise TypeError("axes is currently not accepted as a parameter to"
+ " append_to_multiple; you can create the "
+ "tables indepdently instead")
if not isinstance(d, dict):
raise ValueError(
@@ -876,7 +918,7 @@ def create_table_index(self, key, **kwargs):
# version requirements
_tables()
if not _table_supports_index:
- raise Exception("PyTables >= 2.3 is required for table indexing")
+ raise ValueError("PyTables >= 2.3 is required for table indexing")
s = self.get_storer(key)
if s is None:
@@ -930,7 +972,11 @@ def copy(
"""
new_store = HDFStore(
- file, mode=mode, complib=complib, complevel=complevel, fletcher32 = fletcher32)
+ file,
+ mode=mode,
+ complib=complib,
+ complevel=complevel,
+ fletcher32=fletcher32)
if keys is None:
keys = list(self.keys())
if not isinstance(keys, (tuple, list)):
@@ -1142,7 +1188,8 @@ class TableIterator(object):
kwargs : the passed kwargs
"""
- def __init__(self, store, func, nrows, start=None, stop=None, chunksize=None, auto_close=False):
+ def __init__(self, store, func, nrows, start=None, stop=None,
+ chunksize=None, auto_close=False):
self.store = store
self.func = func
self.nrows = nrows or 0
@@ -1251,7 +1298,12 @@ def set_table(self, table):
def __unicode__(self):
temp = tuple(
- map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind)))
+ map(pprint_thing,
+ (self.name,
+ self.cname,
+ self.axis,
+ self.pos,
+ self.kind)))
return "name->%s,cname->%s,axis->%s,pos->%s,kind->%s" % temp
def __eq__(self, other):
@@ -1361,9 +1413,7 @@ def validate_col(self, itemsize=None):
""" validate this column: return the compared against itemsize """
# validate this column for string truncation (or reset to the max size)
- dtype = getattr(self, 'dtype', None)
if _ensure_decoded(self.kind) == u('string'):
-
c = self.col
if c is not None:
if itemsize is None:
@@ -1467,7 +1517,8 @@ class DataCol(IndexCol):
_info_fields = ['tz']
@classmethod
- def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs):
+ def create_for_block(
+ cls, i=None, name=None, cname=None, version=None, **kwargs):
""" return a new datacol with the block i """
if cname is None:
@@ -1487,11 +1538,12 @@ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs)
return cls(name=name, cname=cname, **kwargs)
- def __init__(self, values=None, kind=None, typ=None, cname=None, data=None, block=None, **kwargs):
+ def __init__(self, values=None, kind=None, typ=None,
+ cname=None, data=None, block=None, **kwargs):
super(DataCol, self).__init__(
values=values, kind=kind, typ=typ, cname=cname, **kwargs)
self.dtype = None
- self.dtype_attr = u("%s_dtype") % self.name
+ self.dtype_attr = u("%s_dtype" % self.name)
self.set_data(data)
def __unicode__(self):
@@ -1540,7 +1592,8 @@ def set_kind(self):
if self.typ is None:
self.typ = getattr(self.description, self.cname, None)
- def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=None, **kwargs):
+ def set_atom(self, block, existing_col, min_itemsize,
+ nan_rep, info, encoding=None, **kwargs):
""" create and setup my atom from the block b """
self.values = list(block.items)
@@ -1596,7 +1649,11 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No
# end up here ###
elif inferred_type == 'string' or dtype == 'object':
self.set_atom_string(
- block, existing_col, min_itemsize, nan_rep, encoding)
+ block,
+ existing_col,
+ min_itemsize,
+ nan_rep,
+ encoding)
else:
self.set_atom_data(block)
@@ -1605,7 +1662,8 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No
def get_atom_string(self, block, itemsize):
return _tables().StringCol(itemsize=itemsize, shape=block.shape[0])
- def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding):
+ def set_atom_string(
+ self, block, existing_col, min_itemsize, nan_rep, encoding):
# fill nan items with myself
block = block.fillna(nan_rep)[0]
data = block.values
@@ -1701,13 +1759,13 @@ def validate_attr(self, append):
if (existing_fields is not None and
existing_fields != list(self.values)):
raise ValueError("appended items do not match existing items"
- " in table!")
+ " in table!")
existing_dtype = getattr(self.attrs, self.dtype_attr, None)
if (existing_dtype is not None and
existing_dtype != self.dtype):
raise ValueError("appended items dtype do not match existing items dtype"
- " in table!")
+ " in table!")
def convert(self, values, nan_rep, encoding):
""" set the data from this selection (and convert to the correct dtype if we can) """
@@ -1855,6 +1913,9 @@ def __unicode__(self):
return "%-12.12s (shape->%s)" % (self.pandas_type, s)
return self.pandas_type
+ def __str__(self):
+ return self.__repr__()
+
def set_object_info(self):
""" set my pandas type & version """
self.attrs.pandas_type = str(self.pandas_kind)
@@ -2058,7 +2119,7 @@ def read_index(self, key):
_, index = self.read_index_node(getattr(self.group, key))
return index
else: # pragma: no cover
- raise Exception('unrecognized index variety: %s' % variety)
+ raise TypeError('unrecognized index variety: %s' % variety)
def write_index(self, key, index):
if isinstance(index, MultiIndex):
@@ -2241,7 +2302,7 @@ def write_array(self, key, value, items=None):
warnings.warn(ws, PerformanceWarning)
vlarr = self._handle.createVLArray(self.group, key,
- _tables().ObjectAtom())
+ _tables().ObjectAtom())
vlarr.append(value)
elif value.dtype.type == np.datetime64:
self._handle.createArray(self.group, key, value.view('i8'))
@@ -2381,7 +2442,6 @@ def read(self, **kwargs):
sdict = {}
for name in items:
key = 'sparse_frame_%s' % name
- node = getattr(self.group, key)
s = SparseFrameFixed(self.parent, getattr(self.group, key))
s.infer_axes()
sdict[name] = s.read()
@@ -2574,7 +2634,8 @@ def validate(self, other):
oax = ov[i]
if sax != oax:
raise ValueError(
- "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c, sax, oax))
+ "invalid combinate of [%s] on appending data [%s] vs current table [%s]" %
+ (c, sax, oax))
# should never get here
raise Exception(
@@ -2706,14 +2767,14 @@ def validate_min_itemsize(self, min_itemsize):
continue
if k not in q:
raise ValueError(
- "min_itemsize has the key [%s] which is not an axis or data_column" % k)
+ "min_itemsize has the key [%s] which is not an axis or data_column" %
+ k)
@property
def indexables(self):
""" create/cache the indexables if they don't exist """
if self._indexables is None:
- d = self.description
self._indexables = []
# index columns
@@ -2848,7 +2909,8 @@ def validate_data_columns(self, data_columns, min_itemsize):
# return valid columns in the order of our axis
return [c for c in data_columns if c in axis_labels]
- def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs):
+ def create_axes(self, axes, obj, validate=True, nan_rep=None,
+ data_columns=None, min_itemsize=None, **kwargs):
""" create and return the axes
legacy tables create an indexable column, indexable index, non-indexable fields
@@ -2869,8 +2931,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
try:
axes = _AXES_MAP[type(obj)]
except:
- raise TypeError(
- "cannot properly create the storer for: [group->%s,value->%s]" %
+ raise TypeError("cannot properly create the storer for: [group->%s,value->%s]" %
(self.group._v_name, type(obj)))
# map axes to numbers
@@ -2995,8 +3056,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
try:
existing_col = existing_table.values_axes[i]
except:
- raise ValueError(
- "Incompatible appended table [%s] with existing table [%s]" %
+ raise ValueError("Incompatible appended table [%s] with existing table [%s]" %
(blocks, existing_table.values_axes))
else:
existing_col = None
@@ -3036,8 +3096,8 @@ def process_axes(self, obj, columns=None):
obj = _reindex_axis(obj, axis, labels, columns)
# apply the selection filters (but keep in the same order)
- if self.selection.filter:
- for field, op, filt in self.selection.filter:
+ if self.selection.filter is not None:
+ for field, op, filt in self.selection.filter.format():
def process_filter(field, filt):
@@ -3070,7 +3130,8 @@ def process_filter(field, filt):
return obj
- def create_description(self, complib=None, complevel=None, fletcher32=False, expectedrows=None):
+ def create_description(
+ self, complib=None, complevel=None, fletcher32=False, expectedrows=None):
""" create the description of the table from the axes & values """
# expected rows estimate
@@ -3119,8 +3180,8 @@ def read_column(self, column, where=None, **kwargs):
return False
if where is not None:
- raise Exception(
- "read_column does not currently accept a where clause")
+ raise TypeError("read_column does not currently accept a where "
+ "clause")
# find the axes
for a in self.axes:
@@ -3128,7 +3189,8 @@ def read_column(self, column, where=None, **kwargs):
if not a.is_data_indexable:
raise ValueError(
- "column [%s] can not be extracted individually; it is not data indexable" % column)
+ "column [%s] can not be extracted individually; it is not data indexable" %
+ column)
# column must be an indexable or a data column
c = getattr(self.table.cols, column)
@@ -3174,7 +3236,7 @@ class LegacyTable(Table):
ndim = 3
def write(self, **kwargs):
- raise Exception("write operations are not allowed on legacy tables!")
+ raise TypeError("write operations are not allowed on legacy tables!")
def read(self, where=None, columns=None, **kwargs):
""" we have n indexable columns, with an arbitrary number of data axes """
@@ -3418,15 +3480,14 @@ def write_data_chunk(self, indexes, mask, values):
rows = rows[~mask.ravel().astype(bool)]
except Exception as detail:
- raise Exception("cannot create row-data -> %s" % str(detail))
+ raise Exception("cannot create row-data -> %s" % detail)
try:
if len(rows):
self.table.append(rows)
self.table.flush()
except Exception as detail:
- raise Exception(
- "tables cannot write this data -> %s" % str(detail))
+ raise TypeError("tables cannot write this data -> %s" % detail)
def delete(self, where=None, **kwargs):
@@ -3626,9 +3687,9 @@ def get_attrs(self):
self.levels = []
t = self.table
self.index_axes = [a.infer(t)
- for a in self.indexables if a.is_an_indexable]
+ for a in self.indexables if a.is_an_indexable]
self.values_axes = [a.infer(t)
- for a in self.indexables if not a.is_an_indexable]
+ for a in self.indexables if not a.is_an_indexable]
self.data_columns = [a.name for a in self.values_axes]
@property
@@ -3755,7 +3816,7 @@ def _convert_index(index, encoding=None):
index_name=index_name)
if isinstance(index, MultiIndex):
- raise Exception('MultiIndex not supported here!')
+ raise TypeError('MultiIndex not supported here!')
inferred_type = lib.infer_dtype(index)
@@ -3904,32 +3965,13 @@ def _need_convert(kind):
return False
-class Term(StringMixin):
-
- """create a term object that holds a field, op, and value
-
- Parameters
- ----------
- field : dict, string term expression, or the field to operate (must be a valid index/column type of DataFrame/Panel)
- op : a valid op (defaults to '=') (optional)
- >, >=, <, <=, =, != (not equal) are allowed
- value : a value or list of values (required)
- queryables : a kinds map (dict of column name -> kind), or None i column is non-indexable
- encoding : an encoding that will encode the query terms
+class Coordinates(object):
- Returns
- -------
- a Term object
+ """ holds a returned coordinates list, useful to select the same rows from different tables
- Examples
- --------
- >>> Term(dict(field = 'index', op = '>', value = '20121114'))
- >>> Term('index', '20121114')
- >>> Term('index', '>', '20121114')
- >>> Term('index', ['20121114','20121114'])
- >>> Term('index', datetime(2012,11,14))
- >>> Term('major_axis>20121114')
- >>> Term('minor_axis', ['A','U'])
+ coordinates : holds the array of coordinates
+ group : the source group
+ where : the source where
"""
_ops = ['<=', '<', '>=', '>', '!=', '==', '=']
@@ -4134,23 +4176,13 @@ def stringify(value):
return TermValue(v, stringify(v), u('string'))
-class TermValue(object):
-
- """ hold a term value the we use to construct a condition/filter """
- def __init__(self, value, converted, kind):
- self.value = value
- self.converted = converted
- self.kind = kind
+ def __len__(self):
+ return len(self.values)
- def tostring(self, encoding):
- """ quote the string if not encoded
- else encode and return """
- if self.kind == u('string'):
- if encoding is not None:
- return self.converted
- return '"%s"' % self.converted
- return self.converted
+ def __getitem__(self, key):
+ """ return a new coordinates object, sliced by the key """
+ return Coordinates(self.values[key], self.group, self.where)
class Selection(object):
@@ -4204,41 +4236,32 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
self.terms = self.generate(where)
# create the numexpr & the filter
- if self.terms:
- terms = [t for t in self.terms if t.condition is not None]
- if len(terms):
- self.condition = "(%s)" % ' & '.join(
- [t.condition for t in terms])
- self.filter = []
- for t in self.terms:
- if t.filter is not None:
- self.filter.append(t.filter)
+ if self.terms is not None:
+ self.condition, self.filter = self.terms.evaluate()
def generate(self, where):
""" where can be a : dict,list,tuple,string """
if where is None:
return None
- if not isinstance(where, (list, tuple)):
- where = [where]
- else:
-
- # make this a list of we think that we only have a sigle term & no
- # operands inside any terms
- if not any([isinstance(w, (list, tuple, Term)) for w in where]):
-
- if not any([isinstance(w, compat.string_types) and Term._search.match(w) for w in where]):
- where = [where]
+ q = self.table.queryables()
+ try:
+ return Expr(where, queryables=q, encoding=self.table.encoding)
+ except NameError:
- queryables = self.table.queryables()
- return [Term(c, queryables=queryables, encoding=self.table.encoding) for c in where]
+ # raise a nice message, suggesting that the user should use data_columns
+ raise ValueError("The passed where expression: {0}\n"
+ " contains an invalid variable reference\n"
+ " all of the variable refrences must be a reference to\n"
+ " an axis (e.g. 'index' or 'columns'), or a data_column\n"
+ " The currently defined references are: {1}\n".format(where,','.join(q.keys())))
def select(self):
"""
generate the selection
"""
if self.condition is not None:
- return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop)
+ return self.table.table.readWhere(self.condition.format(), start=self.start, stop=self.stop)
elif self.coordinates is not None:
return self.table.table.readCoordinates(self.coordinates)
return self.table.table.read(start=self.start, stop=self.stop)
@@ -4250,7 +4273,7 @@ def select_coords(self):
if self.condition is None:
return np.arange(self.table.nrows)
- return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort=True)
+ return self.table.table.getWhereList(self.condition.format(), start=self.start, stop=self.stop, sort=True)
# utilities ###
diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py
index 34b2811876f30..1cffccea2289f 100644
--- a/pandas/io/tests/test_data.py
+++ b/pandas/io/tests/test_data.py
@@ -277,7 +277,7 @@ def setUpClass(cls):
except ImportError:
raise nose.SkipTest
- with assert_produces_warning():
+ with assert_produces_warning(FutureWarning):
cls.aapl = web.Options('aapl')
today = datetime.today()
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 861b4dd7567a0..322b626acc0ad 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -1,10 +1,9 @@
-from __future__ import print_function
-from pandas.compat import range, lrange, u
import nose
import unittest
-import os
import sys
+import os
import warnings
+from contextlib import contextmanager
import datetime
import numpy as np
@@ -23,9 +22,8 @@
assert_series_equal)
from pandas import concat, Timestamp
from pandas import compat, _np_version_under1p7
-from pandas.core import common as com
-
-from numpy.testing.decorators import slow
+from pandas.compat import range, lrange, u
+from pandas.util.testing import assert_produces_warning
try:
import tables
@@ -42,12 +40,12 @@
# contextmanager to ensure the file cleanup
def safe_remove(path):
if path is not None:
- import os
try:
os.remove(path)
except:
pass
+
def safe_close(store):
try:
if store is not None:
@@ -55,7 +53,6 @@ def safe_close(store):
except:
pass
-from contextlib import contextmanager
@contextmanager
def ensure_clean(path, mode='a', complevel=None, complib=None,
@@ -82,6 +79,7 @@ def _maybe_remove(store, key):
except:
pass
+
def compat_assert_produces_warning(w,f):
""" don't produce a warning under PY3 """
if compat.PY3:
@@ -90,6 +88,7 @@ def compat_assert_produces_warning(w,f):
with tm.assert_produces_warning(expected_warning=w):
f()
+
class TestHDFStore(unittest.TestCase):
def setUp(self):
@@ -329,8 +328,8 @@ def test_contains(self):
self.assert_('bar' not in store)
# GH 2694
- with tm.assert_produces_warning(expected_warning=tables.NaturalNameWarning):
- store['node())'] = tm.makeDataFrame()
+ warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)
+ store['node())'] = tm.makeDataFrame()
self.assert_('node())' in store)
def test_versioning(self):
@@ -751,7 +750,7 @@ def test_encoding(self):
raise nose.SkipTest('system byteorder is not little, skipping test_encoding!')
with ensure_clean(self.path) as store:
- df = DataFrame(dict(A='foo',B='bar'),index=lrange(5))
+ df = DataFrame(dict(A='foo',B='bar'),index=range(5))
df.loc[2,'A'] = np.nan
df.loc[3,'B'] = np.nan
_maybe_remove(store, 'df')
@@ -887,16 +886,16 @@ def test_append_frame_column_oriented(self):
expected = df.reindex(columns=['A'])
tm.assert_frame_equal(expected, result)
- # this isn't supported
- self.assertRaises(TypeError, store.select, 'df1', (
- 'columns=A', Term('index', '>', df.index[4])))
-
# selection on the non-indexable
result = store.select(
- 'df1', ('columns=A', Term('index', '=', df.index[0:4])))
+ 'df1', ('columns=A', Term('index=df.index[0:4]')))
expected = df.reindex(columns=['A'], index=df.index[0:4])
tm.assert_frame_equal(expected, result)
+ # this isn't supported
+ self.assertRaises(TypeError, store.select, 'df1', (
+ 'columns=A', Term('index>df.index[4]')))
+
def test_append_with_different_block_ordering(self):
#GH 4096; using same frames, but different block orderings
@@ -905,7 +904,7 @@ def test_append_with_different_block_ordering(self):
for i in range(10):
df = DataFrame(np.random.randn(10,2),columns=list('AB'))
- df['index'] = lrange(10)
+ df['index'] = range(10)
df['index'] += i*10
df['int64'] = Series([1]*len(df),dtype='int64')
df['int16'] = Series([1]*len(df),dtype='int16')
@@ -1081,7 +1080,7 @@ def check_col(key,name,size):
def check_col(key,name,size):
self.assert_(getattr(store.get_storer(key).table.description,name).itemsize == size)
- df = DataFrame(dict(A = 'foo', B = 'bar'),index=lrange(10))
+ df = DataFrame(dict(A = 'foo', B = 'bar'),index=range(10))
# a min_itemsize that creates a data_column
_maybe_remove(store, 'df')
@@ -1134,7 +1133,7 @@ def test_append_with_data_columns(self):
# data column searching (with an indexable and a data_columns)
result = store.select(
- 'df', [Term('B>0'), Term('index', '>', df.index[3])])
+ 'df', [Term('B>0'), Term('index>df.index[3]')])
df_new = df.reindex(index=df.index[4:])
expected = df_new[df_new.B > 0]
tm.assert_frame_equal(result, expected)
@@ -1146,7 +1145,7 @@ def test_append_with_data_columns(self):
df_new['string'][5:6] = 'bar'
_maybe_remove(store, 'df')
store.append('df', df_new, data_columns=['string'])
- result = store.select('df', [Term('string', '=', 'foo')])
+ result = store.select('df', [Term('string=foo')])
expected = df_new[df_new.string == 'foo']
tm.assert_frame_equal(result, expected)
@@ -1192,14 +1191,14 @@ def check_col(key,name,size):
_maybe_remove(store, 'df')
store.append(
'df', df_new, data_columns=['A', 'B', 'string', 'string2'])
- result = store.select('df', [Term('string', '=', 'foo'), Term(
+ result = store.select('df', [Term('string=foo'), Term(
'string2=foo'), Term('A>0'), Term('B<0')])
expected = df_new[(df_new.string == 'foo') & (
df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)]
tm.assert_frame_equal(result, expected)
# yield an empty frame
- result = store.select('df', [Term('string', '=', 'foo'), Term(
+ result = store.select('df', [Term('string=foo'), Term(
'string2=cool')])
expected = df_new[(df_new.string == 'foo') & (
df_new.string2 == 'cool')]
@@ -1316,9 +1315,8 @@ def test_big_table_frame(self):
raise nose.SkipTest('no big table frame')
# create and write a big table
- df = DataFrame(np.random.randn(2000 * 100, 100),
- index=lrange(2000 * 100),
- columns=['E%03d' % i for i in range(100)])
+ df = DataFrame(np.random.randn(2000 * 100, 100), index=range(
+ 2000 * 100), columns=['E%03d' % i for i in range(100)])
for x in range(20):
df['String%03d' % x] = 'string%03d' % x
@@ -1328,8 +1326,9 @@ def test_big_table_frame(self):
store.append('df', df)
rows = store.root.df.table.nrows
recons = store.select('df')
+ assert isinstance(recons, DataFrame)
- print("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x))
+ print ("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x))
def test_big_table2_frame(self):
# this is a really big table: 1m rows x 60 float columns, 20 string, 20 datetime
@@ -1340,15 +1339,14 @@ def test_big_table2_frame(self):
print ("\nbig_table2 start")
import time
start_time = time.time()
- df = DataFrame(np.random.randn(1000 * 1000, 60),
- index=lrange(int(1000 * 1000)),
- columns=['E%03d' % i for i in range(60)])
+ df = DataFrame(np.random.randn(1000 * 1000, 60), index=range(int(
+ 1000 * 1000)), columns=['E%03d' % i for i in range(60)])
for x in range(20):
df['String%03d' % x] = 'string%03d' % x
for x in range(20):
df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0)
- print("\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f"
+ print ("\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f"
% (len(df.index), time.time() - start_time))
def f(chunksize):
@@ -1359,9 +1357,9 @@ def f(chunksize):
for c in [10000, 50000, 250000]:
start_time = time.time()
- print("big_table2 frame [chunk->%s]" % c)
+ print ("big_table2 frame [chunk->%s]" % c)
rows = f(c)
- print("big_table2 frame [rows->%s,chunk->%s] -> %5.2f"
+ print ("big_table2 frame [rows->%s,chunk->%s] -> %5.2f"
% (rows, c, time.time() - start_time))
def test_big_put_frame(self):
@@ -1370,23 +1368,23 @@ def test_big_put_frame(self):
print ("\nbig_put start")
import time
start_time = time.time()
- df = DataFrame(np.random.randn(1000 * 1000, 60), index=lrange(int(
+ df = DataFrame(np.random.randn(1000 * 1000, 60), index=range(int(
1000 * 1000)), columns=['E%03d' % i for i in range(60)])
for x in range(20):
df['String%03d' % x] = 'string%03d' % x
for x in range(20):
df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0)
- print("\nbig_put frame (creation of df) [rows->%s] -> %5.2f"
+ print ("\nbig_put frame (creation of df) [rows->%s] -> %5.2f"
% (len(df.index), time.time() - start_time))
with ensure_clean(self.path, mode='w') as store:
start_time = time.time()
- store = HDFStore(fn, mode='w')
+ store = HDFStore(self.path, mode='w')
store.put('df', df)
- print(df.get_dtype_counts())
- print("big_put frame [shape->%s] -> %5.2f"
+ print (df.get_dtype_counts())
+ print ("big_put frame [shape->%s] -> %5.2f"
% (df.shape, time.time() - start_time))
def test_big_table_panel(self):
@@ -1410,8 +1408,9 @@ def test_big_table_panel(self):
store.append('wp', wp)
rows = store.root.wp.table.nrows
recons = store.select('wp')
+ assert isinstance(recons, Panel)
- print("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x))
+ print ("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x))
def test_append_diff_item_order(self):
@@ -1654,7 +1653,6 @@ def test_table_values_dtypes_roundtrip(self):
expected.sort()
tm.assert_series_equal(result,expected)
-
def test_table_mixed_dtypes(self):
# frame
@@ -1713,7 +1711,7 @@ def test_unimplemented_dtypes_table_columns(self):
# py3 ok for unicode
if not compat.PY3:
- l.append(('unicode', u('\u03c3')))
+ l.append(('unicode', u('\\u03c3')))
### currently not supported dtypes ####
for n, f in l:
@@ -1759,17 +1757,17 @@ def compare(a,b):
assert_frame_equal(result,df)
# select with tz aware
- compare(store.select('df_tz',where=Term('A','>=',df.A[3])),df[df.A>=df.A[3]])
+ compare(store.select('df_tz',where=Term('A>=df.A[3]')),df[df.A>=df.A[3]])
_maybe_remove(store, 'df_tz')
- df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130103',tz='US/Eastern')),index=lrange(5))
+ df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130103',tz='US/Eastern')),index=range(5))
store.append('df_tz',df)
result = store['df_tz']
compare(result,df)
assert_frame_equal(result,df)
_maybe_remove(store, 'df_tz')
- df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=lrange(5))
+ df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=range(5))
self.assertRaises(TypeError, store.append, 'df_tz', df)
# this is ok
@@ -1780,7 +1778,7 @@ def compare(a,b):
assert_frame_equal(result,df)
# can't append with diff timezone
- df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=range(5))
+ df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=lrange(5))
self.assertRaises(ValueError, store.append, 'df_tz', df)
# as index
@@ -1866,16 +1864,16 @@ def test_append_with_timedelta(self):
result = store.select('df',Term("C","<",-3*86400))
assert_frame_equal(result,df.iloc[3:])
- result = store.select('df',Term("C","<",'-3D'))
+ result = store.select('df',"C<'-3D'")
assert_frame_equal(result,df.iloc[3:])
# a bit hacky here as we don't really deal with the NaT properly
- result = store.select('df',Term("C","<",'-500000s'))
+ result = store.select('df',"C<'-500000s'")
result = result.dropna(subset=['C'])
assert_frame_equal(result,df.iloc[6:])
- result = store.select('df',Term("C","<",'-3.5D'))
+ result = store.select('df',"C<'-3.5D'")
result = result.iloc[1:]
assert_frame_equal(result,df.iloc[4:])
@@ -1927,14 +1925,14 @@ def test_remove_where(self):
with ensure_clean(self.path) as store:
# non-existence
- crit1 = Term('index', '>', 'foo')
+ crit1 = Term('index>foo')
self.assertRaises(KeyError, store.remove, 'a', [crit1])
# try to remove non-table (with crit)
# non-table ok (where = None)
wp = tm.makePanel()
- store.put('wp', wp, format='t')
- store.remove('wp', [('minor_axis', ['A', 'D'])])
+ store.put('wp', wp, format='table')
+ store.remove('wp', ["minor_axis=['A', 'D']"])
rs = store.select('wp')
expected = wp.reindex(minor_axis=['B', 'C'])
assert_panel_equal(rs, expected)
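The hunks above and below all exercise the same migration: the old Term(field, op, value) argument form becomes a single parsed expression string. A minimal sketch of the two spellings, assuming an open HDFStore `store` holding an appended table 'df' whose column 'A' was written as a data column:

    # assumes: store = HDFStore(path) with a table 'df' and data column 'A'
    result = store.select('df', [Term('A', '>', 0)])   # old argument form (now deprecated)
    result = store.select('df', [Term('A > 0')])       # new expression-string form
    result = store.select('df', 'A > 0')               # bare strings are accepted too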
@@ -1966,8 +1964,8 @@ def test_remove_crit(self):
# group row removal
date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10])
- crit4 = Term('major_axis', date4)
- store.put('wp3', wp, format='t')
+ crit4 = Term('major_axis=date4')
+ store.put('wp3', wp, format='table')
n = store.remove('wp3', where=[crit4])
assert(n == 36)
result = store.select('wp3')
@@ -1978,8 +1976,8 @@ def test_remove_crit(self):
store.put('wp', wp, format='table')
date = wp.major_axis[len(wp.major_axis) // 2]
- crit1 = Term('major_axis', '>', date)
- crit2 = Term('minor_axis', ['A', 'D'])
+ crit1 = Term('major_axis>date')
+ crit2 = Term("minor_axis=['A', 'D']")
n = store.remove('wp', where=[crit1])
assert(n == 56)
@@ -1995,14 +1993,14 @@ def test_remove_crit(self):
store.put('wp2', wp, format='table')
date1 = wp.major_axis[1:3]
- crit1 = Term('major_axis', date1)
+ crit1 = Term('major_axis=date1')
store.remove('wp2', where=[crit1])
result = store.select('wp2')
expected = wp.reindex(major_axis=wp.major_axis - date1)
assert_panel_equal(result, expected)
date2 = wp.major_axis[5]
- crit2 = Term('major_axis', date2)
+ crit2 = Term('major_axis=date2')
store.remove('wp2', where=[crit2])
result = store['wp2']
expected = wp.reindex(
@@ -2010,7 +2008,7 @@ def test_remove_crit(self):
assert_panel_equal(result, expected)
date3 = [wp.major_axis[7], wp.major_axis[9]]
- crit3 = Term('major_axis', date3)
+ crit3 = Term('major_axis=date3')
store.remove('wp2', where=[crit3])
result = store['wp2']
expected = wp.reindex(
@@ -2020,62 +2018,102 @@ def test_remove_crit(self):
# corners
store.put('wp4', wp, format='table')
n = store.remove(
- 'wp4', where=[Term('major_axis', '>', wp.major_axis[-1])])
+ 'wp4', where=[Term('major_axis>wp.major_axis[-1]')])
result = store.select('wp4')
assert_panel_equal(result, wp)
- def test_terms(self):
+ def test_invalid_terms(self):
with ensure_clean(self.path) as store:
+ df = tm.makeTimeDataFrame()
+ df['string'] = 'foo'
+ df.ix[0:4,'string'] = 'bar'
wp = tm.makePanel()
p4d = tm.makePanel4D()
+ store.put('df', df, format='table')
store.put('wp', wp, format='table')
store.put('p4d', p4d, format='table')
# some invalid terms
- terms = [
- ['minor', ['A', 'B']],
- ['index', ['20121114']],
- ['index', ['20121114', '20121114']],
- ]
- for t in terms:
- self.assertRaises(Exception, store.select, 'wp', t)
+ self.assertRaises(ValueError, store.select, 'wp', "minor=['A', 'B']")
+ self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114']"])
+ self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114', '20121114']"])
+ self.assertRaises(TypeError, Term)
- self.assertRaises(Exception, Term.__init__)
- self.assertRaises(Exception, Term.__init__, 'blah')
- self.assertRaises(Exception, Term.__init__, 'index')
- self.assertRaises(Exception, Term.__init__, 'index', '==')
- self.assertRaises(Exception, Term.__init__, 'index', '>', 5)
+ # more invalid
+ self.assertRaises(ValueError, store.select, 'df','df.index[3]')
+ self.assertRaises(SyntaxError, store.select, 'df','index>')
+ self.assertRaises(ValueError, store.select, 'wp', "major_axis<'20000108' & minor_axis['A', 'B']")
+
+ # from the docs
+ with tm.ensure_clean(self.path) as path:
+ dfq = DataFrame(np.random.randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10))
+ dfq.to_hdf(path,'dfq',format='table',data_columns=True)
+
+ # check ok
+ read_hdf(path,'dfq',where="index>Timestamp('20130104') & columns=['A', 'B']")
+ read_hdf(path,'dfq',where="A>0 or C>0")
+
+ # catch the invalid reference
+ with tm.ensure_clean(self.path) as path:
+ dfq = DataFrame(np.random.randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10))
+ dfq.to_hdf(path,'dfq',format='table')
+
+ self.assertRaises(ValueError, read_hdf, path,'dfq',where="A>0 or C>0")
+
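+ # The two blocks above turn on data_columns: a where expression can only
+ # reference the index or columns that were written as data columns. A
+ # minimal sketch of the distinction, assuming a writable file at `path`:
+ #
+ #     df = DataFrame(np.random.randn(10, 4), columns=list('ABCD'),
+ #                    index=date_range('20130101', periods=10))
+ #
+ #     df.to_hdf(path, 'dfq', format='table', data_columns=True)
+ #     read_hdf(path, 'dfq', where='A > 0')   # ok, 'A' is indexed on disk
+ #
+ #     df.to_hdf(path, 'dfq2', format='table')
+ #     read_hdf(path, 'dfq2', where='A > 0')  # ValueError, no data columns
+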
+ def test_terms(self):
+
+ with ensure_clean(self.path) as store:
+
+ wp = tm.makePanel()
+ p4d = tm.makePanel4D()
+ store.put('wp', wp, format='table')
+ store.put('p4d', p4d, format='table')
# panel
result = store.select('wp', [Term(
- 'major_axis<20000108'), Term('minor_axis', '=', ['A', 'B'])])
+ 'major_axis<"20000108"'), Term("minor_axis=['A', 'B']")])
expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
assert_panel_equal(result, expected)
+ # with deprecation
+ result = store.select('wp', [Term(
+ 'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")])
+ expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
+ tm.assert_panel_equal(result, expected)
+
# p4d
- result = store.select('p4d', [Term('major_axis<20000108'),
- Term('minor_axis', '=', ['A', 'B']),
- Term('items', '=', ['ItemA', 'ItemB'])])
+ result = store.select('p4d', [Term('major_axis<"20000108"'),
+ Term("minor_axis=['A', 'B']"),
+ Term("items=['ItemA', 'ItemB']")])
expected = p4d.truncate(after='20000108').reindex(
minor=['A', 'B'], items=['ItemA', 'ItemB'])
assert_panel4d_equal(result, expected)
- # valid terms
+ # back compat invalid terms
terms = [
dict(field='major_axis', op='>', value='20121114'),
- ('major_axis', '20121114'),
- ('major_axis', '>', '20121114'),
- (('major_axis', ['20121114', '20121114']),),
- ('major_axis', datetime.datetime(2012, 11, 14)),
+ [ dict(field='major_axis', op='>', value='20121114') ],
+ [ "minor_axis=['A','B']", dict(field='major_axis', op='>', value='20121114') ]
+ ]
+ for t in terms:
+ with tm.assert_produces_warning(expected_warning=DeprecationWarning):
+ Term(t)
+
+ # valid terms
+ terms = [
+ ('major_axis=20121114'),
+ ('major_axis>20121114'),
+ (("major_axis=['20121114', '20121114']"),),
+ ('major_axis=datetime.datetime(2012, 11, 14)'),
'major_axis> 20121114',
'major_axis >20121114',
'major_axis > 20121114',
- (('minor_axis', ['A', 'B']),),
- (('minor_axis', ['A', 'B']),),
- ((('minor_axis', ['A', 'B']),),),
- (('items', ['ItemA', 'ItemB']),),
+ (("minor_axis=['A', 'B']"),),
+ (("minor_axis=['A', 'B']"),),
+ ((("minor_axis==['A', 'B']"),),),
+ (("items=['ItemA', 'ItemB']"),),
('items=ItemA'),
]
@@ -2085,13 +2123,53 @@ def test_terms(self):
# valid for p4d only
terms = [
- (('labels', '=', ['l1', 'l2']),),
- Term('labels', '=', ['l1', 'l2']),
+ (("labels=['l1', 'l2']"),),
+ Term("labels=['l1', 'l2']"),
]
for t in terms:
store.select('p4d', t)
+ def test_term_compat(self):
+ with ensure_clean(self.path) as store:
+
+ wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
+ major_axis=date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B', 'C', 'D'])
+ store.append('wp',wp)
+
+ result = store.select('wp', [Term('major_axis>20000102'),
+ Term('minor_axis', '=', ['A','B']) ])
+ expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']]
+ assert_panel_equal(result, expected)
+
+ store.remove('wp', Term('major_axis>20000103'))
+ result = store.select('wp')
+ expected = wp.loc[:,wp.major_axis<=Timestamp('20000103'),:]
+ assert_panel_equal(result, expected)
+
+ def test_same_name_scoping(self):
+
+ with ensure_clean(self.path) as store:
+
+ import pandas as pd
+ df = DataFrame(np.random.randn(20, 2),index=pd.date_range('20130101',periods=20))
+ store.put('df', df, format='table')
+ expected = df[df.index>pd.Timestamp('20130105')]
+
+ import datetime
+ result = store.select('df','index>datetime.datetime(2013,1,5)')
+ assert_frame_equal(result,expected)
+
+ from datetime import datetime
+
+ # technically an error, but allow it
+ result = store.select('df','index>datetime.datetime(2013,1,5)')
+ assert_frame_equal(result,expected)
+
+ result = store.select('df','index>datetime(2013,1,5)')
+ assert_frame_equal(result,expected)
+
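test_same_name_scoping works because free names in a where string are resolved from the calling frame when the expression is parsed, the same mechanism that picks up `df.index[3]` and `selector` elsewhere in this file. A minimal sketch, assuming an open store with a date-indexed table 'df':

    from datetime import datetime

    cutoff = datetime(2013, 1, 5)
    # 'cutoff' is looked up in the caller's namespace by the where parser
    result = store.select('df', 'index > cutoff')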
def test_series(self):
s = tm.makeStringSeries()
@@ -2211,7 +2289,7 @@ def test_index_types(self):
self._check_roundtrip(ser, func)
ser = Series(values, [datetime.datetime(
- 2012, 1, 1), datetime.datetime(2012, 1, 2)])
+ 2012, 1, 1), datetime.datetime(2012, 1, 2)])
self._check_roundtrip(ser, func)
def test_timeseries_preepoch(self):
@@ -2525,7 +2603,7 @@ def test_select(self):
_maybe_remove(store, 'wp')
store.append('wp', wp)
items = ['Item%03d' % i for i in range(80)]
- result = store.select('wp', Term('items', items))
+ result = store.select('wp', Term('items=items'))
expected = wp.reindex(items=items)
assert_panel_equal(expected, result)
@@ -2542,7 +2620,7 @@ def test_select(self):
tm.assert_frame_equal(expected, result)
# equivalently
- result = store.select('df', [('columns', ['A', 'B'])])
+ result = store.select('df', [("columns=['A', 'B']")])
expected = df.reindex(columns=['A', 'B'])
tm.assert_frame_equal(expected, result)
@@ -2575,7 +2653,8 @@ def test_select_dtypes(self):
df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300)))
_maybe_remove(store, 'df')
store.append('df', df, data_columns=['ts', 'A'])
- result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01'))])
+
+ result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")])
expected = df[df.ts >= Timestamp('2012-02-01')]
tm.assert_frame_equal(expected, result)
@@ -2602,7 +2681,7 @@ def test_select_dtypes(self):
_maybe_remove(store, 'df_int')
store.append('df_int', df)
result = store.select(
- 'df_int', [Term("index<10"), Term("columns", "=", ["A"])])
+ 'df_int', [Term("index<10"), Term("columns=['A']")])
expected = df.reindex(index=list(df.index)[0:10],columns=['A'])
tm.assert_frame_equal(expected, result)
@@ -2612,7 +2691,7 @@ def test_select_dtypes(self):
_maybe_remove(store, 'df_float')
store.append('df_float', df)
result = store.select(
- 'df_float', [Term("index<10.0"), Term("columns", "=", ["A"])])
+ 'df_float', [Term("index<10.0"), Term("columns=['A']")])
expected = df.reindex(index=list(df.index)[0:10],columns=['A'])
tm.assert_frame_equal(expected, result)
@@ -2622,36 +2701,36 @@ def test_select_with_many_inputs(self):
df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300),
A=np.random.randn(300),
- B=range(300),
+ B=lrange(300),
users = ['a']*50 + ['b']*50 + ['c']*100 + ['a%03d' % i for i in range(100)]))
_maybe_remove(store, 'df')
store.append('df', df, data_columns=['ts', 'A', 'B', 'users'])
# regular select
- result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01'))])
+ result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")])
expected = df[df.ts >= Timestamp('2012-02-01')]
tm.assert_frame_equal(expected, result)
# small selector
- result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01')),Term('users',['a','b','c'])])
+ result = store.select('df', [Term("ts>=Timestamp('2012-02-01') & users=['a','b','c']")])
expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(['a','b','c']) ]
tm.assert_frame_equal(expected, result)
# big selector along the columns
selector = [ 'a','b','c' ] + [ 'a%03d' % i for i in range(60) ]
- result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01')),Term('users',selector)])
+ result = store.select('df', [Term("ts>=Timestamp('2012-02-01')"),Term('users=selector')])
expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(selector) ]
tm.assert_frame_equal(expected, result)
selector = lrange(100,200)
- result = store.select('df', [Term('B', selector)])
+ result = store.select('df', [Term('B=selector')])
expected = df[ df.B.isin(selector) ]
tm.assert_frame_equal(expected, result)
self.assert_(len(result) == 100)
# big selector along the index
selector = Index(df.ts[0:100].values)
- result = store.select('df', [Term('ts', selector)])
+ result = store.select('df', [Term('ts=selector')])
expected = df[ df.ts.isin(selector.values) ]
tm.assert_frame_equal(expected, result)
self.assert_(len(result) == 100)
@@ -2807,15 +2886,15 @@ def test_panel_select(self):
store.put('wp', wp, format='table')
date = wp.major_axis[len(wp.major_axis) // 2]
- crit1 = ('major_axis', '>=', date)
- crit2 = ('minor_axis', '=', ['A', 'D'])
+ crit1 = ('major_axis>=date')
+ crit2 = ("minor_axis=['A', 'D']")
result = store.select('wp', [crit1, crit2])
expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
assert_panel_equal(result, expected)
result = store.select(
- 'wp', ['major_axis>=20000124', ('minor_axis', '=', ['A', 'B'])])
+ 'wp', ['major_axis>="20000124"', ("minor_axis=['A', 'B']")])
expected = wp.truncate(before='20000124').reindex(minor=['A', 'B'])
assert_panel_equal(result, expected)
@@ -2827,9 +2906,9 @@ def test_frame_select(self):
store.put('frame', df,format='table')
date = df.index[len(df) // 2]
- crit1 = ('index', '>=', date)
- crit2 = ('columns', ['A', 'D'])
- crit3 = ('columns', 'A')
+ crit1 = Term('index>=date')
+ crit2 = ("columns=['A', 'D']")
+ crit3 = ('columns=A')
result = store.select('frame', [crit1, crit2])
expected = df.ix[date:, ['A', 'D']]
@@ -2850,6 +2929,67 @@ def test_frame_select(self):
# self.assertRaises(ValueError, store.select,
# 'frame', [crit1, crit2])
+ def test_frame_select_complex(self):
+ """ select via complex criteria """
+
+ df = tm.makeTimeDataFrame()
+ df['string'] = 'foo'
+ df.loc[df.index[0:4],'string'] = 'bar'
+
+ with ensure_clean(self.path) as store:
+ store.put('df', df, format='table', data_columns=['string'])
+
+ # empty
+ result = store.select('df', 'index>df.index[3] & string="bar"')
+ expected = df.loc[(df.index>df.index[3]) & (df.string=='bar')]
+ tm.assert_frame_equal(result, expected)
+
+ result = store.select('df', 'index>df.index[3] & string="foo"')
+ expected = df.loc[(df.index>df.index[3]) & (df.string=='foo')]
+ tm.assert_frame_equal(result, expected)
+
+ # or
+ result = store.select('df', 'index>df.index[3] | string="bar"')
+ expected = df.loc[(df.index>df.index[3]) | (df.string=='bar')]
+ tm.assert_frame_equal(result, expected)
+
+ result = store.select('df', '(index>df.index[3] & index<=df.index[6]) | string="bar"')
+ expected = df.loc[((df.index>df.index[3]) & (df.index<=df.index[6])) | (df.string=='bar')]
+ tm.assert_frame_equal(result, expected)
+
+ # invert
+ result = store.select('df', 'string!="bar"')
+ expected = df.loc[df.string!='bar']
+ tm.assert_frame_equal(result, expected)
+
+ # invert not implemented in numexpr :(
+ self.assertRaises(NotImplementedError, store.select, 'df', '~(string="bar")')
+
+ # invert ok for filters
+ result = store.select('df', "~(columns=['A','B'])")
+ expected = df.loc[:,df.columns-['A','B']]
+ tm.assert_frame_equal(result, expected)
+
+ # in
+ result = store.select('df', "index>df.index[3] & columns in ['A','B']")
+ expected = df.loc[df.index>df.index[3]].reindex(columns=['A','B'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_invalid_filtering(self):
+
+ # can't use more than one filter (atm)
+
+ df = tm.makeTimeDataFrame()
+
+ with ensure_clean(self.path) as store:
+ store.put('df', df, format='table')
+
+ # not implemented
+ self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A'] | columns=['B']")
+
+ # in theory we could deal with this
+ self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A','B'] & columns=['C']")
+
def test_string_select(self):
# GH 2973
@@ -2898,7 +3038,6 @@ def test_string_select(self):
expected = df[df.int!=2]
assert_frame_equal(result,expected)
-
def test_read_column(self):
df = tm.makeTimeDataFrame()
@@ -2917,7 +3056,7 @@ def f():
# valid
result = store.select_column('df', 'index')
tm.assert_almost_equal(result.values, Series(df.index).values)
- self.assert_(isinstance(result,Series))
+ tm.assert_isinstance(result,Series)
# not a data indexable column
self.assertRaises(
@@ -3116,18 +3255,11 @@ def test_select_as_multiple(self):
tm.assert_frame_equal(result, expected)
# multiple (diff selector)
- try:
- result = store.select_as_multiple(['df1', 'df2'], where=[Term(
- 'index', '>', df2.index[4])], selector='df2')
- expected = concat([df1, df2], axis=1)
- expected = expected[5:]
- tm.assert_frame_equal(result, expected)
- except (Exception) as detail:
- print("error in select_as_multiple %s" % str(detail))
- print("store: %s" % store)
- print("df1: %s" % df1)
- print("df2: %s" % df2)
-
+ result = store.select_as_multiple(['df1', 'df2'], where=[Term(
+ 'index>df2.index[4]')], selector='df2')
+ expected = concat([df1, df2], axis=1)
+ expected = expected[5:]
+ tm.assert_frame_equal(result, expected)
# test exception for diff rows
store.append('df3', tm.makeTimeDataFrame(nper=50))
@@ -3142,15 +3274,15 @@ def test_start_stop(self):
store.append('df', df)
result = store.select(
- 'df', [Term("columns", "=", ["A"])], start=0, stop=5)
+ 'df', [Term("columns=['A']")], start=0, stop=5)
expected = df.ix[0:4, ['A']]
tm.assert_frame_equal(result, expected)
# out of range
result = store.select(
- 'df', [Term("columns", "=", ["A"])], start=30, stop=40)
+ 'df', [Term("columns=['A']")], start=30, stop=40)
assert(len(result) == 0)
- assert(type(result) == DataFrame)
+ tm.assert_isinstance(result, DataFrame)
def test_select_filter_corner(self):
@@ -3161,7 +3293,7 @@ def test_select_filter_corner(self):
with ensure_clean(self.path) as store:
store.put('frame', df, format='table')
- crit = Term('columns', df.columns[:75])
+ crit = Term('columns=df.columns[:75]')
result = store.select('frame', [crit])
tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])
@@ -3190,7 +3322,6 @@ def _check_double_roundtrip(self, obj, comparator, compression=False,
again = store['obj']
comparator(again, obj, **kwargs)
-
def _check_roundtrip_table(self, obj, comparator, compression=False):
options = {}
if compression:
@@ -3296,6 +3427,7 @@ def test_pytables_native_read(self):
try:
store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native.h5'), 'r')
d2 = store['detector/readout']
+ assert isinstance(d2, DataFrame)
finally:
safe_close(store)
@@ -3303,6 +3435,7 @@ def test_pytables_native_read(self):
store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native2.h5'), 'r')
str(store)
d1 = store['detector']
+ assert isinstance(d1, DataFrame)
finally:
safe_close(store)
@@ -3330,11 +3463,12 @@ def test_legacy_table_read(self):
# old version warning
with tm.assert_produces_warning(expected_warning=IncompatibilityWarning):
self.assertRaises(
- Exception, store.select, 'wp1', Term('minor_axis', '=', 'B'))
+ Exception, store.select, 'wp1', Term('minor_axis=B'))
- with tm.assert_produces_warning(expected_warning=IncompatibilityWarning):
df2 = store.select('df2')
- store.select('df2', Term('index', '>', df2.index[2]))
+ result = store.select('df2', Term('index>df2.index[2]'))
+ expected = df2[df2.index > df2.index[2]]
+ assert_frame_equal(expected, result)
finally:
safe_close(store)
@@ -3352,11 +3486,18 @@ def test_legacy_0_10_read(self):
def test_legacy_0_11_read(self):
# legacy from 0.11
try:
- store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_0.11.h5'), 'r')
+ path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5')
+ store = HDFStore(tm.get_data_path(path), 'r')
str(store)
+ assert 'df' in store
+ assert 'df1' in store
+ assert 'mi' in store
df = store.select('df')
df1 = store.select('df1')
mi = store.select('mi')
+ assert isinstance(df, DataFrame)
+ assert isinstance(df1, DataFrame)
+ assert isinstance(mi, DataFrame)
finally:
safe_close(store)
@@ -3364,10 +3505,9 @@ def test_copy(self):
def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs):
try:
- import os
-
if f is None:
- f = tm.get_data_path('legacy_hdf/legacy_0.10.h5')
+ f = tm.get_data_path(os.path.join('legacy_hdf',
+ 'legacy_0.10.h5'))
store = HDFStore(f, 'r')
@@ -3380,7 +3520,7 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs):
# check keys
if keys is None:
- keys = list(store.keys())
+ keys = store.keys()
self.assert_(set(keys) == set(tstore.keys()))
# check indicies & nrows
@@ -3437,6 +3577,7 @@ def test_legacy_table_write(self):
df = DataFrame(dict(A = 'foo', B = 'bar'),index=lrange(10))
store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 })
+ store.append('wp', wp)
store.close()
@@ -3524,6 +3665,7 @@ def _test_sort(obj):
else:
raise ValueError('type not supported here')
+
if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index e2051eba7f42a..8c5764a3f59a6 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -4,15 +4,13 @@
import nose
from nose.tools import assert_equal
-import unittest
import numpy as np
from pandas.tslib import iNaT
from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp
-import pandas.compat as compat
+from pandas import compat
from pandas.compat import range, long, lrange, lmap, u
from pandas.core.common import notnull, isnull
-import pandas.compat as compat
import pandas.core.common as com
import pandas.util.testing as tm
import pandas.core.config as cf
@@ -42,6 +40,7 @@ def __getitem__(self):
assert(not is_seq(A()))
+
def test_notnull():
assert notnull(1.)
assert not notnull(None)
@@ -121,11 +120,13 @@ def test_isnull_datetime():
assert(mask[0])
assert(not mask[1:].any())
+
def test_datetimeindex_from_empty_datetime64_array():
for unit in [ 'ms', 'us', 'ns' ]:
idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit))
assert(len(idx) == 0)
+
def test_nan_to_nat_conversions():
df = DataFrame(dict({
@@ -144,6 +145,7 @@ def test_nan_to_nat_conversions():
if LooseVersion(np.__version__) >= '1.7.0':
assert(s[8].value == np.datetime64('NaT').astype(np.int64))
+
def test_any_none():
assert(com._any_none(1, 2, 3, None))
assert(not com._any_none(1, 2, 3, 4))
@@ -308,6 +310,7 @@ def test_ensure_int32():
result = com._ensure_int32(values)
assert(result.dtype == np.int32)
+
def test_ensure_platform_int():
# verify that when we create certain types of indices
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
index ff76c7c070946..f81620b897a4a 100644
--- a/pandas/tests/test_expressions.py
+++ b/pandas/tests/test_expressions.py
@@ -4,31 +4,25 @@
import unittest
import nose
-import operator
-from numpy import random, nan
from numpy.random import randn
+
+import operator
import numpy as np
from numpy.testing import assert_array_equal
-import pandas as pan
-from pandas.core.api import DataFrame, Series, notnull, isnull
-from pandas.core import expressions as expr
+from pandas.core.api import DataFrame
+from pandas.computation import expressions as expr
-from pandas.util.testing import (assert_almost_equal,
- assert_series_equal,
- assert_frame_equal)
+from pandas.util.testing import assert_series_equal, assert_frame_equal
from pandas import compat
-import pandas.util.testing as tm
-import pandas.lib as lib
-
-from numpy.testing.decorators import slow
if not expr._USE_NUMEXPR:
- raise nose.SkipTest
+ raise nose.SkipTest("numexpr not available")
+
-_frame = DataFrame(np.random.randn(10000, 4), columns = list('ABCD'), dtype='float64')
-_frame2 = DataFrame(np.random.randn(100, 4), columns = list('ABCD'), dtype='float64')
+_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64')
+_frame2 = DataFrame(randn(100, 4), columns = list('ABCD'), dtype='float64')
_mixed = DataFrame({ 'A' : _frame['A'].copy(), 'B' : _frame['B'].astype('float32'), 'C' : _frame['C'].astype('int64'), 'D' : _frame['D'].astype('int32') })
_mixed2 = DataFrame({ 'A' : _frame2['A'].copy(), 'B' : _frame2['B'].astype('float32'), 'C' : _frame2['C'].astype('int64'), 'D' : _frame2['D'].astype('int32') })
_integer = DataFrame(np.random.randint(1, 100, size=(10001, 4)), columns = list('ABCD'), dtype='int64')
@@ -128,11 +122,11 @@ def testit():
result = expr.evaluate(op, op_str, f, f, use_numexpr=True)
expected = expr.evaluate(op, op_str, f, f, use_numexpr=False)
assert_array_equal(result,expected.values)
-
+
result = expr._can_use_numexpr(op, op_str, f2, f2, 'evaluate')
self.assert_(result == False)
-
+
expr.set_use_numexpr(False)
testit()
expr.set_use_numexpr(True)
@@ -149,7 +143,7 @@ def testit():
f11 = f
f12 = f + 1
-
+
f21 = f2
f22 = f2 + 1
@@ -163,7 +157,7 @@ def testit():
result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True)
expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False)
assert_array_equal(result,expected.values)
-
+
result = expr._can_use_numexpr(op, op_str, f21, f22, 'evaluate')
self.assert_(result == False)
@@ -180,7 +174,7 @@ def test_where(self):
def testit():
for f in [ self.frame, self.frame2, self.mixed, self.mixed2 ]:
-
+
for cond in [ True, False ]:
c = np.empty(f.shape,dtype=np.bool_)
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index a5c1941a7f2d3..423707e0016d8 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -11,6 +11,8 @@
import nose
import functools
import itertools
+from itertools import product
+
from pandas.compat import(
map, zip, range, long, lrange, lmap, lzip,
OrderedDict, cPickle as pickle, u, StringIO
@@ -18,7 +20,7 @@
from pandas import compat
from numpy import random, nan
-from numpy.random import randn
+from numpy.random import randn, rand
import numpy as np
import numpy.ma as ma
from numpy.testing import assert_array_equal
@@ -30,7 +32,7 @@
import pandas.core.format as fmt
import pandas.core.datetools as datetools
from pandas.core.api import (DataFrame, Index, Series, notnull, isnull,
- MultiIndex, DatetimeIndex, Timestamp, Period)
+ MultiIndex, DatetimeIndex, Timestamp)
from pandas import date_range
import pandas as pd
from pandas.io.parsers import read_csv
@@ -40,10 +42,14 @@
assert_series_equal,
assert_frame_equal,
assertRaisesRegexp,
+ assertRaises,
makeCustomDataframe as mkdf,
ensure_clean)
from pandas.core.indexing import IndexingError
from pandas.core.common import PandasError
+from pandas.compat import OrderedDict
+from pandas.computation.expr import Expr
+import pandas.computation as comp
import pandas.util.testing as tm
import pandas.lib as lib
@@ -81,6 +87,7 @@ def _check_mixed_float(df, dtype = None):
if dtypes.get('D'):
assert(df.dtypes['D'] == dtypes['D'])
+
def _check_mixed_int(df, dtype = None):
dtypes = dict(A = 'int32', B = 'uint64', C = 'uint8', D = 'int64')
if isinstance(dtype, compat.string_types):
@@ -97,8 +104,6 @@ def _check_mixed_int(df, dtype = None):
assert(df.dtypes['D'] == dtypes['D'])
-
-
class CheckIndexing(object):
_multiprocess_can_split_ = True
@@ -122,6 +127,14 @@ def test_getitem(self):
with assertRaisesRegexp(KeyError, 'no item named random'):
self.frame['random']
+ df = self.frame.copy()
+ df['$10'] = randn(len(df))
+ ad = randn(len(df))
+ df['@awesome_domain'] = ad
+ self.assertRaises(KeyError, df.__getitem__, 'df["$10"]')
+ res = df['@awesome_domain']
+ assert_array_equal(ad, res.values)
+
def test_getitem_dupe_cols(self):
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
try:
@@ -2119,7 +2132,6 @@ def test_constructor_cast_failure(self):
# this is ok
df['foo2'] = np.ones((4,2)).tolist()
-
def test_constructor_dtype_nocast_view(self):
df = DataFrame([[1, 2]])
should_be_view = DataFrame(df, dtype=df[0].dtype)
@@ -3166,7 +3178,6 @@ def test_constructor_single_value(self):
with tm.assertRaisesRegexp(TypeError, 'incompatible data and dtype'):
DataFrame('a', [1, 2], ['a', 'c'], float)
-
def test_constructor_with_datetimes(self):
intname = np.dtype(np.int_).name
floatname = np.dtype(np.float_).name
@@ -5238,8 +5249,6 @@ def make_dtnat_arr(n,nnat=None):
_do_test(mkdf(nrows, ncols,r_idx_nlevels=2,c_idx_nlevels=2),
path,rnlvl=2,cnlvl=2)
-
-
def test_to_csv_from_csv_w_some_infs(self):
# test roundtrip with inf, -inf, nan, as full columns and mix
@@ -8100,6 +8109,7 @@ def test_mask_edge_case_1xN_frame(self):
#----------------------------------------------------------------------
# Transposing
+
def test_transpose(self):
frame = self.frame
dft = frame.T
@@ -8228,7 +8238,6 @@ def test_diff(self):
assert_series_equal(the_diff['A'],
tf['A'] - tf['A'].shift(1))
-
def test_diff_mixed_dtype(self):
df = DataFrame(np.random.randn(5, 3))
df['A'] = np.array([1, 2, 3, 4, 5], dtype=object)
@@ -10137,7 +10146,6 @@ def test_unstack_dtypes(self):
expected = Series({'float64' : 2, 'object' : 2})
assert_series_equal(result, expected)
-
def test_reset_index(self):
stacked = self.frame.stack()[::2]
stacked = DataFrame({'foo': stacked, 'bar': stacked})
@@ -11106,10 +11114,632 @@ def test_isin_with_string_scalar(self):
with tm.assertRaises(TypeError):
df.isin('aaa')
+
+def skip_if_no_ne(engine='numexpr'):
+ if engine == 'numexpr':
+ try:
+ import numexpr as ne
+ except ImportError:
+ raise nose.SkipTest("cannot query engine numexpr when numexpr not "
+ "installed")
+
+
+def skip_if_no_pandas_parser(parser):
+ if parser != 'pandas':
+ raise nose.SkipTest("cannot evaluate with parser {0!r}".format(parser))
+
+
+class TestDataFrameQueryWithMultiIndex(object):
+ def check_query_with_named_multiindex(self, parser, engine):
+ skip_if_no_ne(engine)
+ a = tm.choice(['red', 'green'], size=10)
+ b = tm.choice(['eggs', 'ham'], size=10)
+ index = MultiIndex.from_arrays([a, b], names=['color', 'food'])
+ df = DataFrame(randn(10, 2), index=index)
+ ind = Series(df.index.get_level_values('color').values, index=index,
+ name='color')
+
+ # equality
+ res1 = df.query('color == "red"', parser=parser, engine=engine)
+ res2 = df.query('"red" == color', parser=parser, engine=engine)
+ exp = df[ind == 'red']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # inequality
+ res1 = df.query('color != "red"', parser=parser, engine=engine)
+ res2 = df.query('"red" != color', parser=parser, engine=engine)
+ exp = df[ind != 'red']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # list equality (really just set membership)
+ res1 = df.query('color == ["red"]', parser=parser, engine=engine)
+ res2 = df.query('["red"] == color', parser=parser, engine=engine)
+ exp = df[ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('color != ["red"]', parser=parser, engine=engine)
+ res2 = df.query('["red"] != color', parser=parser, engine=engine)
+ exp = df[~ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # in/not in ops
+ res1 = df.query('["red"] in color', parser=parser, engine=engine)
+ res2 = df.query('"red" in color', parser=parser, engine=engine)
+ exp = df[ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('["red"] not in color', parser=parser, engine=engine)
+ res2 = df.query('"red" not in color', parser=parser, engine=engine)
+ exp = df[~ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ def test_query_with_named_multiindex(self):
+ for parser, engine in product(['pandas'], ENGINES):
+ yield self.check_query_with_named_multiindex, parser, engine
+
+ def check_query_with_unnamed_multiindex(self, parser, engine):
+ skip_if_no_ne(engine)
+ a = tm.choice(['red', 'green'], size=10)
+ b = tm.choice(['eggs', 'ham'], size=10)
+ index = MultiIndex.from_arrays([a, b])
+ df = DataFrame(randn(10, 2), index=index)
+ ind = Series(df.index.get_level_values(0).values, index=index)
+
+ res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
+ res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
+ exp = df[ind == 'red']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # inequality
+ res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
+ res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine)
+ exp = df[ind != 'red']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # list equality (really just set membership)
+ res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine)
+ res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine)
+ exp = df[ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine)
+ res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine)
+ exp = df[~ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # in/not in ops
+ res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine)
+ res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine)
+ exp = df[ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine)
+ res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine)
+ exp = df[~ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ #### LEVEL 1 ####
+ ind = Series(df.index.get_level_values(1).values, index=index)
+ res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine)
+ res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine)
+ exp = df[ind == 'eggs']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # inequality
+ res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine)
+ res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine)
+ exp = df[ind != 'eggs']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # list equality (really just set membership)
+ res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine)
+ res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine)
+ exp = df[ind.isin(['eggs'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine)
+ res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine)
+ exp = df[~ind.isin(['eggs'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # in/not in ops
+ res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine)
+ res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine)
+ exp = df[ind.isin(['eggs'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine)
+ res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine)
+ exp = df[~ind.isin(['eggs'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ def test_query_with_unnamed_multiindex(self):
+ for parser, engine in product(['pandas'], ENGINES):
+ yield self.check_query_with_unnamed_multiindex, parser, engine
+
+ def check_query_with_partially_named_multiindex(self, parser, engine):
+ skip_if_no_ne(engine)
+ a = tm.choice(['red', 'green'], size=10)
+ b = np.arange(10)
+ index = MultiIndex.from_arrays([a, b])
+ index.names = [None, 'rating']
+ df = DataFrame(randn(10, 2), index=index)
+ res = df.query('rating == 1', parser=parser, engine=engine)
+ ind = Series(df.index.get_level_values('rating').values, index=index,
+ name='rating')
+ exp = df[ind == 1]
+ assert_frame_equal(res, exp)
+
+ res = df.query('rating != 1', parser=parser, engine=engine)
+ ind = Series(df.index.get_level_values('rating').values, index=index,
+ name='rating')
+ exp = df[ind != 1]
+ assert_frame_equal(res, exp)
+
+ res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
+ ind = Series(df.index.get_level_values(0).values, index=index)
+ exp = df[ind == "red"]
+ assert_frame_equal(res, exp)
+
+ res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
+ ind = Series(df.index.get_level_values(0).values, index=index)
+ exp = df[ind != "red"]
+ assert_frame_equal(res, exp)
+
+ def test_query_with_partially_named_multiindex(self):
+ for parser, engine in product(['pandas'], ENGINES):
+ yield self.check_query_with_partially_named_multiindex, parser, engine
+
+
+class TestDataFrameQueryNumExprPandas(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = 'numexpr'
+ cls.parser = 'pandas'
+ skip_if_no_ne()
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.engine, cls.parser
+
+ def test_date_query_method(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame(randn(5, 3))
+ df['dates1'] = date_range('1/1/2012', periods=5)
+ df['dates2'] = date_range('1/1/2013', periods=5)
+ df['dates3'] = date_range('1/1/2014', periods=5)
+ res = df.query('dates1 < 20130101 < dates3', engine=engine,
+ parser=parser)
+ expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_query_scope(self):
+ engine, parser = self.engine, self.parser
+ from pandas.computation.common import NameResolutionError
+
+ df = DataFrame({"i": lrange(10), "+": lrange(3, 13),
+ "r": lrange(4, 14)})
+ i, s = 5, 6
+ self.assertRaises(NameResolutionError, df.query, 'i < 5',
+ engine=engine, parser=parser, local_dict={'i': i})
+ self.assertRaises(SyntaxError, df.query, 'i - +', engine=engine,
+ parser=parser)
+ self.assertRaises(NameResolutionError, df.query, 'i == s',
+ engine=engine, parser=parser, local_dict={'i': i,
+ 's': s})
+
+ def test_query_scope_index(self):
+ engine, parser = self.engine, self.parser
+ from pandas.computation.common import NameResolutionError
+ df = DataFrame(np.random.randint(10, size=(10, 3)),
+ index=Index(range(10), name='blob'),
+ columns=['a', 'b', 'c'])
+ from numpy import sin
+ df.index.name = 'sin'
+ self.assertRaises(NameResolutionError, df.query, 'sin > 5',
+ engine=engine, parser=parser, local_dict={'sin':
+ sin})
+
+ def test_query(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])
+
+ assert_frame_equal(df.query('a < b', engine=engine, parser=parser),
+ df[df.a < df.b])
+ assert_frame_equal(df.query('a + b > b * c', engine=engine,
+ parser=parser),
+ df[df.a + df.b > df.b * df.c])
+
+ local_dict = dict(df.iteritems())
+ local_dict.update({'df': df})
+ self.assertRaises(NameError, df.query, 'a < d & b < f',
+ local_dict=local_dict, engine=engine, parser=parser)
+
+ # make sure that it's not just because we didn't pass the locals in
+ self.assertRaises(AssertionError, self.assertRaises, NameError,
+ df.query, 'a < b', local_dict={'df': df},
+ engine=engine, parser=parser)
+
+ def test_query_index_with_name(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame(np.random.randint(10, size=(10, 3)),
+ index=Index(range(10), name='blob'),
+ columns=['a', 'b', 'c'])
+ res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser)
+ expec = df[(df.index < 5) & (df.a < df.b)]
+ assert_frame_equal(res, expec)
+
+ res = df.query('blob < b', engine=engine, parser=parser)
+ expec = df[df.index < df.b]
+
+ assert_frame_equal(res, expec)
+
+ def test_query_index_without_name(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame(np.random.randint(10, size=(10, 3)),
+ index=range(10), columns=['a', 'b', 'c'])
+
+ # "index" should refer to the index
+ res = df.query('index < b', engine=engine, parser=parser)
+ expec = df[df.index < df.b]
+ assert_frame_equal(res, expec)
+
+ # test against a scalar
+ res = df.query('index < 5', engine=engine, parser=parser)
+ expec = df[df.index < 5]
+ assert_frame_equal(res, expec)
+
+ def test_nested_scope(self):
+ engine = self.engine
+ parser = self.parser
+ # smoke test
+ x = 1
+ result = pd.eval('x + 1', engine=engine, parser=parser)
+ self.assertEqual(result, 2)
+
+ df = DataFrame(np.random.randn(5, 3))
+ df2 = DataFrame(np.random.randn(5, 3))
+ expected = df[(df>0) & (df2>0)]
+
+ result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser)
+ assert_frame_equal(result, expected)
+
+ result = pd.eval('df[(df > 0) and (df2 > 0)]', engine=engine,
+ parser=parser)
+ assert_frame_equal(result, expected)
+
+ result = pd.eval('df[(df > 0) and (df2 > 0) and df[df > 0] > 0]',
+ engine=engine, parser=parser)
+ expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
+ assert_frame_equal(result, expected)
+
+ result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser)
+ expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser)
+ assert_frame_equal(result, expected)
+
+ def test_local_syntax(self):
+ skip_if_no_pandas_parser(self.parser)
+
+ from pandas.computation.common import NameResolutionError
+
+ engine, parser = self.engine, self.parser
+ df = DataFrame(randn(100, 10), columns=list('abcdefghij'))
+ b = 1
+ expect = df[df.a < b]
+ result = df.query('a < @b', engine=engine, parser=parser)
+ assert_frame_equal(result, expect)
+
+ # scope issue with self.assertRaises so just catch it and let it pass
+ try:
+ df.query('a < @b', engine=engine, parser=parser)
+ except NameResolutionError:
+ pass
+
+ del b
+ expect = df[df.a < df.b]
+ result = df.query('a < b', engine=engine, parser=parser)
+ assert_frame_equal(result, expect)
+
+ def test_chained_cmp_and_in(self):
+ skip_if_no_pandas_parser(self.parser)
+ engine, parser = self.engine, self.parser
+ cols = list('abc')
+ df = DataFrame(randn(100, len(cols)), columns=cols)
+ res = df.query('a < b < c and a not in b not in c', engine=engine,
+ parser=parser)
+ ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b)
+ expec = df[ind]
+ assert_frame_equal(res, expec)
+
+
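+ # The pandas parser used above accepts extensions that plain Python (and
+ # the 'python' parser in the subclass below) does not: chained comparisons,
+ # and/or/not-in spellings, and @-prefixed locals. A minimal sketch of the
+ # equivalence being tested, with names chosen here for illustration:
+ #
+ #     df = DataFrame(np.random.randn(100, 3), columns=list('abc'))
+ #     cutoff = 0.5
+ #
+ #     res = df.query('a < b < c and a < @cutoff', parser='pandas')
+ #     expected = df[(df.a < df.b) & (df.b < df.c) & (df.a < cutoff)]
+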
+class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = 'numexpr'
+ cls.parser = 'python'
+ skip_if_no_ne(cls.engine)
+ cls.frame = _frame.copy()
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.frame, cls.engine, cls.parser
+
+ def test_date_query_method(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame(randn(5, 3))
+ df['dates1'] = date_range('1/1/2012', periods=5)
+ df['dates2'] = date_range('1/1/2013', periods=5)
+ df['dates3'] = date_range('1/1/2014', periods=5)
+ res = df.query('(df.dates1 < 20130101) & (20130101 < df.dates3)',
+ engine=engine, parser=parser)
+ expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_nested_scope(self):
+ engine = self.engine
+ parser = self.parser
+ # smoke test
+ x = 1
+ result = pd.eval('x + 1', engine=engine, parser=parser)
+ self.assertEqual(result, 2)
+
+ df = DataFrame(np.random.randn(5, 3))
+ df2 = DataFrame(np.random.randn(5, 3))
+ expected = df[(df>0) & (df2>0)]
+
+ result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser)
+ assert_frame_equal(result, expected)
+
+ result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine,
+ parser=parser)
+ assert_frame_equal(result, expected)
+
+ result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]',
+ engine=engine, parser=parser)
+ expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
+ assert_frame_equal(result, expected)
+
+ result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser)
+ expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser)
+ assert_frame_equal(result, expected)
+
+
+class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = 'python'
+ cls.parser = 'pandas'
+ cls.frame = _frame.copy()
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.frame, cls.engine, cls.parser
+
+
+class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = cls.parser = 'python'
+ cls.frame = _frame.copy()
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.frame, cls.engine, cls.parser
+
+
+PARSERS = 'python', 'pandas'
+ENGINES = 'python', 'numexpr'
+
+
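+ # TestDataFrameQueryStrings below, like the MultiIndex tests above, uses
+ # nose's generator-test pattern: one check_* method parameterized over
+ # (parser, engine), yielded once per combination from a plain test_*
+ # method. A schematic sketch of the pattern, with hypothetical names:
+ #
+ #     class TestSomething(object):
+ #         def check_something(self, parser, engine):
+ #             skip_if_no_ne(engine)  # skip numexpr combos when it is missing
+ #             pass  # assertions parameterized by (parser, engine)
+ #
+ #         def test_something(self):
+ #             # nose runs each yielded (callable, args...) tuple as a test
+ #             for parser, engine in product(PARSERS, ENGINES):
+ #                 yield self.check_something, parser, engine
+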
+class TestDataFrameQueryStrings(object):
+ def check_str_query_method(self, parser, engine):
+ skip_if_no_ne(engine)
+ df = DataFrame(randn(10, 1), columns=['b'])
+ df['strings'] = Series(list('aabbccddee'))
+ expect = df[df.strings == 'a']
+
+ if parser != 'pandas':
+ col = 'strings'
+ lst = '"a"'
+
+ lhs = [col] * 2 + [lst] * 2
+ rhs = lhs[::-1]
+
+ eq, ne = '==', '!='
+ ops = 2 * ([eq] + [ne])
+
+ for lhs, op, rhs in zip(lhs, ops, rhs):
+ ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs)
+ assertRaises(NotImplementedError, df.query, ex, engine=engine,
+ parser=parser, local_dict={'strings': df.strings})
+ else:
+ res = df.query('"a" == strings', engine=engine, parser=parser)
+ assert_frame_equal(res, expect)
+
+ res = df.query('strings == "a"', engine=engine, parser=parser)
+ assert_frame_equal(res, expect)
+ assert_frame_equal(res, df[df.strings.isin(['a'])])
+
+ expect = df[df.strings != 'a']
+ res = df.query('strings != "a"', engine=engine, parser=parser)
+ assert_frame_equal(res, expect)
+
+ res = df.query('"a" != strings', engine=engine, parser=parser)
+ assert_frame_equal(res, expect)
+ assert_frame_equal(res, df[~df.strings.isin(['a'])])
+
+ def test_str_query_method(self):
+ for parser, engine in product(PARSERS, ENGINES):
+ yield self.check_str_query_method, parser, engine
+
+ def test_str_list_query_method(self):
+ for parser, engine in product(PARSERS, ENGINES):
+ yield self.check_str_list_query_method, parser, engine
+
+ def check_str_list_query_method(self, parser, engine):
+ skip_if_no_ne(engine)
+ df = DataFrame(randn(10, 1), columns=['b'])
+ df['strings'] = Series(list('aabbccddee'))
+ expect = df[df.strings.isin(['a', 'b'])]
+
+ if parser != 'pandas':
+ col = 'strings'
+ lst = '["a", "b"]'
+
+ lhs = [col] * 2 + [lst] * 2
+ rhs = lhs[::-1]
+
+ eq, ne = '==', '!='
+ ops = 2 * ([eq] + [ne])
+
+ for lhs, op, rhs in zip(lhs, ops, rhs):
+ ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs)
+ assertRaises(NotImplementedError, df.query, ex, engine=engine,
+ parser=parser, local_dict={'strings': df.strings})
+ else:
+ res = df.query('strings == ["a", "b"]', engine=engine,
+ parser=parser)
+ assert_frame_equal(res, expect)
+
+ res = df.query('["a", "b"] == strings', engine=engine,
+ parser=parser)
+ assert_frame_equal(res, expect)
+
+ expect = df[~df.strings.isin(['a', 'b'])]
+
+ res = df.query('strings != ["a", "b"]', engine=engine,
+ parser=parser)
+ assert_frame_equal(res, expect)
+
+ res = df.query('["a", "b"] != strings', engine=engine,
+ parser=parser)
+ assert_frame_equal(res, expect)
+
+ def check_query_with_string_columns(self, parser, engine):
+ skip_if_no_ne(engine)
+ df = DataFrame({'a': list('aaaabbbbcccc'),
+ 'b': list('aabbccddeeff'),
+ 'c': np.random.randint(5, size=12),
+ 'd': np.random.randint(9, size=12)})
+ if parser == 'pandas':
+ res = df.query('a in b', parser=parser, engine=engine)
+ expec = df[df.a.isin(df.b)]
+ assert_frame_equal(res, expec)
+
+ res = df.query('a in b and c < d', parser=parser, engine=engine)
+ expec = df[df.a.isin(df.b) & (df.c < df.d)]
+ assert_frame_equal(res, expec)
+ else:
+ with assertRaises(NotImplementedError):
+ df.query('a in b', parser=parser, engine=engine)
+
+ with assertRaises(NotImplementedError):
+ df.query('a in b and c < d', parser=parser, engine=engine)
+
+ def test_query_with_string_columns(self):
+ for parser, engine in product(PARSERS, ENGINES):
+ yield self.check_query_with_string_columns, parser, engine
+
+ def check_object_array_eq_ne(self, parser, engine):
+ skip_if_no_ne(engine)
+ df = DataFrame({'a': list('aaaabbbbcccc'),
+ 'b': list('aabbccddeeff'),
+ 'c': np.random.randint(5, size=12),
+ 'd': np.random.randint(9, size=12)})
+ res = df.query('a == b', parser=parser, engine=engine)
+ exp = df[df.a == df.b]
+ assert_frame_equal(res, exp)
+
+ res = df.query('a != b', parser=parser, engine=engine)
+ exp = df[df.a != df.b]
+ assert_frame_equal(res, exp)
+
+ def test_object_array_eq_ne(self):
+ for parser, engine in product(PARSERS, ENGINES):
+ yield self.check_object_array_eq_ne, parser, engine
+
+
+class TestDataFrameEvalNumExprPandas(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = 'numexpr'
+ cls.parser = 'pandas'
+ skip_if_no_ne()
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.engine, cls.parser
+
+ def setUp(self):
+ self.frame = DataFrame(randn(10, 3), columns=list('abc'))
+
+ def tearDown(self):
+ del self.frame
+
+ def test_simple_expr(self):
+ res = self.frame.eval('a + b', engine=self.engine, parser=self.parser)
+ expect = self.frame.a + self.frame.b
+ assert_series_equal(res, expect)
+
+ def test_bool_arith_expr(self):
+ res = self.frame.eval('a[a < 1] + b', engine=self.engine,
+ parser=self.parser)
+ expect = self.frame.a[self.frame.a < 1] + self.frame.b
+ assert_series_equal(res, expect)
+
+
+class TestDataFrameEvalNumExprPython(TestDataFrameEvalNumExprPandas):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = 'numexpr'
+ cls.parser = 'python'
+ skip_if_no_ne()
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.engine, cls.parser
+
+
+class TestDataFrameEvalPythonPandas(TestDataFrameEvalNumExprPandas):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = 'python'
+ cls.parser = 'pandas'
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.engine, cls.parser
+
+
+class TestDataFrameEvalPythonPython(TestDataFrameEvalNumExprPython):
+ @classmethod
+ def setUpClass(cls):
+ cls.engine = cls.parser = 'python'
+
+ @classmethod
+ def tearDownClass(cls):
+ del cls.engine, cls.parser
+
+
if __name__ == '__main__':
- # unittest.main()
- import nose
- # nose.runmodule(argv=[__file__,'-vvs','-x', '--ipdb-failure'],
- # exit=False)
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index 1572ca481d8a4..8646d261306ca 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -926,7 +926,8 @@ def join(self, other, how='left', level=None, return_indexers=False):
See Index.join
"""
if (not isinstance(other, DatetimeIndex) and len(other) > 0 and
- other.inferred_type != 'mixed-integer'):
+ other.inferred_type not in ('floating', 'mixed-integer',
+ 'mixed-integer-float', 'mixed')):
try:
other = DatetimeIndex(other)
except TypeError:
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index abc13fb2ad9ee..0718dc8926011 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -15,7 +15,7 @@
from contextlib import contextmanager
from distutils.version import LooseVersion
-from numpy.random import randn
+from numpy.random import randn, rand
import numpy as np
from pandas.core.common import isnull, _is_sequence
@@ -27,14 +27,14 @@
import pandas.compat as compat
from pandas.compat import(
map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter,
- raise_with_traceback
+ raise_with_traceback, httplib
)
from pandas import bdate_range
from pandas.tseries.index import DatetimeIndex
from pandas.tseries.period import PeriodIndex
-from pandas.io.common import urlopen, HTTPException
+from pandas.io.common import urlopen
Index = index.Index
MultiIndex = index.MultiIndex
@@ -48,6 +48,10 @@
_RAISE_NETWORK_ERROR_DEFAULT = False
+def randbool(size=(), p=0.5):
+ return rand(*size) <= p
+
+
def rands(n):
choices = string.ascii_letters + string.digits
return ''.join(random.choice(choices) for _ in range(n))
@@ -58,10 +62,17 @@ def randu(n):
choices += string.digits
return ''.join([random.choice(choices) for _ in range(n)])
+
+def choice(x, size=10):
+ """sample with replacement; uniform over the input"""
+ try:
+ return np.random.choice(x, size=size)
+ except AttributeError:
+ return np.random.randint(len(x), size=size).choose(x)
+
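+# The AttributeError branch above covers numpy older than 1.7, which lacks
+# np.random.choice; ndarray.choose maps uniform random indices back onto the
+# input sequence. A rough equivalent of that fallback:
+#
+#     x = ['red', 'green']
+#     idx = np.random.randint(len(x), size=10)  # uniform indices into x
+#     sample = np.asarray(x)[idx]               # same effect as idx.choose(x)
+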
#------------------------------------------------------------------------------
# Console debugging tools
-
def debug(f, *args, **kwargs):
from pdb import Pdb as OldPdb
try:
@@ -752,7 +763,7 @@ def dec(f):
return wrapper
-_network_error_classes = IOError, HTTPException
+_network_error_classes = IOError, httplib.HTTPException
@optional_args
@@ -796,13 +807,13 @@ def network(t, raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT,
>>> import nose
>>> @network
... def test_network():
- ... with urlopen("rabbit://bonanza.com") as f:
- ... pass
+ ... with urlopen("rabbit://bonanza.com") as f:
+ ... pass
...
>>> try:
- ... test_network()
+ ... test_network()
... except nose.SkipTest:
- ... print "SKIPPING!"
+ ... print("SKIPPING!")
...
SKIPPING!
@@ -811,8 +822,8 @@ def network(t, raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT,
>>> @network(raise_on_error=True)
... def test_network():
- ... with urlopen("complaint://deadparrot.com") as f:
- ... pass
+ ... with urlopen("complaint://deadparrot.com") as f:
+ ... pass
...
>>> test_network()
Traceback (most recent call last):
diff --git a/setup.py b/setup.py
index b7df339daf75a..ffd6089bdc88d 100755
--- a/setup.py
+++ b/setup.py
@@ -83,7 +83,7 @@
except ImportError:
cython = False
-from os.path import splitext, basename, join as pjoin
+from os.path import join as pjoin
class build_ext(_build_ext):
@@ -506,6 +506,8 @@ def pxd(name):
maintainer=AUTHOR,
packages=['pandas',
'pandas.compat',
+ 'pandas.computation',
+ 'pandas.computation.tests',
'pandas.core',
'pandas.io',
'pandas.rpy',
diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py
index 54774344520c9..3f076f9f922a3 100644
--- a/vb_suite/binary_ops.py
+++ b/vb_suite/binary_ops.py
@@ -21,7 +21,7 @@
start_date=datetime(2012, 1, 1))
setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
df = DataFrame(np.random.randn(20000, 100))
df2 = DataFrame(np.random.randn(20000, 100))
expr.set_numexpr_threads(1)
@@ -32,7 +32,7 @@
start_date=datetime(2013, 2, 26))
setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
df = DataFrame(np.random.randn(20000, 100))
df2 = DataFrame(np.random.randn(20000, 100))
expr.set_use_numexpr(False)
@@ -53,7 +53,7 @@
start_date=datetime(2012, 1, 1))
setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
df = DataFrame(np.random.randn(20000, 100))
df2 = DataFrame(np.random.randn(20000, 100))
expr.set_numexpr_threads(1)
@@ -63,7 +63,7 @@
start_date=datetime(2013, 2, 26))
setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
df = DataFrame(np.random.randn(20000, 100))
df2 = DataFrame(np.random.randn(20000, 100))
expr.set_use_numexpr(False)
@@ -84,7 +84,7 @@
start_date=datetime(2012, 1, 1))
setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
df = DataFrame(np.random.randn(20000, 100))
df2 = DataFrame(np.random.randn(20000, 100))
expr.set_numexpr_threads(1)
@@ -94,7 +94,7 @@
start_date=datetime(2013, 2, 26))
setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
df = DataFrame(np.random.randn(20000, 100))
df2 = DataFrame(np.random.randn(20000, 100))
expr.set_use_numexpr(False)
diff --git a/vb_suite/eval.py b/vb_suite/eval.py
new file mode 100644
index 0000000000000..c666cd431cbb4
--- /dev/null
+++ b/vb_suite/eval.py
@@ -0,0 +1,114 @@
+from vbench.benchmark import Benchmark
+from datetime import datetime
+
+common_setup = """from pandas_vb_common import *
+import pandas as pd
+df = DataFrame(np.random.randn(20000, 100))
+df2 = DataFrame(np.random.randn(20000, 100))
+df3 = DataFrame(np.random.randn(20000, 100))
+df4 = DataFrame(np.random.randn(20000, 100))
+"""
+
+setup = common_setup + """
+import pandas.computation.expressions as expr
+expr.set_numexpr_threads(1)
+"""
+
+SECTION = 'Eval'
+
+#----------------------------------------------------------------------
+# binary ops
+
+#----------------------------------------------------------------------
+# add
+eval_frame_add_all_threads = \
+ Benchmark("pd.eval('df + df2 + df3 + df4')", common_setup,
+ name='eval_frame_add_all_threads',
+ start_date=datetime(2013, 7, 21))
+
+eval_frame_add_one_thread = \
+ Benchmark("pd.eval('df + df2 + df3 + df4')", setup,
+ name='eval_frame_add_one_thread',
+ start_date=datetime(2013, 7, 26))
+
+eval_frame_add_python = \
+ Benchmark("pd.eval('df + df2 + df3 + df4', engine='python')", common_setup,
+ name='eval_frame_add_python', start_date=datetime(2013, 7, 21))
+
+eval_frame_add_python_one_thread = \
+ Benchmark("pd.eval('df + df2 + df3 + df4', engine='python')", setup,
+ name='eval_frame_add_python_one_thread',
+ start_date=datetime(2013, 7, 26))
+
+#----------------------------------------------------------------------
+# mult
+
+eval_frame_mult_all_threads = \
+ Benchmark("pd.eval('df * df2 * df3 * df4')", common_setup,
+ name='eval_frame_mult_all_threads',
+ start_date=datetime(2013, 7, 21))
+
+eval_frame_mult_one_thread = \
+ Benchmark("pd.eval('df * df2 * df3 * df4')", setup,
+ name='eval_frame_mult_one_thread',
+ start_date=datetime(2013, 7, 26))
+
+eval_frame_mult_python = \
+ Benchmark("pdl.eval('df * df2 * df3 * df4', engine='python')",
+ common_setup,
+ name='eval_frame_mult_python', start_date=datetime(2013, 7, 21))
+
+eval_frame_mult_python_one_thread = \
+ Benchmark("pd.eval('df * df2 * df3 * df4', engine='python')", setup,
+ name='eval_frame_mult_python_one_thread',
+ start_date=datetime(2013, 7, 26))
+
+#----------------------------------------------------------------------
+# multi and
+
+eval_frame_and_all_threads = \
+ Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')",
+ common_setup,
+ name='eval_frame_and_all_threads',
+ start_date=datetime(2013, 7, 21))
+
+eval_frame_and_one_thread = \
+ Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')", setup,
+ name='eval_frame_and_one_thread',
+ start_date=datetime(2013, 7, 26))
+
+eval_frame_and_python = \
+ Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')",
+ common_setup, name='eval_frame_and_python',
+ start_date=datetime(2013, 7, 21))
+
+eval_frame_and_python_one_thread = \
+ Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')",
+ setup,
+ name='eval_frame_and_python_one_thread',
+ start_date=datetime(2013, 7, 26))
+
+#----------------------------------------------------------------------
+# chained comp
+eval_frame_chained_cmp_all_threads = \
+ Benchmark("pd.eval('df < df2 < df3 < df4')", common_setup,
+ name='eval_frame_chained_cmp_all_threads',
+ start_date=datetime(2013, 7, 21))
+
+eval_frame_chained_cmp_one_thread = \
+ Benchmark("pd.eval('df < df2 < df3 < df4')", setup,
+ name='eval_frame_chained_cmp_one_thread',
+ start_date=datetime(2013, 7, 26))
+
+eval_frame_chained_cmp_python = \
+ Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')",
+ common_setup, name='eval_frame_chained_cmp_python',
+ start_date=datetime(2013, 7, 26))
+
+eval_frame_chained_cmp_python_one_thread = \
+ Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')", setup,
+ name='eval_frame_chained_cmp_python_one_thread',
+ start_date=datetime(2013, 7, 26))
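
The chained-comparison benchmarks in eval.py rely on pd.eval rewriting 'df < df2 < df3 < df4' into the pairwise conjunction that ordinary Python operators cannot express for DataFrames. A small sanity check of that equivalence, separate from the suite (the frame sizes here are arbitrary):

    import numpy as np
    import pandas as pd

    df, df2, df3, df4 = (pd.DataFrame(np.random.randn(10, 3))
                         for _ in range(4))

    # pd.eval resolves df..df4 from the calling namespace.
    chained = pd.eval('df < df2 < df3 < df4')
    explicit = (df < df2) & (df2 < df3) & (df3 < df4)
    assert (chained == explicit).all().all()  # should hold for either engine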
diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py
index 1264ae053ffca..beefec256ed81 100644
--- a/vb_suite/indexing.py
+++ b/vb_suite/indexing.py
@@ -106,7 +106,7 @@
start_date=datetime(2012, 1, 1))
setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
df = DataFrame(np.random.randn(50000, 100))
df2 = DataFrame(np.random.randn(50000, 100))
expr.set_numexpr_threads(1)
@@ -118,7 +118,7 @@
setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
df = DataFrame(np.random.randn(50000, 100))
df2 = DataFrame(np.random.randn(50000, 100))
expr.set_use_numexpr(False)
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
index ca83855c2a109..f3c8dfe3032e0 100644
--- a/vb_suite/suite.py
+++ b/vb_suite/suite.py
@@ -23,7 +23,8 @@
'sparse',
'reshape',
'stat_ops',
- 'timeseries']
+ 'timeseries',
+ 'eval']
by_module = {}
benchmarks = []
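
Appending 'eval' to the list above is the only registration step: the collection code that follows this hunk (not shown here) imports each named module and gathers every Benchmark instance it defines into by_module and benchmarks. Roughly, and as an assumption about the elided lines rather than a quotation of them:

    # Approximate shape of the discovery loop in vb_suite/suite.py, assuming
    # the list above is bound to a name like `modules`.
    for modname in modules:
        ref = __import__(modname)
        by_module[modname] = [v for v in ref.__dict__.values()
                              if isinstance(v, Benchmark)]
        benchmarks.extend(by_module[modname])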