diff --git a/bench/bench_with_subset.R b/bench/bench_with_subset.R new file mode 100644 index 0000000000000..69d0f7a9eec63 --- /dev/null +++ b/bench/bench_with_subset.R @@ -0,0 +1,53 @@ +library(microbenchmark) +library(data.table) + + +data.frame.subset.bench <- function (n=1e7, times=30) { + df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), + times=times)) +} + + +# data.table allows something very similar to query with an expression +# but we have chained comparisons AND we're faster BOO YAH! +data.table.subset.expression.bench <- function (n=1e7, times=30) { + dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(dt[, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c], + times=times)) +} + + +# compare against subset with data.table for good measure +data.table.subset.bench <- function (n=1e7, times=30) { + dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), + times=times)) +} + + +data.frame.with.bench <- function (n=1e7, times=30) { + df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + + print(microbenchmark(with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), + times=times)) +} + + +data.table.with.bench <- function (n=1e7, times=30) { + dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), + times=times)) +} + + +bench <- function () { + data.frame.subset.bench() + data.table.subset.expression.bench() + data.table.subset.bench() + data.frame.with.bench() + data.table.with.bench() +} + + +bench() diff --git a/bench/bench_with_subset.py b/bench/bench_with_subset.py new file mode 100644 index 0000000000000..99b98c9838a90 --- /dev/null +++ b/bench/bench_with_subset.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +""" +Microbenchmarks for comparison with R's "with" and "subset" functions +""" + +from __future__ import print_function +import numpy as np +from numpy import array +from timeit import repeat as timeit +from pandas.compat import range, zip +from pandas import DataFrame + + +setup_common = """from pandas import DataFrame +from numpy.random import randn +df = DataFrame(randn(%d, 3), columns=list('abc')) +%s""" + + +setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" + + +def bench_with(n, times=10, repeat=3, engine='numexpr'): + return np.array(timeit('df.eval(s, engine=%r)' % engine, + setup=setup_common % (n, setup_with), + repeat=repeat, number=times)) / times + + +setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'" + + +def bench_subset(n, times=10, repeat=3, engine='numexpr'): + return np.array(timeit('df.query(s, engine=%r)' % engine, + setup=setup_common % (n, setup_subset), + repeat=repeat, number=times)) / times + + +def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False): + r = np.logspace(mn, mx, num=num).round().astype(int) + + ev = DataFrame(np.empty((num, len(engines))), columns=engines) + qu = ev.copy(deep=True) + + ev['size'] = qu['size'] = r + + for engine in engines: + for i, n in enumerate(r): + if verbose: + print('engine: %r, i == %d' % (engine, i)) + ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine) + qu.loc[i, engine] = bench_subset(n, times=1, repeat=1, + engine=engine) + + return ev, qu + + +def plot_perf(df, engines, title, filename=None): + from matplotlib.pyplot import figure, rc + + try: + from mpltools import style + except ImportError: + 
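+        # mpltools provides the optional ggplot style used below; fall
+        # back silently to the matplotlib defaults when it's missing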
pass + else: + style.use('ggplot') + + rc('text', usetex=True) + + fig = figure(figsize=(4, 3), dpi=100) + ax = fig.add_subplot(111) + + for engine in engines: + ax.plot(df.size, df[engine], label=engine, lw=2) + + ax.set_xlabel('Number of Rows') + ax.set_ylabel('Time (s)') + ax.set_title(title) + ax.legend(loc='best') + ax.tick_params(top=False, right=False) + + fig.tight_layout() + + if filename is not None: + fig.savefig(filename) + + +if __name__ == '__main__': + import os + import pandas as pd + + pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) + static_path = os.path.join(pandas_dir, 'doc', 'source', '_static') + + join = lambda p: os.path.join(static_path, p) + + fn = join('eval-query-perf-data.h5') + + engines = 'python', 'numexpr' + + if not os.path.exists(fn): + ev, qu = bench(verbose=True) + ev.to_hdf(fn, 'eval') + qu.to_hdf(fn, 'query') + else: + ev = pd.read_hdf(fn, 'eval') + qu = pd.read_hdf(fn, 'query') + + plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png')) + plot_perf(qu, engines, 'DataFrame.query()', + filename=join('query-perf.png')) + + plot_perf(ev[ev.size <= 50000], engines, 'DataFrame.eval()', + filename=join('eval-perf-small.png')) + plot_perf(qu[qu.size <= 100000], engines, 'DataFrame.query()', + filename=join('query-perf-small.png')) diff --git a/doc/source/_static/eval-perf-small.png b/doc/source/_static/eval-perf-small.png new file mode 100644 index 0000000000000..d86018363ffdc Binary files /dev/null and b/doc/source/_static/eval-perf-small.png differ diff --git a/doc/source/_static/eval-perf.png b/doc/source/_static/eval-perf.png new file mode 100644 index 0000000000000..14c69c1b85d9e Binary files /dev/null and b/doc/source/_static/eval-perf.png differ diff --git a/doc/source/_static/query-perf-small.png b/doc/source/_static/query-perf-small.png new file mode 100644 index 0000000000000..56fcc787a66af Binary files /dev/null and b/doc/source/_static/query-perf-small.png differ diff --git a/doc/source/_static/query-perf.png b/doc/source/_static/query-perf.png new file mode 100644 index 0000000000000..d96318df94357 Binary files /dev/null and b/doc/source/_static/query-perf.png differ diff --git a/doc/source/api.rst b/doc/source/api.rst index 538965d0be7ad..28c1515e93bc5 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -155,6 +155,17 @@ Top-level dealing with datetimes to_datetime +Top-level evaluation +~~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: pandas + +.. autosummary:: + :toctree: generated/ + + eval + + Standard moving window functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -452,6 +463,7 @@ Indexing, iteration DataFrame.tail DataFrame.xs DataFrame.isin + DataFrame.query Binary operator functions ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -502,6 +514,7 @@ Computations / Descriptive Stats DataFrame.cumsum DataFrame.describe DataFrame.diff + DataFrame.eval DataFrame.kurt DataFrame.mad DataFrame.max diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index 5759768051c0e..ef609aaa7d70c 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -1,28 +1,87 @@ .. currentmodule:: pandas .. _compare_with_r: -******************************* Comparison with R / R libraries ******************************* -Since pandas aims to provide a lot of the data manipulation and analysis -functionality that people use R for, this page was started to provide a more -detailed look at the R language and it's many 3rd party libraries as they -relate to pandas. 
In offering comparisons with R and CRAN libraries, we care -about the following things: +Since ``pandas`` aims to provide a lot of the data manipulation and analysis +functionality that people use `R `__ for, this page +was started to provide a more detailed look at the `R language +`__ and its many third +party libraries as they relate to ``pandas``. In comparisons with R and CRAN +libraries, we care about the following things: - - **Functionality / flexibility**: what can / cannot be done with each tool - - **Performance**: how fast are operations. Hard numbers / benchmarks are + - **Functionality / flexibility**: what can/cannot be done with each tool + - **Performance**: how fast are operations. Hard numbers/benchmarks are preferable - - **Ease-of-use**: is one tool easier or harder to use (you may have to be - the judge of this given side-by-side code comparisons) + - **Ease-of-use**: Is one tool easier/harder to use (you may have to be + the judge of this, given side-by-side code comparisons) + +This page is also here to offer a bit of a translation guide for users of these +R packages. + +Base R +------ + +|subset|_ +~~~~~~~~~~ + +.. versionadded:: 0.13 + +The :meth:`~pandas.DataFrame.query` method is similar to the base R ``subset`` +function. In R you might want to get the rows of a ``data.frame`` where one +column's values are less than another column's values: + + .. code-block:: r + + df <- data.frame(a=rnorm(10), b=rnorm(10)) + subset(df, a <= b) + df[df$a <= df$b,] # note the comma + +In ``pandas``, there are a few ways to perform subsetting. You can use +:meth:`~pandas.DataFrame.query` or pass an expression as if it were an +index/slice as well as standard boolean indexing: + + .. ipython:: python + + from pandas import DataFrame + from numpy.random import randn + + df = DataFrame({'a': randn(10), 'b': randn(10)}) + df.query('a <= b') + df[df.a <= df.b] + df.loc[df.a <= df.b] -As I do not have an encyclopedic knowledge of R packages, feel free to suggest -additional CRAN packages to add to this list. This is also here to offer a big -of a translation guide for users of these R packages. +For more details and examples see :ref:`the query documentation +`. -data.frame ----------- + +|with|_ +~~~~~~~~ + +.. versionadded:: 0.13 + +An expression using a data.frame called ``df`` in R with the columns ``a`` and +``b`` would be evaluated using ``with`` like so: + + .. code-block:: r + + df <- data.frame(a=rnorm(10), b=rnorm(10)) + with(df, a + b) + df$a + df$b # same as the previous expression + +In ``pandas`` the equivalent expression, using the +:meth:`~pandas.DataFrame.eval` method, would be: + + .. ipython:: python + + df = DataFrame({'a': randn(10), 'b': randn(10)}) + df.eval('a + b') + df.a + df.b # same as the previous expression + +In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than +evaluation in pure Python. For more details and examples see :ref:`the eval +documentation `. zoo --- @@ -36,3 +95,9 @@ plyr reshape / reshape2 ------------------ + +.. |with| replace:: ``with`` +.. _with: http://finzi.psych.upenn.edu/R/library/base/html/with.html + +.. |subset| replace:: ``subset`` +.. _subset: http://finzi.psych.upenn.edu/R/library/base/html/subset.html diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 95428bd27e2a2..87b68248c3e9e 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -225,8 +225,8 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra .. 
note:: - Loop like this would be *extremely* slow in python, but in cython looping over - numpy arrays is *fast*. + Loops like this would be *extremely* slow in python, but in Cython looping + over numpy arrays is *fast*. .. ipython:: python @@ -289,3 +289,262 @@ Further topics - Loading C modules into cython. Read more in the `cython docs `__. + +.. _enhancingperf.eval: + +Expression Evaluation via :func:`~pandas.eval` (Experimental) +------------------------------------------------------------- + +.. versionadded:: 0.13 + +The top-level function :func:`~pandas.eval` implements expression evaluation of +:class:`~pandas.Series` and :class:`~pandas.DataFrame` objects. + +.. note:: + + To benefit from using :func:`~pandas.eval` you need to + install ``numexpr``. See the :ref:`recommended dependencies section + ` for more details. + +The point of using :func:`~pandas.eval` for expression evaluation rather than +plain Python is two-fold: 1) large :class:`~pandas.DataFrame` objects are +evaluated more efficiently and 2) large arithmetic and boolean expressions are +evaluated all at once by the underlying engine (by default ``numexpr`` is used +for evaluation). + +.. note:: + + You should not use :func:`~pandas.eval` for simple + expressions or for expressions involving small DataFrames. In fact, + :func:`~pandas.eval` is many orders of magnitude slower for + smaller expressions/objects than plain ol' Python. A good rule of thumb is + to only use :func:`~pandas.eval` when you have a + :class:`~pandas.core.frame.DataFrame` with more than 10,000 rows. + + +:func:`~pandas.eval` supports all arithmetic expressions supported by the +engine in addition to some extensions available only in pandas. + +.. note:: + + The larger the frame and the larger the expression the more speedup you will + see from using :func:`~pandas.eval`. + + +:func:`~pandas.eval` Examples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~pandas.eval` works wonders for expressions containing large arrays + +First let's create 4 decent-sized arrays to play with: + +.. ipython:: python + + import pandas as pd + from pandas import DataFrame, Series + from numpy.random import randn + import numpy as np + nrows, ncols = 20000, 100 + df1, df2, df3, df4 = [DataFrame(randn(nrows, ncols)) for _ in xrange(4)] + + +Now let's compare adding them together using plain ol' Python versus +:func:`~pandas.eval`: + + +.. ipython:: python + + %timeit df1 + df2 + df3 + df4 + +.. ipython:: python + + %timeit pd.eval('df1 + df2 + df3 + df4') + + +Now let's do the same thing but with comparisons: + +.. ipython:: python + + %timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0) + +.. ipython:: python + + %timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)') + + +:func:`~pandas.eval` also works with unaligned pandas objects: + + +.. ipython:: python + + s = Series(randn(50)) + %timeit df1 + df2 + df3 + df4 + s + +.. ipython:: python + + %timeit pd.eval('df1 + df2 + df3 + df4 + s') + +.. note:: + + Operations such as ``1 and 2`` should be performed in Python. An exception + will be raised if you try to performed any boolean or bitwise operations + with scalar operands that are not of type ``bool`` or ``np.bool_``. *This + includes bitwise operations on scalars.* You should perform these kinds of + operations in Python. + +The ``DataFrame.eval`` method (Experimental) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition to the top level :func:`~pandas.eval` function you can also +evaluate an expression in the "context" of a ``DataFrame``. + + +.. 
ipython:: python
+
+   df = DataFrame(randn(5, 2), columns=['a', 'b'])
+   df.eval('a + b')
+
+
+Any expression that is a valid :func:`~pandas.eval` expression is also a valid
+``DataFrame.eval`` expression, with the added benefit that *you don't have to
+prefix the name of the* ``DataFrame`` *to the column you're interested in
+evaluating*.
+
+
+Local Variables
+~~~~~~~~~~~~~~~
+
+You can refer to local variables the same way you would in vanilla Python
+
+.. ipython:: python
+
+   df = DataFrame(randn(5, 2), columns=['a', 'b'])
+   newcol = randn(len(df))
+   df.eval('b + newcol')
+
+.. note::
+
+   The one exception is when you have a local (or global) variable with the
+   same name as a column in the ``DataFrame``
+
+   .. code-block:: python
+
+      df = DataFrame(randn(5, 2), columns=['a', 'b'])
+      a = randn(len(df))
+      df.eval('a + b')
+      NameResolutionError: resolvers and locals overlap on names ['a']
+
+
+   To deal with these conflicts, a special syntax exists for referring to
+   variables with the same name as a column
+
+   .. ipython:: python
+      :suppress:
+
+      a = randn(len(df))
+
+   .. ipython:: python
+
+      df.eval('@a + b')
+
+   The same is true for :meth:`~pandas.DataFrame.query`
+
+   .. ipython:: python
+
+      df.query('@a < b')
+
+   .. ipython:: python
+      :suppress:
+
+      del a
+
+
+:func:`~pandas.eval` Parsers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There are two different parsers and two different engines you can use as
+the backend.
+
+The default ``'pandas'`` parser allows a more intuitive syntax for expressing
+query-like operations (comparisons, conjunctions and disjunctions). In
+particular, the precedence of the ``&`` and ``|`` operators is made equal to
+the precedence of the corresponding boolean operations ``and`` and ``or``.
+
+For example, the above conjunction can be written without parentheses.
+Alternatively, you can use the ``'python'`` parser to enforce strict Python
+semantics.
+
+.. ipython:: python
+
+   expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)'
+   x = pd.eval(expr, parser='python')
+   expr_no_parens = 'df1 > 0 & df2 > 0 & df3 > 0 & df4 > 0'
+   y = pd.eval(expr_no_parens, parser='pandas')
+   np.all(x == y)
+
+
+The same expression can be "anded" together with the word :keyword:`and` as
+well:
+
+.. ipython:: python
+
+   expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)'
+   x = pd.eval(expr, parser='python')
+   expr_with_ands = 'df1 > 0 and df2 > 0 and df3 > 0 and df4 > 0'
+   y = pd.eval(expr_with_ands, parser='pandas')
+   np.all(x == y)
+
+
+The ``and`` and ``or`` operators here have the same precedence that they would
+in vanilla Python.
+
+
+:func:`~pandas.eval` Backends
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There's also the option to make :func:`~pandas.eval` operate identically to
+plain ol' Python.
+
+.. note::
+
+   Using the ``'python'`` engine is generally *not* useful, except for testing
+   other :func:`~pandas.eval` engines against it. You will achieve **no**
+   performance benefits using :func:`~pandas.eval` with ``engine='python'``.
+
+You can see this below: using :func:`~pandas.eval` with the ``'python'``
+engine is actually a bit slower (though not by much) than evaluating the same
+expression in Python:
+
+.. ipython:: python
+
+   %timeit df1 + df2 + df3 + df4
+
+.. ipython:: python
+
+   %timeit pd.eval('df1 + df2 + df3 + df4', engine='python')
+
+
+:func:`~pandas.eval` Performance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:func:`~pandas.eval` is intended to speed up certain kinds of operations.
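+
+As a rough, self-contained illustration -- a sketch only, with arbitrary
+frame sizes and repeat counts -- you can measure the speedup on your own
+machine with the standard library's ``timeit``:
+
+.. code-block:: python
+
+   import timeit
+
+   setup = ("import pandas as pd; from numpy.random import randn; "
+            "df1, df2, df3, df4 = [pd.DataFrame(randn(%d, 3)) "
+            "for _ in range(4)]")
+
+   for n in (10 ** 3, 10 ** 5, 10 ** 6):
+       plain = timeit.timeit('df1 + df2 + df3 + df4', setup % n, number=10)
+       fast = timeit.timeit("pd.eval('df1 + df2 + df3 + df4')", setup % n,
+                            number=10)
+       print(n, plain / fast)  # ratios > 1 mean pd.eval is winning
+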
In +particular, those operations involving complex expressions with large +``DataFrame``/``Series`` objects should see a significant performance benefit. +Here is a plot showing the running time of :func:`~pandas.eval` as function of +the size of the frame involved in the computation. The two lines are two +different engines. + + +.. image:: _static/eval-perf.png + + +.. note:: + + Operations with smallish objects (around 15k-20k rows) are faster using + plain Python: + + .. image:: _static/eval-perf-small.png + + +This plot was created using a ``DataFrame`` with 3 columns each containing +floating point values generated using ``numpy.random.randn()``. diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index d2fd11ee43615..2f2a47d4b0bf2 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -26,48 +26,58 @@ The axis labeling information in pandas objects serves many purposes: - Enables automatic and explicit data alignment - Allows intuitive getting and setting of subsets of the data set -In this section / chapter, we will focus on the final point: namely, how to -slice, dice, and generally get and set subsets of pandas objects. The primary -focus will be on Series and DataFrame as they have received more development -attention in this area. Expect more work to be invested higher-dimensional data -structures (including Panel) in the future, especially in label-based advanced +In this section, we will focus on the final point: namely, how to slice, dice, +and generally get and set subsets of pandas objects. The primary focus will be +on Series and DataFrame as they have received more development attention in +this area. Expect more work to be invested higher-dimensional data structures +(including ``Panel``) in the future, especially in label-based advanced indexing. .. note:: - The Python and NumPy indexing operators ``[]`` and attribute operator ``.`` provide quick and easy access to pandas data structures - across a wide range of use cases. This makes interactive work intuitive, as - there's little new to learn if you already know how to deal with Python - dictionaries and NumPy arrays. However, since the type of the data to be accessed - isn't known in advance, directly using - standard operators has some optimization limits. For production code, we recommended - that you take advantage of the optimized pandas data access methods exposed in this chapter. + The Python and NumPy indexing operators ``[]`` and attribute operator ``.`` + provide quick and easy access to pandas data structures across a wide range + of use cases. This makes interactive work intuitive, as there's little new + to learn if you already know how to deal with Python dictionaries and NumPy + arrays. However, since the type of the data to be accessed isn't known in + advance, directly using standard operators has some optimization limits. For + production code, we recommended that you take advantage of the optimized + pandas data access methods exposed in this chapter. .. warning:: - Whether a copy or a reference is returned for a setting operation, may depend on the context. - This is sometimes called ``chained assignment`` and should be avoided. - See :ref:`Returning a View versus Copy ` + Whether a copy or a reference is returned for a setting operation, may + depend on the context. This is sometimes called ``chained assignment`` and + should be avoided. 
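+
+   A minimal sketch of the problem:
+
+   .. code-block:: python
+
+      df['a'][0] = 1       # chained assignment: may modify a temporary copy
+      df.loc[0, 'a'] = 1   # explicit indexer: always modifies df
+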
+   See :ref:`Returning a View versus Copy <indexing.view_versus_copy>`

 See the :ref:`cookbook` for some advanced strategies

-Choice
-------
+Different Choices for Indexing (``loc``, ``iloc``, and ``ix``)
+--------------------------------------------------------------
+
+.. versionadded:: 0.11.0

-Starting in 0.11.0, object selection has had a number of user-requested additions in
-order to support more explicit location based indexing. Pandas now supports
-three types of multi-axis indexing.
+Object selection has had a number of user-requested additions in order to
+support more explicit location based indexing. Pandas now supports three types
+of multi-axis indexing.

-- ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, allowed inputs are:
+- ``.loc`` is strictly label based, will raise ``KeyError`` when the items are
+  not found, allowed inputs are:

-  - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index)
+  - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a
+    *label* of the index. This use is **not** an integer position along the
+    index)
   - A list or array of labels ``['a', 'b', 'c']``
-  - A slice object with labels ``'a':'f'``, (note that contrary to usual python slices, **both** the start and the stop are included!)
+  - A slice object with labels ``'a':'f'``, (note that contrary to usual python
+    slices, **both** the start and the stop are included!)
   - A boolean array

  See more at :ref:`Selection by Label <indexing.label>`

-- ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of the axis), will raise ``IndexError`` when the requested indicies are out of bounds. Allowed inputs are:
+- ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of
+  the axis), will raise ``IndexError`` when the requested indices are out of
+  bounds. Allowed inputs are:

   - An integer e.g. ``5``
   - A list or array of integers ``[4, 3, 0]``

@@ -75,20 +85,24 @@ three types of multi-axis indexing.

  See more at :ref:`Selection by Position <indexing.integer>`

-- ``.ix`` supports mixed integer and label based access. It is primarily label based, but will fallback to integer positional access. ``.ix`` is the most general
-  and will support any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. ``.ix`` is especially useful when dealing with mixed positional and label
-  based hierarchial indexes.
-
-  As using integer slices with ``.ix`` have different behavior depending on whether the slice is interpreted as position based or label based, it's
+- ``.ix`` supports mixed integer and label based access. It is primarily label
+  based, but will fall back to integer positional access. ``.ix`` is the most
+  general and will support any of the inputs to ``.loc`` and ``.iloc``, as well
+  as support for floating point label schemes. ``.ix`` is especially useful
+  when dealing with mixed positional and label based hierarchical indexes.
+  Since integer slices with ``.ix`` behave differently depending on whether
+  the slice is interpreted as position based or label based, it's
   usually better to be explicit and use ``.iloc`` or ``.loc``.
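+
+  A minimal sketch of the ambiguity (``.ix`` switches behavior based on the
+  index type, while ``.iloc`` is always positional):
+
+  .. code-block:: python
+
+     s = Series(np.arange(3.), index=[1, 2, 3])
+     s.ix[1]    # 0.0 -- 1 is treated as a *label* on this integer index
+     s.iloc[1]  # 1.0 -- always the second element, regardless of index
+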
- See more at :ref:`Advanced Indexing `, :ref:`Advanced Hierarchical ` and :ref:`Fallback Indexing ` + See more at :ref:`Advanced Indexing `, :ref:`Advanced + Hierarchical ` and :ref:`Fallback Indexing + ` Getting values from an object with multi-axes selection uses the following notation (using ``.loc`` as an example, but applies to ``.iloc`` and ``.ix`` as well). Any of the axes accessors may be the null slice ``:``. Axes left out of the specification are assumed to be ``:``. (e.g. ``p.loc['a']`` is equiv to -``p.loc['a',:,:]``) +``p.loc['a', :, :]``) .. csv-table:: :header: "Object Type", "Indexers" @@ -100,7 +114,7 @@ the specification are assumed to be ``:``. (e.g. ``p.loc['a']`` is equiv to Panel; ``p.loc[item_indexer,major_indexer,minor_indexer]`` Deprecations -~~~~~~~~~~~~ +------------ Beginning with version 0.11.0, it's recommended that you transition away from the following methods as they *may* be deprecated in future versions. @@ -168,7 +182,7 @@ You may find this useful for applying a transform (in-place) to a subset of the columns. Attribute Access -~~~~~~~~~~~~~~~~ +---------------- .. _indexing.columns.multiple: @@ -213,7 +227,7 @@ If you are using the IPython environment, you may also use tab-completion to see these accessable attributes. Slicing ranges -~~~~~~~~~~~~~~ +-------------- The most robust and consistent way of slicing ranges along arbitrary axes is described in the :ref:`Selection by Position ` section @@ -247,7 +261,7 @@ largely as a convenience since it is such a common operation. .. _indexing.label: Selection By Label -~~~~~~~~~~~~~~~~~~ +------------------ .. warning:: @@ -318,7 +332,7 @@ For getting a value explicity (equiv to deprecated ``df.get_value('a','A')``) .. _indexing.integer: Selection By Position -~~~~~~~~~~~~~~~~~~~~~ +--------------------- .. warning:: @@ -415,7 +429,7 @@ Pandas will detect this and raise ``IndexError``, rather than return an empty st .. _indexing.basics.partial_setting: Setting With Enlargement -~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------ .. versionadded:: 0.13 @@ -450,7 +464,7 @@ This is like an ``append`` operation on the ``DataFrame``. .. _indexing.basics.get_value: Fast scalar value getting and setting -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------- Since indexing with ``[]`` must handle a lot of cases (single-label access, slicing, boolean indexing, etc.), it has a bit of overhead in order to figure @@ -481,7 +495,7 @@ You can also set using these same indexers. df Boolean indexing -~~~~~~~~~~~~~~~~ +---------------- .. _indexing.boolean: @@ -572,8 +586,8 @@ You can also describe columns using integer location: df.isin(values, iloc=True) -Where and Masking -~~~~~~~~~~~~~~~~~ +The :meth:`~pandas.DataFrame.where` Method and Masking +------------------------------------------------------ Selecting values from a Series with a boolean vector generally returns a subset of the data. To guarantee that selection output has the same shape as @@ -673,8 +687,304 @@ This is equivalent (but faster than) the following. s.mask(s >= 0) df.mask(df >= 0) +.. _indexing.query: + +The :meth:`~pandas.DataFrame.query` Method (Experimental) +--------------------------------------------------------- + +.. versionadded:: 0.13 + +:class:`~pandas.DataFrame` objects have a :meth:`~pandas.DataFrame.query` +method that allows selection using an expression. + +You can get the value of the frame where column ``b`` has values +between the values of columns ``a`` and ``c``. For example: + +.. 
ipython:: python + :suppress: + + from numpy.random import randint, rand + np.random.seed(1234) + +.. ipython:: python + + n = 10 + df = DataFrame(rand(n, 3), columns=list('abc')) + df + + # pure python + df[(df.a < df.b) & (df.b < df.c)] + + # query + df.query('(a < b) & (b < c)') + +Do the same thing but fallback on a named index if there is no column +with the name ``a``. + +.. ipython:: python + + df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc')) + df.index.name = 'a' + df + df.query('a < b and b < c') + +If instead you don't want to or cannot name your index, you can use the name +``index`` in your query expression: + +.. ipython:: python + :suppress: + + old_index = index + del index + +.. ipython:: python + + df = DataFrame(randint(n, size=(n, 2)), columns=list('bc')) + df + df.query('index < b < c') + +.. ipython:: python + :suppress: + + index = old_index + del old_index + + +:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can also use the levels of a ``DataFrame`` with a +:class:`~pandas.MultiIndex` as if they were columns in the frame: + +.. ipython:: python + + import pandas.util.testing as tm + + n = 10 + colors = tm.choice(['red', 'green'], size=n) + foods = tm.choice(['eggs', 'ham'], size=n) + colors + foods + + index = MultiIndex.from_arrays([colors, foods], names=['color', 'food']) + df = DataFrame(randn(n, 2), index=index) + df + df.query('color == "red"') + +If the levels of the ``MultiIndex`` are unnamed, you can refer to them using +special names: + + +.. ipython:: python + + df.index.names = [None, None] + df + df.query('ilevel_0 == "red"') + + +The convention is ``ilevel_0``, which means "index level 0" for the 0th level +of the ``index``. + + +:meth:`~pandas.DataFrame.query` Use Cases +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A use case for :meth:`~pandas.DataFrame.query` is when you have a collection of +:class:`~pandas.DataFrame` objects that have a subset of column names (or index +levels/names) in common. You can pass the same query to both frames *without* +having to specify which frame you're interested in querying + +.. ipython:: python + + df = DataFrame(rand(n, 3), columns=list('abc')) + df + df2 = DataFrame(rand(n + 2, 3), columns=df.columns) + df2 + expr = '0.0 <= a <= c <= 0.5' + map(lambda frame: frame.query(expr), [df, df2]) + +:meth:`~pandas.DataFrame.query` Python versus pandas Syntax Comparison +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Full numpy-like syntax + +.. ipython:: python + + df = DataFrame(randint(n, size=(n, 3)), columns=list('abc')) + df + df.query('(a < b) & (b < c)') + df[(df.a < df.b) & (df.b < df.c)] + +Slightly nicer by removing the parentheses (by binding making comparison +operators bind tighter than ``&``/``|``) + +.. ipython:: python + + df.query('a < b & b < c') + +Use English instead of symbols + +.. ipython:: python + + df.query('a < b and b < c') + +Pretty close to how you might write it on paper + +.. ipython:: python + + df.query('a < b < c') + +The ``in`` and ``not in`` operators +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~pandas.DataFrame.query` also supports special use of Python's ``in`` and +``not in`` comparison operators, providing a succint syntax for calling the +``isin`` method of a ``Series`` or ``DataFrame``. + +.. ipython:: python + :suppress: + + try: + old_d = d + del d + except NameError: + pass + +.. 
ipython:: python + + # get all rows where columns "a" and "b" have overlapping values + df = DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'), + 'c': randint(5, size=12), 'd': randint(9, size=12)}) + df + df.query('a in b') + + # How you'd do it in pure Python + df[df.a.isin(df.b)] + + df.query('a not in b') + + # pure Python + df[~df.a.isin(df.b)] + + +You can combine this with other expressions for very succinct queries: + + +.. ipython:: python + + # rows where cols a and b have overlapping values and col c's values are less than col d's + df.query('a in b and c < d') + + # pure Python + df[df.b.isin(df.a) & (df.c < df.d)] + + +.. note:: + + Note that ``in`` and ``not in`` are evaluated in Python, since ``numexpr`` + has no equivalent of this operation. However, **only the** ``in``/``not in`` + **expression itself** is evaluated in vanilla Python. For example, in the + expression + + .. code-block:: python + + df.query('a in b + c + d') + + ``(b + c + d)`` is evaluated by ``numexpr`` and *then* the ``in`` + operation is evaluated in plain Python. In general, any operations that can + be evaluated using ``numexpr`` will be. + +Special use of the ``==`` operator with ``list`` objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Comparing a ``list`` of values to a column using ``==``/``!=`` works similarly +to ``in``/``not in`` + +.. ipython:: python + + df.query('b == ["a", "b", "c"]') + + # pure Python + df[df.b.isin(["a", "b", "c"])] + + df.query('c == [1, 2]') + + df.query('c != [1, 2]') + + # using in/not in + df.query('[1, 2] in c') + + df.query('[1, 2] not in c') + + # pure Python + df[df.c.isin([1, 2])] + + +Boolean Operators +~~~~~~~~~~~~~~~~~ + +You can negate boolean expressions with the word ``not`` or the ``~`` operator. + +.. ipython:: python + + df = DataFrame(rand(n, 3), columns=list('abc')) + df['bools'] = rand(len(df)) > 0.5 + df.query('~bools') + df.query('not bools') + df.query('not bools') == df[~df.bools] + +Of course, expressions can be arbitrarily complex too + +.. ipython:: python + + # short query syntax + shorter = df.query('a < b < c and (not bools) or bools > 2') + + # equivalent in pure Python + longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] + + shorter + longer + + shorter == longer + +.. ipython:: python + :suppress: + + try: + d = old_d + del old_d + except NameError: + pass + + +Performance of :meth:`~pandas.DataFrame.query` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for +large frames + +.. image:: _static/query-perf.png + +.. note:: + + You will only see the performance benefits of using the ``numexpr`` engine + with ``DataFrame.query()`` if your frame has more than approximately 50,000 + rows + + .. image:: _static/query-perf-small.png + +This plot was created using a ``DataFrame`` with 3 columns each containing +floating point values generated using ``numpy.random.randn()``. + +.. ipython:: python + :suppress: + + df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + df2 = df.copy() + Take Methods -~~~~~~~~~~~~ +------------ .. _indexing.take: @@ -740,7 +1050,7 @@ faster than fancy indexing. timeit ser.take(indexer) Duplicate Data -~~~~~~~~~~~~~~ +-------------- .. _indexing.duplicate: @@ -766,8 +1076,8 @@ should be taken instead. .. 
_indexing.dictionarylike: -Dictionary-like ``get`` method -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Dictionary-like :meth:`~pandas.DataFrame.get` method +---------------------------------------------------- Each of Series, DataFrame, and Panel have a ``get`` method which can return a default value. @@ -865,8 +1175,8 @@ labels or even boolean vectors: Slicing with labels is closely related to the ``truncate`` method which does precisely ``.ix[start:stop]`` but returns a copy (for legacy reasons). -The ``select`` method -~~~~~~~~~~~~~~~~~~~~~ +The :meth:`~pandas.DataFrame.select` Method +------------------------------------------- Another way to extract slices from an object is with the ``select`` method of Series, DataFrame, and Panel. This method should be used only when there is no @@ -877,8 +1187,8 @@ more direct way. ``select`` takes a function which operates on labels along df.select(lambda x: x == 'A', axis=1) -The ``lookup`` method -~~~~~~~~~~~~~~~~~~~~~ +The :meth:`~pandas.DataFrame.lookup` Method +------------------------------------------- Sometimes you want to extract a set of values given a sequence of row labels and column labels, and the ``lookup`` method allows for this and returns a @@ -890,7 +1200,7 @@ numpy array. For instance, dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D']) Setting values in mixed-type DataFrame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-------------------------------------- .. _indexing.mixed_type_setting: @@ -909,7 +1219,7 @@ scalar values, though setting arbitrary vectors is not yet supported: .. _indexing.view_versus_copy: Returning a view versus a copy -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------ The rules about when a view on the data is returned are entirely dependent on NumPy. Whenever an array of labels or a boolean vector are involved in the @@ -970,7 +1280,7 @@ When assigning values to subsets of your data, thus, make sure to either use the pandas access methods or explicitly handle the assignment creating a copy. Fallback indexing -~~~~~~~~~~~~~~~~~~~~ +----------------- .. _indexing.fallback: @@ -1006,6 +1316,71 @@ convert to an integer index: df_new[(df_new['index'] >= 1.0) & (df_new['index'] < 2)] +.. _indexing.class: + +Index objects +------------- + +The pandas :class:`~pandas.Index` class and its subclasses can be viewed as +implementing an *ordered multiset*. Duplicates are allowed. However, if you try +to convert an :class:`~pandas.Index` object with duplicate entries into a +``set``, an exception will be raised. + +:class:`~pandas.Index` also provides the infrastructure necessary for +lookups, data alignment, and reindexing. The easiest way to create an +:class:`~pandas.Index` directly is to pass a ``list`` or other sequence to +:class:`~pandas.Index`: + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b']) + index + 'd' in index + +You can also pass a ``name`` to be stored in the index: + + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b'], name='something') + index.name + +Starting with pandas 0.5, the name, if set, will be shown in the console +display: + +.. ipython:: python + + index = Index(list(range(5)), name='rows') + columns = Index(['A', 'B', 'C'], name='cols') + df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) + df + df['A'] + + +Set operations on Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.set_ops: + +The three main operations are ``union (|)``, ``intersection (&)``, and ``diff +(-)``. 
These can be directly called as instance methods or used via overloaded +operators: + +.. ipython:: python + + a = Index(['c', 'b', 'a']) + b = Index(['c', 'e', 'd']) + a.union(b) + a | b + a & b + a - b + +The ``isin`` method of Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One additional operation is the ``isin`` method that works analogously to the +``Series.isin`` method found :ref:`here `. + .. _indexing.hierarchical: Hierarchical indexing (MultiIndex) @@ -1206,7 +1581,7 @@ mailing list. .. _indexing.xs: Cross-section with hierarchical index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The ``xs`` method of ``DataFrame`` additionally takes a level argument to make selecting data at a particular level of a MultiIndex easier. @@ -1238,8 +1613,8 @@ instance: print df2_aligned -The need for sortedness -~~~~~~~~~~~~~~~~~~~~~~~ +The need for sortedness with :class:`~pandas.MultiIndex` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **Caveat emptor**: the present implementation of ``MultiIndex`` requires that the labels be sorted for some of the slicing / indexing routines to work @@ -1311,8 +1686,8 @@ However: ... KeyError: Key length (3) was greater than MultiIndex lexsort depth (2) -Swapping levels with ``swaplevel`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Swapping levels with :meth:`~pandas.MultiIndex.swaplevel` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The ``swaplevel`` function can switch the order of two levels: @@ -1323,8 +1698,8 @@ The ``swaplevel`` function can switch the order of two levels: .. _indexing.reorderlevels: -Reordering levels with ``reorder_levels`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Reordering levels with :meth:`~pandas.MultiIndex.reorder_levels` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The ``reorder_levels`` function generalizes the ``swaplevel`` function, allowing you to permute the hierarchical index levels in one step: @@ -1354,68 +1729,9 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but if you compute the levels and labels yourself, please be careful. -.. _indexing.class: - -Index objects -------------- - -The pandas Index class and its subclasses can be viewed as implementing an -*ordered set* in addition to providing the support infrastructure necessary for -lookups, data alignment, and reindexing. The easiest way to create one directly -is to pass a list or other sequence to ``Index``: - -.. ipython:: python - - index = Index(['e', 'd', 'a', 'b']) - index - 'd' in index - -You can also pass a ``name`` to be stored in the index: - - -.. ipython:: python - - index = Index(['e', 'd', 'a', 'b'], name='something') - index.name - -Starting with pandas 0.5, the name, if set, will be shown in the console -display: - -.. ipython:: python - - index = Index(list(range(5)), name='rows') - columns = Index(['A', 'B', 'C'], name='cols') - df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) - df - df['A'] - - -Set operations on Index objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _indexing.set_ops: - -The three main operations are ``union (|)``, ``intersection (&)``, and ``diff -(-)``. These can be directly called as instance methods or used via overloaded -operators: - -.. 
ipython:: python - - a = Index(['c', 'b', 'a']) - b = Index(['c', 'e', 'd']) - a.union(b) - a | b - a & b - a - b - -``isin`` method of Index objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -One additional operation is the ``isin`` method that works analogously to the -``Series.isin`` method found :ref:`here `. Setting index metadata (``name(s)``, ``levels``, ``labels``) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------------------------------ .. _indexing.set_metadata: @@ -1444,7 +1760,7 @@ add an index after you've already done so. There are a couple of different ways. Add an index using DataFrame columns -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------ .. _indexing.set_index: @@ -1487,7 +1803,7 @@ the index in-place (without creating a new object): data Remove / reset the index, ``reset_index`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------------ As a convenience, there is a new function on DataFrame called ``reset_index`` which transfers the index values into the DataFrame's columns and sets a simple @@ -1518,7 +1834,7 @@ discards the index, instead of putting index values in the DataFrame's columns. deprecated. Adding an ad hoc index -~~~~~~~~~~~~~~~~~~~~~~ +---------------------- If you create an index yourself, you can just assign it to the ``index`` field: @@ -1531,9 +1847,9 @@ Indexing internal details .. note:: - The following is largely relevant for those actually working on the pandas - codebase. And the source code is still the best place to look at the - specifics of how things are implemented. + The following is largely relevant for those actually working on the pandas + codebase. The source code is still the best place to look at the specifics + of how things are implemented. In pandas there are a few objects implemented which can serve as valid containers for the axis labels: @@ -1545,6 +1861,8 @@ containers for the axis labels: - ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer data, such as time stamps - ``MultiIndex``: the standard hierarchical index object + - ``PeriodIndex``: An Index object with Period elements + - ``DatetimeIndex``: An Index object with Timestamp elements - ``date_range``: fixed frequency date range generated from a time rule or DateOffset. An ndarray of Python datetime objects diff --git a/doc/source/io.rst b/doc/source/io.rst index c29af29d2e63f..e30eb030afb88 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1962,7 +1962,7 @@ storing/selecting from homogeneous index DataFrames. store.select('df_mi') # the levels are automatically included as data columns - store.select('df_mi', Term('foo=bar')) + store.select('df_mi', 'foo=bar') .. _io.hdf5-query: @@ -1970,49 +1970,102 @@ storing/selecting from homogeneous index DataFrames. Querying a Table ~~~~~~~~~~~~~~~~ +.. warning:: + + This query capabilities have changed substantially starting in ``0.13.0``. + Queries from prior version are accepted (with a ``DeprecationWarning``) printed + if its not string-like. + ``select`` and ``delete`` operations have an optional criterion that can be specified to select/delete only a subset of the data. This allows one to have a very large on-disk table and retrieve only a portion of the data. -A query is specified using the ``Term`` class under the hood. +A query is specified using the ``Term`` class under the hood, as a boolean expression. 
- - 'index' and 'columns' are supported indexers of a DataFrame
- - 'major_axis', 'minor_axis', and 'items' are supported indexers of
+ - ``index`` and ``columns`` are supported indexers of a DataFrame
+ - ``major_axis``, ``minor_axis``, and ``items`` are supported indexers of
    the Panel
+ - if ``data_columns`` are specified, these can be used as additional indexers
+
+Valid comparison operators are:
+
+ - ``=, ==, !=, >, >=, <, <=``
+
+Valid boolean expressions are combined with:
+
+ - ``|`` : or
+ - ``&`` : and
+ - ``(`` and ``)`` : for grouping
+
+These rules are similar to how boolean expressions are used in pandas for
+indexing.
+
+.. note::
+
+   - ``=`` will be automatically expanded to the comparison operator ``==``
+   - ``~`` is the not operator, but can only be used in very limited
+     circumstances
+   - If a list/tuple of expressions is passed they will be combined via ``&``
+
+The following are valid expressions:
+
+ - ``'index>=date'``
+ - ``"columns=['A', 'D']"``
+ - ``"columns in ['A', 'D']"``
+ - ``'columns=A'``
+ - ``'columns==A'``
+ - ``"~(columns=['A','B'])"``
+ - ``'index>df.index[3] & string="bar"'``
+ - ``'(index>df.index[3] & index<=df.index[6]) | string="bar"'``
+ - ``"ts>=Timestamp('2012-02-01')"``
+ - ``"major_axis>=20130101"``
+
+The ``indexers`` are on the left-hand side of the sub-expression:

-Valid terms can be created from ``dict, list, tuple, or
-string``. Objects can be embeded as values. Allowed operations are: ``<,
-<=, >, >=, =, !=``. ``=`` will be inferred as an implicit set operation
-(e.g. if 2 or more values are provided). The following are all valid
-terms.

+ - ``columns``, ``major_axis``, ``ts``

- - ``dict(field = 'index', op = '>', value = '20121114')``
- - ``('index', '>', '20121114')``
- - ``'index > 20121114'``
- - ``('index', '>', datetime(2012, 11, 14))``
- - ``('index', ['20121114', '20121115'])``
- - ``('major_axis', '=', Timestamp('2012/11/14'))``
- - ``('minor_axis', ['A', 'B'])``

+The right-hand side of the sub-expression (after a comparison operator) can be:

-Queries are built up using a list of ``Terms`` (currently only
-**anding** of terms is supported). An example query for a panel might be
-specified as follows. ``['major_axis>20000102', ('minor_axis', '=',
-['A', 'B']) ]``. This is roughly translated to: `major_axis must be
-greater than the date 20000102 and the minor_axis must be A or B`

+ - functions that will be evaluated, e.g. ``Timestamp('2012-02-01')``
+ - strings, e.g. ``"bar"``
+ - date-like, e.g. ``20130101``, or ``"20130101"``
+ - lists, e.g. ``"['A','B']"``
+ - variables that are defined in the local namespace, e.g. ``date``
+
+Here are some examples:
+
+.. ipython:: python
+
+   dfq = DataFrame(randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10))
+   store.append('dfq',dfq,format='table',data_columns=True)
+
+Use boolean expressions, with in-line function evaluation.
+
+.. ipython:: python
+
+   store.select('dfq',"index>Timestamp('20130104') & columns=['A', 'B']")
+
+Use an inline column reference
+
+.. ipython:: python
+
+   store.select('dfq',where="A>0 or C>0")
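+
+Local variables can also be referenced directly in the expression, per the
+list above (a small sketch re-using the ``dfq`` table; ``cutoff`` is just an
+ordinary Python variable):
+
+.. ipython:: python
+
+   cutoff = Timestamp('20130105')
+   store.select('dfq', 'index > cutoff')
+
+Works with a Panel as well.

..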
ipython:: python store.append('wp',wp) store - store.select('wp', [ Term('major_axis>20000102'), Term('minor_axis', '=', ['A', 'B']) ]) + store.select('wp', "major_axis>Timestamp('20000102') & minor_axis=['A', 'B']") -The ``columns`` keyword can be supplied to select a list of columns to be returned, -this is equivalent to passing a ``Term('columns', list_of_columns_to_filter)``: +The ``columns`` keyword can be supplied to select a list of columns to be +returned, this is equivalent to passing a +``'columns=list_of_columns_to_filter'``: .. ipython:: python - store.select('df', columns=['A', 'B']) + store.select('df', "columns=['A', 'B']") ``start`` and ``stop`` parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. @@ -2023,10 +2076,18 @@ space. These are in terms of the total number of rows in a table. wp.to_frame() # limiting the search - store.select('wp',[ Term('major_axis>20000102'), - Term('minor_axis', '=', ['A','B']) ], + store.select('wp',"major_axis>20000102 & minor_axis=['A','B']", start=0, stop=10) +.. note:: + + ``select`` will raise a ``ValueError`` if the query expression has an unknown + variable reference. Usually this means that you are trying to select on a column + that is **not** a data_column. + + ``select`` will raise a ``SyntaxError`` if the query expression is not valid. + + .. _io.hdf5-timedelta: **Using timedelta64[ns]** @@ -2048,7 +2109,7 @@ specified in the format: ``()``, where float may be signed (and fra dftd['C'] = dftd['A']-dftd['B'] dftd store.append('dftd',dftd,data_columns=True) - store.select('dftd',Term("C","<","-3.5D")) + store.select('dftd',"C<'-3.5D'") Indexing ~~~~~~~~ @@ -2057,10 +2118,13 @@ You can create/modify an index for a table with ``create_table_index`` after data is already in the table (after and ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the -indexed dimension as the ``where``. **Indexes are automagically created -(starting 0.10.1)** on the indexables and any data columns you -specify. This behavior can be turned off by passing ``index=False`` to -``append``. +indexed dimension as the ``where``. + +.. note:: + + Indexes are automagically created (starting ``0.10.1``) on the indexables + and any data columns you specify. This behavior can be turned off by passing + ``index=False`` to ``append``. .. ipython:: python @@ -2117,7 +2181,7 @@ create a new table!) Iterator ~~~~~~~~ -Starting in 0.11, you can pass, ``iterator=True`` or ``chunksize=number_in_a_chunk`` +Starting in ``0.11.0``, you can pass, ``iterator=True`` or ``chunksize=number_in_a_chunk`` to ``select`` and ``select_as_multiple`` to return an iterator on the results. The default is 50,000 rows returned in a chunk. @@ -2151,7 +2215,7 @@ Advanced Queries To retrieve a single indexable or data column, use the method ``select_column``. This will, for example, enable you to get the index very quickly. These return a ``Series`` of the result, indexed by the row number. -These do not currently accept the ``where`` selector (coming soon) +These do not currently accept the ``where`` selector. .. ipython:: python diff --git a/doc/source/release.rst b/doc/source/release.rst index 0ed1f39d72cb5..b8a817a00403c 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -294,7 +294,15 @@ See :ref:`Internal Refactoring` Experimental Features ~~~~~~~~~~~~~~~~~~~~~ -.. 
_release:bug_fixes-0.13.0: +- The new :func:`~pandas.eval` function implements expression evaluation using + ``numexpr`` behind the scenes. This results in large speedups for complicated + expressions involving large DataFrames/Series. +- :class:`~pandas.DataFrame` has a new :meth:`~pandas.DataFrame.eval` that + evaluates an expression in the context of the ``DataFrame``. +- A :meth:`~pandas.DataFrame.query` method has been added that allows + you to select elements of a ``DataFrame`` using a natural query syntax nearly + identical to Python syntax. + Bug Fixes ~~~~~~~~~ diff --git a/doc/source/v0.10.0.txt b/doc/source/v0.10.0.txt index d0c0ecc148239..0c86add1225ad 100644 --- a/doc/source/v0.10.0.txt +++ b/doc/source/v0.10.0.txt @@ -262,7 +262,7 @@ Updated PyTables Support [ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ]) # removing data from tables - store.remove('wp', [ 'major_axis', '>', wp.major_axis[3] ]) + store.remove('wp', Term('major_axis>20000103')) store.select('wp') # deleting a store diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index c56af23e85eae..694281b813c3b 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -187,6 +187,96 @@ Indexing API Changes p p.loc[:,:,'C'] +HDFStore API Changes +~~~~~~~~~~~~~~~~~~~~ + + - Query Format Changes. A much more string-like query format is now supported. + + .. ipython:: python + + path = 'test_query.h5' + dfq = DataFrame(randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + dfq.to_hdf(path,'dfq',format='table',data_columns=True) + + Use boolean expressions, with in-line function evaluation. + + .. ipython:: python + + read_hdf(path,'dfq',where="index>Timestamp('20130104') & columns=['A', 'B']") + + Use an inline column reference + + .. ipython:: python + + read_hdf(path,'dfq',where="A>0 or C>0") + + See :ref:`the docs`. + + - Significant table writing performance improvements + - handle a passed ``Series`` in table format (:issue:`4330`) + - added an ``is_open`` property to indicate if the underlying file handle is_open; + a closed store will now report 'CLOSED' when viewing the store (rather than raising an error) + (:issue:`4409`) + - a close of a ``HDFStore`` now will close that instance of the ``HDFStore`` + but will only close the actual file if the ref count (by ``PyTables``) w.r.t. all of the open handles + are 0. Essentially you have a local instance of ``HDFStore`` referenced by a variable. Once you + close it, it will report closed. Other references (to the same file) will continue to operate + until they themselves are closed. Performing an action on a closed file will raise + ``ClosedFileError`` + + .. ipython:: python + + path = 'test.h5' + df = DataFrame(randn(10,2)) + store1 = HDFStore(path) + store2 = HDFStore(path) + store1.append('df',df) + store2.append('df2',df) + + store1 + store2 + store1.close() + store2 + store2.close() + store2 + + .. ipython:: python + :suppress: + + import os + os.remove(path) + + - removed the ``_quiet`` attribute, replace by a ``DuplicateWarning`` if retrieving + duplicate rows from a table (:issue:`4367`) + - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will + be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`) + - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`). + See :ref:`here` for an example. 
+
+ - the ``format`` keyword now replaces the ``table`` keyword; allowed values
+   are ``fixed(f)`` or ``table(t)``, with the same defaults as prior to 0.13.0,
+   e.g. ``put`` implies ``'fixed'`` or ``'f'`` (Fixed) format and ``append``
+   implies ``'table'`` or ``'t'`` (Table) format
+
+   .. ipython:: python
+
+      path = 'test.h5'
+      df = DataFrame(randn(10,2))
+      df.to_hdf(path,'df_table',format='table')
+      df.to_hdf(path,'df_table2',append=True)
+      df.to_hdf(path,'df_fixed')
+      with get_store(path) as store:
+          print store
+
+   .. ipython:: python
+      :suppress:
+
+      import os
+      os.remove('test.h5')
+      os.remove('test_query.h5')
+
+ - add the keyword ``dropna=True`` to ``append`` to change whether ALL nan rows are not written
+   to the store (default is ``True``, ALL nan rows are NOT written), also settable
+   via the option ``io.hdf.dropna_table`` (:issue:`4625`)

 Enhancements
 ~~~~~~~~~~~~

@@ -271,6 +361,90 @@ Enhancements
   is evaluated, respectively. See scipy docs.
 - DataFrame constructor now accepts a numpy masked record array (:issue:`3478`)
+
+.. _whatsnew_0130.experimental:
+
+Experimental
+~~~~~~~~~~~~
+
+- :func:`~pandas.eval`:
+
+  - The new :func:`~pandas.eval` function implements expression evaluation
+    using ``numexpr`` behind the scenes. This results in large speedups for
+    complicated expressions involving large DataFrames/Series. For example,
+
+    .. ipython:: python
+
+       nrows, ncols = 20000, 100
+       df1, df2, df3, df4 = [DataFrame(randn(nrows, ncols))
+                             for _ in xrange(4)]
+
+    .. ipython:: python
+
+       %timeit pd.eval('df1 + df2 + df3 + df4')
+
+    For more details, see the :ref:`enhancing performance documentation on
+    eval <enhancingperf.eval>`
+
+- :meth:`~pandas.DataFrame.eval`
+
+  - Similar to :func:`~pandas.eval`, :class:`~pandas.DataFrame` has a new
+    :meth:`~pandas.DataFrame.eval` that evaluates an expression in the context
+    of the ``DataFrame``. For example,
+
+    .. ipython:: python
+       :suppress:
+
+       try:
+           del a
+       except NameError:
+           pass
+
+       try:
+           del b
+       except NameError:
+           pass
+
+    .. ipython:: python
+
+       df = DataFrame(randn(10, 2), columns=['a', 'b'])
+       df.eval('a + b')
+
+- :meth:`~pandas.DataFrame.query`
+
+  - In 0.13 a :meth:`~pandas.DataFrame.query` method has been added that allows
+    you to select elements of a ``DataFrame`` using a natural query syntax
+    nearly identical to Python syntax. For example,
+
+    .. ipython:: python
+       :suppress:
+
+       try:
+           del a
+       except NameError:
+           pass
+
+       try:
+           del b
+       except NameError:
+           pass
+
+       try:
+           del c
+       except NameError:
+           pass
+
+    .. ipython:: python
+
+       n = 20
+       df = DataFrame(randint(n, size=(n, 3)), columns=['a', 'b', 'c'])
+       df.query('a < b < c')
+
+    selects all the rows of ``df`` where ``a < b < c`` evaluates to ``True``.
+    For more details see the :ref:`indexing documentation on query
+    <indexing.query>`.
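+
+    For comparison, this is the equivalent selection written with plain
+    boolean indexing (the same result, just more verbose):
+
+    .. ipython:: python
+
+       df[(df.a < df.b) & (df.b < df.c)]

..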
_whatsnew_0130.refactoring: Internal Refactoring diff --git a/pandas/__init__.py b/pandas/__init__.py index 03681d3fa5a3f..c4c012d6c5095 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -42,6 +42,7 @@ from pandas.stats.api import * from pandas.tseries.api import * from pandas.io.api import * +from pandas.computation.api import * from pandas.util.testing import debug diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 12c929cd59820..10e1464739203 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -46,11 +46,13 @@ from StringIO import StringIO BytesIO = StringIO import cPickle + import httplib except ImportError: import builtins from io import StringIO, BytesIO cStringIO = StringIO import pickle as cPickle + import http.client as httplib if PY3: diff --git a/pandas/computation/__init__.py b/pandas/computation/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/computation/align.py b/pandas/computation/align.py new file mode 100644 index 0000000000000..60975bdc8a5b4 --- /dev/null +++ b/pandas/computation/align.py @@ -0,0 +1,247 @@ +"""Core eval alignment algorithms +""" + +import warnings +from functools import partial, wraps +from pandas.compat import zip, range + +import numpy as np + +import pandas as pd +from pandas import compat +import pandas.core.common as com + + +def _align_core_single_unary_op(term): + if isinstance(term.value, np.ndarray): + typ = partial(np.asanyarray, dtype=term.value.dtype) + else: + typ = type(term.value) + ret = typ, + + if not hasattr(term.value, 'axes'): + ret += None, + else: + ret += _zip_axes_from_type(typ, term.value.axes), + return ret + + +def _zip_axes_from_type(typ, new_axes): + axes = {} + for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES): + axes[ax_name] = new_axes[ax_ind] + return axes + + +def _maybe_promote_shape(values, naxes): + # test to see if we have an array else leave since must be a number + if not isinstance(values, np.ndarray): + return values + + ndims = values.ndim + if ndims > naxes: + raise AssertionError('cannot have more dims than axes, ' + '{0} > {1}'.format(ndims, naxes)) + if ndims == naxes: + return values + + ndim, nax = range(ndims), range(naxes) + + axes_slice = [slice(None)] * naxes + + # set difference of numaxes and ndims + slices = list(set(nax) - set(ndim)) + + if ndims == naxes: + if slices: + raise AssertionError('slices should be empty if ndims == naxes ' + '{0}'.format(slices)) + else: + if not slices: + raise AssertionError('slices should NOT be empty if ndim != naxes ' + '{0}'.format(slices)) + + for sl in slices: + axes_slice[sl] = np.newaxis + + return values[tuple(axes_slice)] + + +def _any_pandas_objects(terms): + """Check a sequence of terms for instances of PandasObject.""" + return any(isinstance(term.value, pd.core.generic.PandasObject) + for term in terms) + + +def _filter_special_cases(f): + @wraps(f) + def wrapper(terms): + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + + term_values = (term.value for term in terms) + # only scalars or indexes + if all(isinstance(term.value, pd.Index) or term.isscalar for term in + terms): + return np.result_type(*term_values), None + + # single element ndarrays + all_has_size = all(hasattr(term.value, 'size') for term in terms) + if all_has_size and all(term.value.size == 1 for term in terms): + return np.result_type(*term_values), None + + # no pandas objects + if not _any_pandas_objects(terms): + return 
np.result_type(*term_values), None + + return f(terms) + return wrapper + + +@_filter_special_cases +def _align_core(terms): + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, + 'axes')] + term_dims = [terms[i].value.ndim for i in term_index] + ndims = pd.Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()].value + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + + for term in (terms[i] for i in term_index): + for axis, items in enumerate(term.value.axes): + if isinstance(term.value, pd.Series) and naxes > 1: + ax, itm = naxes - 1, term.value.index + else: + ax, itm = axis, items + axes[ax] = axes[ax].join(itm, how='outer') + + for i, ndim in compat.iteritems(ndims): + for axis, items in zip(range(ndim), axes): + ti = terms[i].value + + if hasattr(ti, 'reindex_axis'): + transpose = isinstance(ti, pd.Series) and naxes > 1 + reindexer = axes[naxes - 1] if transpose else items + + term_axis_size = len(ti.axes[axis]) + reindexer_size = len(reindexer) + + ordm = np.log10(abs(reindexer_size - term_axis_size)) + if ordm >= 1 and reindexer_size >= 10000: + warnings.warn("Alignment difference on axis {0} is larger" + " than an order of magnitude on term {1!r}, " + "by more than {2:.4g}; performance may suffer" + "".format(axis, term.name, ordm), + category=pd.io.common.PerformanceWarning) + + if transpose: + f = partial(ti.reindex, index=reindexer, copy=False) + else: + f = partial(ti.reindex_axis, reindexer, axis=axis, + copy=False) + + if pd.lib.is_bool_array(ti.values): + r = f(fill_value=True) + else: + r = f() + + terms[i].update(r) + + res = _maybe_promote_shape(terms[i].value.T if transpose else + terms[i].value, naxes) + res = res.T if transpose else res + + try: + v = res.values + except AttributeError: + v = res + terms[i].update(v) + + return typ, _zip_axes_from_type(typ, axes) + + +def _filter_terms(flat): + # numeric literals + literals = frozenset(filter(lambda x: isinstance(x, Constant), flat)) + + # these are strings which are variable names + names = frozenset(flat) - literals + + # literals are not names and names are not literals, so intersection should + # be empty + if literals & names: + raise ValueError('literals cannot be names and names cannot be ' + 'literals') + return names, literals + + +def _align(terms): + """Align a set of terms""" + try: + # flatten the parse tree (a nested list, really) + terms = list(com.flatten(terms)) + except TypeError: + # can't iterate so it must just be a constant or single variable + if isinstance(terms.value, pd.core.generic.NDFrame): + typ = type(terms.value) + return typ, _zip_axes_from_type(typ, terms.value.axes) + return np.result_type(terms.type), None + + # if all resolved variables are numeric scalars + if all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)).type, None + + # perform the main alignment + typ, axes = _align_core(terms) + return typ, axes + + +def _reconstruct_object(typ, obj, axes, dtype): + """Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + ret : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. 
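+
+    Notes
+    -----
+    ``dtype`` (the fourth argument) is the target dtype; when the raw value
+    carries its own ``dtype`` the two are promoted via ``numpy.result_type``.
+
+    For example (an illustrative sketch only)::
+
+        import numpy as np
+        from pandas import Index, Series
+
+        axes = {'index': Index([0, 1, 2])}
+        _reconstruct_object(Series, np.array([1., 2., 3.]), axes,
+                            np.dtype('float64'))
+        # -> Series([1.0, 2.0, 3.0], index=[0, 1, 2])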
+ """ + try: + typ = typ.type + except AttributeError: + pass + + try: + res_t = np.result_type(obj.dtype, dtype) + except AttributeError: + res_t = dtype + + if (not isinstance(typ, partial) and + issubclass(typ, pd.core.generic.PandasObject)): + return typ(obj, dtype=res_t, **axes) + + # special case for pathological things like ~True/~False + if hasattr(res_t, 'type') and typ == np.bool_ and res_t != np.bool_: + ret_value = res_t.type(obj) + else: + ret_value = typ(obj).astype(res_t) + + try: + ret = ret_value.item() + except ValueError: + ret = ret_value + return ret diff --git a/pandas/computation/api.py b/pandas/computation/api.py new file mode 100644 index 0000000000000..db8269a497768 --- /dev/null +++ b/pandas/computation/api.py @@ -0,0 +1,2 @@ +from pandas.computation.eval import eval +from pandas.computation.expr import Expr diff --git a/pandas/computation/common.py b/pandas/computation/common.py new file mode 100644 index 0000000000000..9af2197a4fd69 --- /dev/null +++ b/pandas/computation/common.py @@ -0,0 +1,13 @@ +import numpy as np +import pandas as pd + + +def _ensure_decoded(s): + """ if we have bytes, decode them to unicode """ + if isinstance(s, (np.bytes_, bytes)): + s = s.decode(pd.get_option('display.encoding')) + return s + + +class NameResolutionError(NameError): + pass diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py new file mode 100644 index 0000000000000..88efc9eeab5d5 --- /dev/null +++ b/pandas/computation/engines.py @@ -0,0 +1,125 @@ +"""Engine classes for :func:`~pandas.eval` +""" + +import abc + +from pandas import compat +from pandas.core import common as com +from pandas.computation.align import _align, _reconstruct_object +from pandas.computation.ops import UndefinedVariableError + + +class AbstractEngine(object): + """Object serving as a base class for all engines.""" + + __metaclass__ = abc.ABCMeta + + has_neg_frac = False + + def __init__(self, expr): + self.expr = expr + self.aligned_axes = None + self.result_type = None + + def convert(self): + """Convert an expression for evaluation. + + Defaults to return the expression as a string. + """ + return com.pprint_thing(self.expr) + + def pre_evaluate(self): + self.expr.check_name_clashes() + + def evaluate(self): + """Run the engine on the expression + + This method performs alignment which is necessary no matter what engine + is being used, thus its implementation is in the base class. + + Returns + ------- + obj : object + The result of the passed expression. + """ + if not self._is_aligned: + self.result_type, self.aligned_axes = _align(self.expr.terms) + + # make sure no names in resolvers and locals/globals clash + self.pre_evaluate() + res = self._evaluate() + return _reconstruct_object(self.result_type, res, self.aligned_axes, + self.expr.terms.return_type) + + @property + def _is_aligned(self): + return self.aligned_axes is not None and self.result_type is not None + + @abc.abstractmethod + def _evaluate(self): + """Return an evaluated expression. + + Parameters + ---------- + env : Scope + The local and global environment in which to evaluate an + expression. + + Notes + ----- + Must be implemented by subclasses. 
+ """ + pass + + +class NumExprEngine(AbstractEngine): + """NumExpr engine class""" + has_neg_frac = True + + def __init__(self, expr): + super(NumExprEngine, self).__init__(expr) + + def convert(self): + return str(super(NumExprEngine, self).convert()) + + def _evaluate(self): + import numexpr as ne + + # add the resolvers to locals + self.expr.add_resolvers_to_locals() + + # convert the expression to a valid numexpr expression + s = self.convert() + + try: + return ne.evaluate(s, local_dict=self.expr.env.locals, + global_dict=self.expr.env.globals, + truediv=self.expr.truediv) + except KeyError as e: + # python 3 compat kludge + try: + msg = e.message + except AttributeError: + msg = compat.text_type(e) + raise UndefinedVariableError(msg) + + +class PythonEngine(AbstractEngine): + """Evaluate an expression in Python space. + + Mostly for testing purposes. + """ + has_neg_frac = False + + def __init__(self, expr): + super(PythonEngine, self).__init__(expr) + + def evaluate(self): + self.pre_evaluate() + return self.expr() + + def _evaluate(self): + pass + + +_engines = {'numexpr': NumExprEngine, 'python': PythonEngine} diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py new file mode 100644 index 0000000000000..36b1e2bc96090 --- /dev/null +++ b/pandas/computation/eval.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python + +"""Top level ``eval`` module. +""" + +import numbers +import numpy as np + +from pandas.core import common as com +from pandas.compat import string_types +from pandas.computation.expr import Expr, _parsers, _ensure_scope +from pandas.computation.engines import _engines + + +def _check_engine(engine): + """Make sure a valid engine is passed. + + Parameters + ---------- + engine : str + + Raises + ------ + KeyError + * If an invalid engine is passed + ImportError + * If numexpr was requested but doesn't exist + """ + if engine not in _engines: + raise KeyError('Invalid engine {0!r} passed, valid engines are' + ' {1}'.format(engine, list(_engines.keys()))) + + # TODO: validate this in a more general way (thinking of future engines + # that won't necessarily be import-able) + # Could potentially be done on engine instantiation + if engine == 'numexpr': + try: + import numexpr + except ImportError: + raise ImportError("'numexpr' not found. Cannot use " + "engine='numexpr' if 'numexpr' is not installed") + + +def _check_parser(parser): + """Make sure a valid parser is passed. + + Parameters + ---------- + parser : str + + Raises + ------ + KeyError + * If an invalid parser is passed + """ + if parser not in _parsers: + raise KeyError('Invalid parser {0!r} passed, valid parsers are' + ' {1}'.format(parser, _parsers.keys())) + + +def _check_resolvers(resolvers): + if resolvers is not None: + for resolver in resolvers: + if not hasattr(resolver, '__getitem__'): + name = type(resolver).__name__ + raise AttributeError('Resolver of type {0!r} must implement ' + 'the __getitem__ method'.format(name)) + + +def _check_expression(expr): + """Make sure an expression is not an empty string + + Parameters + ---------- + expr : object + An object that can be converted to a string + + Raises + ------ + ValueError + * If expr is an empty string + """ + if not expr: + raise ValueError("expr cannot be an empty string") + + +def _convert_expression(expr): + """Convert an object to an expression. + + Thus function converts an object to an expression (a unicode string) and + checks to make sure it isn't empty after conversion. 
This is used to + convert operators to their string representation for recursive calls to + :func:`~pandas.eval`. + + Parameters + ---------- + expr : object + The object to be converted to a string. + + Returns + ------- + s : unicode + The string representation of an object. + + Raises + ------ + ValueError + * If the expression is empty. + """ + s = com.pprint_thing(expr) + _check_expression(s) + return s + + +def eval(expr, parser='pandas', engine='numexpr', truediv=True, + local_dict=None, global_dict=None, resolvers=None, level=2): + """Evaluate a Python expression as a string using various backends. + + The following arithmetic operations are supported: ``+``, ``-``, ``*``, + ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following + boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. :class:`~pandas.Series` and + :class:`~pandas.DataFrame` objects are supported and behave as they would + with plain ol' Python evaluation. + + Parameters + ---------- + expr : str or unicode + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. + parser : string, default 'pandas', {'pandas', 'python'} + The parser to use to construct the syntax tree from the expression. The + default of ``'pandas'`` parses code slightly different than standard + Python. Alternatively, you can parse an expression using the + ``'python'`` parser to retain strict Python semantics. See the + :ref:`enhancing performance ` documentation for + more details. + engine : string, default 'numexpr', {'python', 'numexpr'} + + The engine used to evaluate the expression. Supported engines are + + - ``'numexpr'``: This default engine evaluates pandas objects using + numexpr for large speed ups in complex expressions + with large frames. + - ``'python'``: Performs operations as if you had ``eval``'d in top + level python. This engine is generally not that useful. + + More backends may be available in the future. + + truediv : bool, optional + Whether to use true division, like in Python >= 3 + local_dict : dict or None, optional + A dictionary of local variables, taken from locals() by default. + global_dict : dict or None, optional + A dictionary of global variables, taken from globals() by default. + resolvers : list of dict-like or None, optional + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~pandas.DataFrame.query` method to inject the + :attr:`~pandas.DataFrame.index` and :attr:`~pandas.DataFrame.columns` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. + level : int, optional + The number of prior stack frames to traverse and add to the current + scope. Most users will **not** need to change this parameter. + + Returns + ------- + ndarray, numeric scalar, DataFrame, Series + + Notes + ----- + The ``dtype`` of any objects involved in an arithmetic ``%`` operation are + recursively cast to ``float64``. + + See the :ref:`enhancing performance ` documentation for + more details. 
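+
+    Examples
+    --------
+    A minimal sketch (illustrative only; frames are resolved from the
+    calling scope unless passed via ``local_dict``)::
+
+        import pandas as pd
+        from pandas import DataFrame
+
+        df = DataFrame({'a': [1, 2], 'b': [3, 4]})
+        pd.eval('df.a + df.b')      # -> Series([4, 6])
+        pd.eval('df.a + df.b > 5')  # -> boolean Series([False, True])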
+ + See Also + -------- + pandas.DataFrame.query + pandas.DataFrame.eval + """ + expr = _convert_expression(expr) + _check_engine(engine) + _check_parser(parser) + _check_resolvers(resolvers) + + # get our (possibly passed-in) scope + env = _ensure_scope(global_dict=global_dict, local_dict=local_dict, + resolvers=resolvers, level=level) + + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, + truediv=truediv) + + # construct the engine and evaluate the parsed expression + eng = _engines[engine] + eng_inst = eng(parsed_expr) + ret = eng_inst.evaluate() + return ret diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py new file mode 100644 index 0000000000000..ff9adc26b8201 --- /dev/null +++ b/pandas/computation/expr.py @@ -0,0 +1,763 @@ +""":func:`~pandas.eval` parsers +""" + +import ast +import operator +import sys +import inspect +import tokenize +import datetime +import struct + +from functools import partial + +import pandas as pd +from pandas import compat +from pandas.compat import StringIO, zip, reduce, string_types +from pandas.core.base import StringMixin +from pandas.core import common as com +from pandas.computation.common import NameResolutionError +from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms, + _arith_ops_syms, _unary_ops_syms, is_term) +from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG +from pandas.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div + + +def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, + **kwargs): + """Ensure that we are grabbing the correct scope.""" + return Scope(gbls=global_dict, lcls=local_dict, level=level, + resolvers=resolvers) + + +def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys): + """Make sure that variables in resolvers don't overlap with locals or + globals. + """ + res_locals = list(com.intersection(resolver_keys, local_keys)) + if res_locals: + msg = "resolvers and locals overlap on names {0}".format(res_locals) + raise NameResolutionError(msg) + + res_globals = list(com.intersection(resolver_keys, global_keys)) + if res_globals: + msg = "resolvers and globals overlap on names {0}".format(res_globals) + raise NameResolutionError(msg) + + +def _replacer(x, pad_size): + """Replace a number with its padded hexadecimal representation. Used to tag + temporary variables with their calling scope's id. + """ + # get the hex repr of the binary char and remove 0x and pad by pad_size + # zeros + try: + hexin = ord(x) + except TypeError: + # bytes literals masquerade as ints when iterating in py3 + hexin = x + + return hex(hexin).replace('0x', '').rjust(pad_size, '0') + + +def _raw_hex_id(obj, pad_size=2): + """Return the padded hexadecimal id of ``obj``.""" + # interpret as a pointer since that's what really what id returns + packed = struct.pack('@P', id(obj)) + + return ''.join(_replacer(x, pad_size) for x in packed) + + +class Scope(StringMixin): + """Object to hold scope, with a few bells to deal with some custom syntax + added by pandas. 
+
+    Parameters
+    ----------
+    gbls : dict or None, optional, default None
+    lcls : dict or Scope or None, optional, default None
+    level : int, optional, default 1
+    resolvers : list-like or None, optional, default None
+
+    Attributes
+    ----------
+    globals : dict
+    locals : dict
+    level : int
+    resolvers : tuple
+    resolver_keys : frozenset
+    """
+    __slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers',
+                 'resolver_keys', '_resolver', 'level', 'ntemps')
+
+    def __init__(self, gbls=None, lcls=None, level=1, resolvers=None):
+        self.level = level
+        self.resolvers = tuple(resolvers or [])
+        self.globals = dict()
+        self.locals = dict()
+        self.ntemps = 1  # number of temporary variables in this scope
+
+        if isinstance(lcls, Scope):
+            ld, lcls = lcls, dict()
+            self.locals.update(ld.locals.copy())
+            self.globals.update(ld.globals.copy())
+            self.resolvers += ld.resolvers
+            self.update(ld.level)
+
+        frame = sys._getframe(level)
+        try:
+            self.globals.update(gbls or frame.f_globals)
+            self.locals.update(lcls or frame.f_locals)
+        finally:
+            del frame
+
+        # add some useful defaults
+        self.globals['Timestamp'] = pd.lib.Timestamp
+        self.globals['datetime'] = datetime
+
+        # SUCH a hack
+        self.globals['True'] = True
+        self.globals['False'] = False
+
+        res_keys = (list(o.keys()) for o in self.resolvers)
+        self.resolver_keys = frozenset(reduce(operator.add, res_keys, []))
+        self._global_resolvers = self.resolvers + (self.locals, self.globals)
+        self._resolver = None
+
+        self.resolver_dict = {}
+        for o in self.resolvers:
+            self.resolver_dict.update(dict(o))
+
+    def __unicode__(self):
+        return com.pprint_thing("locals: {0}\nglobals: {1}\nresolvers: "
+                                "{2}".format(list(self.locals.keys()),
+                                             list(self.globals.keys()),
+                                             list(self.resolver_keys)))
+
+    def __getitem__(self, key):
+        return self.resolve(key, globally=False)
+
+    def resolve(self, key, globally=False):
+        resolvers = self.locals, self.globals
+        if globally:
+            resolvers = self._global_resolvers
+
+        for resolver in resolvers:
+            try:
+                return resolver[key]
+            except KeyError:
+                pass
+
+    def update(self, level=None):
+        """Update the current scope by going back `level` levels.
+
+        Parameters
+        ----------
+        level : int or None, optional, default None
+        """
+        # we are always 2 levels below the caller
+        # plus the caller may be below the env level
+        # in which case we need additional levels
+        sl = 2
+        if level is not None:
+            sl += level
+
+        # add sl frames to the scope starting with the
+        # most distant and overwriting with more current
+        # makes sure that we can capture variable scope
+        frame = inspect.currentframe()
+        try:
+            frames = []
+            while sl >= 0:
+                frame = frame.f_back
+                sl -= 1
+                frames.append(frame)
+            for f in frames[::-1]:
+                self.locals.update(f.f_locals)
+                self.globals.update(f.f_globals)
+        finally:
+            del frame, frames
+
+    def add_tmp(self, value, where='locals'):
+        """Add a temporary variable to the scope.
+
+        Parameters
+        ----------
+        value : object
+            An arbitrary object to be assigned to a temporary variable.
+        where : basestring, optional, default 'locals', {'locals', 'globals'}
+            What scope to add the value to.
+
+        Returns
+        -------
+        name : basestring
+            The name of the temporary variable created.
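+
+        For example (an illustrative sketch; the generated name embeds the
+        value's type, a per-scope counter, and this scope's id)::
+
+            name = scope.add_tmp([1, 2, 3])  # e.g. 'tmp_var_list_1_...'
+            scope.locals[name]               # [1, 2, 3]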
+ """ + d = getattr(self, where, None) + + if d is None: + raise AttributeError("Cannot add value to non-existent scope " + "{0!r}".format(where)) + if not isinstance(d, dict): + raise TypeError("Cannot add value to object of type {0!r}, " + "scope must be a dictionary" + "".format(type(d).__name__)) + name = 'tmp_var_{0}_{1}_{2}'.format(type(value).__name__, self.ntemps, + _raw_hex_id(self)) + d[name] = value + + # only increment if the variable gets put in the scope + self.ntemps += 1 + return name + + def remove_tmp(self, name, where='locals'): + d = getattr(self, where, None) + if d is None: + raise AttributeError("Cannot remove value from non-existent scope " + "{0!r}".format(where)) + if not isinstance(d, dict): + raise TypeError("Cannot remove value from object of type {0!r}, " + "scope must be a dictionary" + "".format(type(d).__name__)) + del d[name] + self.ntemps -= 1 + + +def _rewrite_assign(source): + """Rewrite the assignment operator for PyTables expression that want to use + ``=`` as a substitute for ``==``. + """ + res = [] + g = tokenize.generate_tokens(StringIO(source).readline) + for toknum, tokval, _, _, _ in g: + res.append((toknum, '==' if tokval == '=' else tokval)) + return tokenize.untokenize(res) + + +def _replace_booleans(source): + """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise + precedence is changed to boolean precedence. + """ + return source.replace('|', ' or ').replace('&', ' and ') + + +def _replace_locals(source, local_symbol='@'): + """Replace local variables with a syntacticall valid name.""" + return source.replace(local_symbol, _LOCAL_TAG) + + +def _preparse(source): + """Compose assignment and boolean replacement.""" + return _replace_booleans(_rewrite_assign(source)) + + +def _is_type(t): + """Factory for a type checking function of type ``t`` or tuple of types.""" + return lambda x: isinstance(x.value, t) + + +_is_list = _is_type(list) +_is_str = _is_type(string_types) + + +# partition all AST nodes +_all_nodes = frozenset(filter(lambda x: isinstance(x, type) and + issubclass(x, ast.AST), + (getattr(ast, node) for node in dir(ast)))) + + +def _filter_nodes(superclass, all_nodes=_all_nodes): + """Filter out AST nodes that are subclasses of ``superclass``.""" + node_names = (node.__name__ for node in all_nodes + if issubclass(node, superclass)) + return frozenset(node_names) + + +_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes)) +_mod_nodes = _filter_nodes(ast.mod) +_stmt_nodes = _filter_nodes(ast.stmt) +_expr_nodes = _filter_nodes(ast.expr) +_expr_context_nodes = _filter_nodes(ast.expr_context) +_slice_nodes = _filter_nodes(ast.slice) +_boolop_nodes = _filter_nodes(ast.boolop) +_operator_nodes = _filter_nodes(ast.operator) +_unary_op_nodes = _filter_nodes(ast.unaryop) +_cmp_op_nodes = _filter_nodes(ast.cmpop) +_comprehension_nodes = _filter_nodes(ast.comprehension) +_handler_nodes = _filter_nodes(ast.excepthandler) +_arguments_nodes = _filter_nodes(ast.arguments) +_keyword_nodes = _filter_nodes(ast.keyword) +_alias_nodes = _filter_nodes(ast.alias) + + +# nodes that we don't support directly but are needed for parsing +_hacked_nodes = frozenset(['Assign', 'Module', 'Expr']) + + +_unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp', + 'DictComp', 'SetComp', 'Repr', 'Lambda', + 'Set', 'AST', 'Is', 'IsNot']) + +# these nodes are low priority or won't ever be supported (e.g., AST) +_unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes | + _arguments_nodes | _keyword_nodes | _alias_nodes | 
+                       _expr_context_nodes | _unsupported_expr_nodes) -
+                      _hacked_nodes)
+
+# we rewrite assignment in some cases to be an equality comparison, and we
+# don't want `stmt` and friends in there, so get only the classes whose names
+# are capitalized
+_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
+_msg = 'cannot both support and not support {0}'.format(_unsupported_nodes &
+                                                        _base_supported_nodes)
+assert not _unsupported_nodes & _base_supported_nodes, _msg
+
+
+def _node_not_implemented(node_name, cls):
+    """Return a function that raises a NotImplementedError with a passed node
+    name.
+    """
+    def f(self, *args, **kwargs):
+        raise NotImplementedError("{0!r} nodes are not "
+                                  "implemented".format(node_name))
+    return f
+
+
+def disallow(nodes):
+    """Decorator to disallow certain nodes from parsing. Raises a
+    NotImplementedError instead.
+
+    Returns
+    -------
+    disallowed : callable
+    """
+    def disallowed(cls):
+        cls.unsupported_nodes = ()
+        for node in nodes:
+            new_method = _node_not_implemented(node, cls)
+            name = 'visit_{0}'.format(node)
+            cls.unsupported_nodes += (name,)
+            setattr(cls, name, new_method)
+        return cls
+    return disallowed
+
+
+def _op_maker(op_class, op_symbol):
+    """Return a function to create an op class with its symbol already passed.
+
+    Returns
+    -------
+    f : callable
+    """
+    def f(self, node, *args, **kwargs):
+        """Return a partial function with an Op subclass with an operator
+        already passed.
+
+        Returns
+        -------
+        f : callable
+        """
+        return partial(op_class, op_symbol, *args, **kwargs)
+    return f
+
+
+_op_classes = {'binary': BinOp, 'unary': UnaryOp}
+
+
+def add_ops(op_classes):
+    """Decorator to add default implementation of ops."""
+    def f(cls):
+        for op_attr_name, op_class in compat.iteritems(op_classes):
+            ops = getattr(cls, '{0}_ops'.format(op_attr_name))
+            ops_map = getattr(cls, '{0}_op_nodes_map'.format(op_attr_name))
+            for op in ops:
+                op_node = ops_map[op]
+                if op_node is not None:
+                    made_op = _op_maker(op_class, op)
+                    setattr(cls, 'visit_{0}'.format(op_node), made_op)
+        return cls
+    return f
+
+
+@disallow(_unsupported_nodes)
+@add_ops(_op_classes)
+class BaseExprVisitor(ast.NodeVisitor):
+    """Custom ast walker. Parsers of other engines should subclass this class
+    if necessary.
+ + Parameters + ---------- + env : Scope + engine : str + parser : str + preparser : callable + """ + const_type = Constant + term_type = Term + + binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms + binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'In', 'NotIn', + 'BitAnd', 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult', + None, 'Pow', 'FloorDiv', 'Mod') + binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) + + unary_ops = _unary_ops_syms + unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not' + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + + rewrite_map = { + ast.Eq: ast.In, + ast.NotEq: ast.NotIn, + ast.In: ast.In, + ast.NotIn: ast.NotIn + } + + def __init__(self, env, engine, parser, preparser=_preparse): + self.env = env + self.engine = engine + self.parser = parser + self.preparser = preparser + + def visit(self, node, **kwargs): + if isinstance(node, string_types): + clean = self.preparser(node) + node = ast.fix_missing_locations(ast.parse(clean)) + elif not isinstance(node, ast.AST): + raise TypeError("Cannot visit objects of type {0!r}" + "".format(node.__class__.__name__)) + + method = 'visit_' + node.__class__.__name__ + visitor = getattr(self, method) + return visitor(node, **kwargs) + + def visit_Module(self, node, **kwargs): + if len(node.body) != 1: + raise SyntaxError('only a single expression is allowed') + expr = node.body[0] + return self.visit(expr, **kwargs) + + def visit_Expr(self, node, **kwargs): + return self.visit(node.value, **kwargs) + + def _rewrite_membership_op(self, node, left, right): + # the kind of the operator (is actually an instance) + op_instance = node.op + op_type = type(op_instance) + + # must be two terms and the comparison operator must be ==/!=/in/not in + if is_term(left) and is_term(right) and op_type in self.rewrite_map: + + left_list, right_list = map(_is_list, (left, right)) + left_str, right_str = map(_is_str, (left, right)) + + # if there are any strings or lists in the expression + if left_list or right_list or left_str or right_str: + op_instance = self.rewrite_map[op_type]() + + # pop the string variable out of locals and replace it with a list + # of one string, kind of a hack + if right_str: + self.env.remove_tmp(right.name) + name = self.env.add_tmp([right.value]) + right = self.term_type(name, self.env) + + if left_str: + self.env.remove_tmp(left.name) + name = self.env.add_tmp([left.value]) + left = self.term_type(name, self.env) + + op = self.visit(op_instance) + return op, op_instance, left, right + + def _possibly_transform_eq_ne(self, node, left=None, right=None): + if left is None: + left = self.visit(node.left, side='left') + if right is None: + right = self.visit(node.right, side='right') + op, op_class, left, right = self._rewrite_membership_op(node, left, + right) + return op, op_class, left, right + + def _possibly_eval(self, binop, eval_in_python): + # eval `in` and `not in` (for now) in "partial" python space + # things that can be evaluated in "eval" space will be turned into + # temporary variables. 
for example,
+        #   [1, 2] in a + 2 * b
+        # in that case a + 2 * b will be evaluated using numexpr, and the "in"
+        # call will be evaluated using isin (in python space)
+        return binop.evaluate(self.env, self.engine, self.parser,
+                              self.term_type, eval_in_python)
+
+    def _possibly_evaluate_binop(self, op, op_class, lhs, rhs,
+                                 eval_in_python=('in', 'not in'),
+                                 maybe_eval_in_python=('==', '!=')):
+        res = op(lhs, rhs)
+
+        # "in"/"not in" ops are always evaluated in python
+        if res.op in eval_in_python:
+            return self._possibly_eval(res, eval_in_python)
+        elif ((lhs.return_type == object or rhs.return_type == object) and
+              self.engine != 'pytables'):
+            # evaluate "==" and "!=" in python if either of our operands has an
+            # object return type
+            return self._possibly_eval(res, eval_in_python +
+                                       maybe_eval_in_python)
+        return res
+
+    def visit_BinOp(self, node, **kwargs):
+        op, op_class, left, right = self._possibly_transform_eq_ne(node)
+        return self._possibly_evaluate_binop(op, op_class, left, right)
+
+    def visit_Div(self, node, **kwargs):
+        return lambda lhs, rhs: Div(lhs, rhs,
+                                    truediv=self.env.locals['truediv'])
+
+    def visit_UnaryOp(self, node, **kwargs):
+        op = self.visit(node.op)
+        operand = self.visit(node.operand)
+        return op(operand)
+
+    def visit_Name(self, node, **kwargs):
+        return self.term_type(node.id, self.env, **kwargs)
+
+    def visit_Num(self, node, **kwargs):
+        return self.const_type(node.n, self.env)
+
+    def visit_Str(self, node, **kwargs):
+        name = self.env.add_tmp(node.s)
+        return self.term_type(name, self.env)
+
+    def visit_List(self, node, **kwargs):
+        name = self.env.add_tmp([self.visit(e).value for e in node.elts])
+        return self.term_type(name, self.env)
+
+    visit_Tuple = visit_List
+
+    def visit_Index(self, node, **kwargs):
+        """ df.index[4] """
+        return self.visit(node.value)
+
+    def visit_Subscript(self, node, **kwargs):
+        value = self.visit(node.value)
+        slobj = self.visit(node.slice)
+        result = pd.eval(slobj, local_dict=self.env, engine=self.engine,
+                         parser=self.parser)
+        try:
+            # a Term instance
+            v = value.value[result]
+        except AttributeError:
+            # an Op instance
+            lhs = pd.eval(value, local_dict=self.env, engine=self.engine,
+                          parser=self.parser)
+            v = lhs[result]
+        name = self.env.add_tmp(v)
+        return self.term_type(name, env=self.env)
+
+    def visit_Slice(self, node, **kwargs):
+        """ df.index[slice(4,6)] """
+        lower = node.lower
+        if lower is not None:
+            lower = self.visit(lower).value
+        upper = node.upper
+        if upper is not None:
+            upper = self.visit(upper).value
+        step = node.step
+        if step is not None:
+            step = self.visit(step).value
+
+        return slice(lower, upper, step)
+
+    def visit_Assign(self, node, **kwargs):
+        cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0],
+                           comparators=[node.value])
+        return self.visit(cmpr)
+
+    def visit_Attribute(self, node, **kwargs):
+        attr = node.attr
+        value = node.value
+
+        ctx = node.ctx
+        if isinstance(ctx, ast.Load):
+            # resolve the value
+            resolved = self.visit(value).value
+            try:
+                v = getattr(resolved, attr)
+                name = self.env.add_tmp(v)
+                return self.term_type(name, self.env)
+            except AttributeError:
+                # something like datetime.datetime where scope is overridden
+                if isinstance(value, ast.Name) and value.id == attr:
+                    return resolved
+
+        raise ValueError("Invalid Attribute context "
+                         "{0}".format(type(ctx).__name__))
+
+    def visit_Call(self, node, **kwargs):
+
+        # this can happen with: datetime.datetime
+        if isinstance(node.func, ast.Attribute):
+            res = self.visit_Attribute(node.func)
+        elif not isinstance(node.func, ast.Name):
+ raise TypeError("Only named functions are supported") + else: + res = self.visit(node.func) + + if res is None: + raise ValueError("Invalid function call {0}".format(node.func.id)) + if hasattr(res, 'value'): + res = res.value + + args = [self.visit(targ).value for targ in node.args] + if node.starargs is not None: + args = args + self.visit(node.starargs).value + + keywords = {} + for key in node.keywords: + if not isinstance(key, ast.keyword): + raise ValueError("keyword error in function call " + "'{0}'".format(node.func.id)) + keywords[key.arg] = self.visit(key.value).value + if node.kwargs is not None: + keywords.update(self.visit(node.kwargs).value) + + return self.const_type(res(*args, **keywords), self.env) + + def translate_In(self, op): + return op + + def visit_Compare(self, node, **kwargs): + ops = node.ops + comps = node.comparators + + # base case: we have something like a CMP b + if len(comps) == 1: + op = self.translate_In(ops[0]) + binop = ast.BinOp(op=op, left=node.left, right=comps[0]) + return self.visit(binop) + + # recursive case: we have a chained comparison, a CMP b CMP c, etc. + left = node.left + values = [] + for op, comp in zip(ops, comps): + new_node = self.visit(ast.Compare(comparators=[comp], left=left, + ops=[self.translate_In(op)])) + left = comp + values.append(new_node) + return self.visit(ast.BoolOp(op=ast.And(), values=values)) + + def _try_visit_binop(self, bop): + if isinstance(bop, (Op, Term)): + return bop + return self.visit(bop) + + def visit_BoolOp(self, node, **kwargs): + def visitor(x, y): + lhs = self._try_visit_binop(x) + rhs = self._try_visit_binop(y) + + op, op_class, lhs, rhs = self._possibly_transform_eq_ne(node, lhs, + rhs) + return self._possibly_evaluate_binop(op, node.op, lhs, rhs) + + operands = node.values + return reduce(visitor, operands) + + +_python_not_supported = frozenset(['Assign', 'Dict', 'Call', 'BoolOp', + 'In', 'NotIn']) +_numexpr_supported_calls = frozenset(_reductions + _mathops) + + +@disallow((_unsupported_nodes | _python_not_supported) - + (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn', + 'Tuple']))) +class PandasExprVisitor(BaseExprVisitor): + def __init__(self, env, engine, parser, + preparser=lambda x: _replace_locals(_replace_booleans(x))): + super(PandasExprVisitor, self).__init__(env, engine, parser, preparser) + + +@disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not'])) +class PythonExprVisitor(BaseExprVisitor): + def __init__(self, env, engine, parser, preparser=lambda x: x): + super(PythonExprVisitor, self).__init__(env, engine, parser, + preparser=preparser) + + +class Expr(StringMixin): + """Object encapsulating an expression. 
+ + Parameters + ---------- + expr : str + engine : str, optional, default 'numexpr' + parser : str, optional, default 'pandas' + env : Scope, optional, default None + truediv : bool, optional, default True + level : int, optional, default 2 + """ + def __init__(self, expr, engine='numexpr', parser='pandas', env=None, + truediv=True, level=2): + self.expr = expr + self.env = _ensure_scope(level=level, local_dict=env) + self.engine = engine + self.parser = parser + self._visitor = _parsers[parser](self.env, self.engine, self.parser) + self.terms = self.parse() + self.truediv = truediv + + def __call__(self): + self.env.locals['truediv'] = self.truediv + return self.terms(self.env) + + def __unicode__(self): + return com.pprint_thing(self.terms) + + def __len__(self): + return len(self.expr) + + def parse(self): + """Parse an expression""" + return self._visitor.visit(self.expr) + + def align(self): + """align a set of Terms""" + return self.terms.align(self.env) + + @property + def names(self): + """Get the names in an expression""" + if is_term(self.terms): + return frozenset([self.terms.name]) + return frozenset(term.name for term in com.flatten(self.terms)) + + def check_name_clashes(self): + env = self.env + names = self.names + res_keys = frozenset(env.resolver_dict.keys()) & names + lcl_keys = frozenset(env.locals.keys()) & names + gbl_keys = frozenset(env.globals.keys()) & names + _check_disjoint_resolver_names(res_keys, lcl_keys, gbl_keys) + + def add_resolvers_to_locals(self): + """Add the extra scope (resolvers) to local scope + + Notes + ----- + This should be done after parsing and pre-evaluation, otherwise + unnecessary name clashes will occur. + """ + self.env.locals.update(self.env.resolver_dict) + + +def isexpr(s, check_names=True): + """Strict checking for a valid expression.""" + try: + Expr(s, env=_ensure_scope() if check_names else None) + except SyntaxError: + return False + except NameError: + return not check_names + return True + + +_parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor} diff --git a/pandas/core/expressions.py b/pandas/computation/expressions.py similarity index 67% rename from pandas/core/expressions.py rename to pandas/computation/expressions.py index b1bd104ce48a5..45c9a2d5259cb 100644 --- a/pandas/core/expressions.py +++ b/pandas/computation/expressions.py @@ -5,6 +5,7 @@ Offer fast expression evaluation thru numexpr """ + import numpy as np from pandas.core.common import _values_from_object @@ -15,17 +16,19 @@ _NUMEXPR_INSTALLED = False _USE_NUMEXPR = _NUMEXPR_INSTALLED -_evaluate = None -_where = None +_evaluate = None +_where = None # the set of dtypes that we will allow pass to numexpr -_ALLOWED_DTYPES = dict(evaluate = set(['int64','int32','float64','float32','bool']), - where = set(['int64','float64','bool'])) +_ALLOWED_DTYPES = dict( + evaluate=set(['int64', 'int32', 'float64', 'float32', 'bool']), + where=set(['int64', 'float64', 'bool'])) # the minimum prod shape that we will use numexpr -_MIN_ELEMENTS = 10000 +_MIN_ELEMENTS = 10000 + -def set_use_numexpr(v = True): +def set_use_numexpr(v=True): # set/unset to use numexpr global _USE_NUMEXPR if _NUMEXPR_INSTALLED: @@ -35,26 +38,25 @@ def set_use_numexpr(v = True): global _evaluate, _where if not _USE_NUMEXPR: _evaluate = _evaluate_standard - _where = _where_standard + _where = _where_standard else: _evaluate = _evaluate_numexpr - _where = _where_numexpr + _where = _where_numexpr -def set_numexpr_threads(n = None): + +def set_numexpr_threads(n=None): # if we are using 
numexpr, set the threads to n # otherwise reset - try: - if _NUMEXPR_INSTALLED and _USE_NUMEXPR: - if n is None: - n = ne.detect_number_of_cores() - ne.set_num_threads(n) - except: - pass + if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if n is None: + n = ne.detect_number_of_cores() + ne.set_num_threads(n) def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs): """ standard evaluation """ - return op(a,b) + return op(a, b) + def _can_use_numexpr(op, op_str, a, b, dtype_check): """ return a boolean if we WILL be using numexpr """ @@ -65,13 +67,13 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # check for dtype compatiblity dtypes = set() - for o in [ a, b ]: - if hasattr(o,'get_dtype_counts'): + for o in [a, b]: + if hasattr(o, 'get_dtype_counts'): s = o.get_dtype_counts() if len(s) > 1: return False dtypes |= set(s.index) - elif isinstance(o,np.ndarray): + elif isinstance(o, np.ndarray): dtypes |= set([o.dtype.name]) # allowed are a superset @@ -80,52 +82,54 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): + +def _evaluate_numexpr(op, op_str, a, b, raise_on_error=False, **eval_kwargs): result = None if _can_use_numexpr(op, op_str, a, b, 'evaluate'): try: a_value, b_value = a, b - if hasattr(a_value,'values'): + if hasattr(a_value, 'values'): a_value = a_value.values - if hasattr(b_value,'values'): + if hasattr(b_value, 'values'): b_value = b_value.values result = ne.evaluate('a_value %s b_value' % op_str, - local_dict={ 'a_value' : a_value, - 'b_value' : b_value }, + local_dict={'a_value': a_value, + 'b_value': b_value}, casting='safe', **eval_kwargs) except (ValueError) as detail: if 'unknown type object' in str(detail): pass except (Exception) as detail: if raise_on_error: - raise TypeError(str(detail)) + raise if result is None: - result = _evaluate_standard(op,op_str,a,b,raise_on_error) + result = _evaluate_standard(op, op_str, a, b, raise_on_error) return result def _where_standard(cond, a, b, raise_on_error=True): - return np.where(_values_from_object(cond), _values_from_object(a), _values_from_object(b)) + return np.where(_values_from_object(cond), _values_from_object(a), + _values_from_object(b)) -def _where_numexpr(cond, a, b, raise_on_error = False): +def _where_numexpr(cond, a, b, raise_on_error=False): result = None if _can_use_numexpr(None, 'where', a, b, 'where'): try: cond_value, a_value, b_value = cond, a, b - if hasattr(cond_value,'values'): + if hasattr(cond_value, 'values'): cond_value = cond_value.values - if hasattr(a_value,'values'): + if hasattr(a_value, 'values'): a_value = a_value.values - if hasattr(b_value,'values'): + if hasattr(b_value, 'values'): b_value = b_value.values - result = ne.evaluate('where(cond_value,a_value,b_value)', - local_dict={ 'cond_value' : cond_value, - 'a_value' : a_value, - 'b_value' : b_value }, + result = ne.evaluate('where(cond_value, a_value, b_value)', + local_dict={'cond_value': cond_value, + 'a_value': a_value, + 'b_value': b_value}, casting='safe') except (ValueError) as detail: if 'unknown type object' in str(detail): @@ -135,7 +139,7 @@ def _where_numexpr(cond, a, b, raise_on_error = False): raise TypeError(str(detail)) if result is None: - result = _where_standard(cond,a,b,raise_on_error) + result = _where_standard(cond, a, b, raise_on_error) return result @@ -143,7 +147,9 @@ def _where_numexpr(cond, a, b, raise_on_error = False): # turn myself on set_use_numexpr(True) -def evaluate(op, op_str, a, b, 
raise_on_error=False, use_numexpr=True, **eval_kwargs): + +def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, + **eval_kwargs): """ evaluate and return the expression of the op on a and b Parameters @@ -153,15 +159,18 @@ def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kw op_str: the string version of the op a : left operand b : right operand - raise_on_error : pass the error to the higher level if indicated (default is False), - otherwise evaluate the op with and return the results + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results use_numexpr : whether to try to use numexpr (default True) """ if use_numexpr: - return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, **eval_kwargs) + return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, + **eval_kwargs) return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error) + def where(cond, a, b, raise_on_error=False, use_numexpr=True): """ evaluate the where condition cond on a and b @@ -171,8 +180,9 @@ def where(cond, a, b, raise_on_error=False, use_numexpr=True): cond : a boolean array a : return if cond is True b : return if cond is False - raise_on_error : pass the error to the higher level if indicated (default is False), - otherwise evaluate the op with and return the results + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results use_numexpr : whether to try to use numexpr (default True) """ diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py new file mode 100644 index 0000000000000..debc79e33968c --- /dev/null +++ b/pandas/computation/ops.py @@ -0,0 +1,510 @@ +"""Operator classes for eval. 
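+
+A rough map of the classes defined below (for orientation):
+
+* ``Term`` / ``Constant`` -- leaves holding resolved names and literal values
+* ``Op`` -- base class holding an operator of unknown arity
+* ``BinOp`` / ``Div`` / ``UnaryOp`` -- concrete binary, division and unary ops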
+""" + +import re +import operator as op +from functools import partial +from itertools import product, islice, chain + +import numpy as np + +import pandas as pd +from pandas.compat import PY3, string_types, text_type +import pandas.core.common as com +from pandas.core.base import StringMixin +from pandas.computation.common import _ensure_decoded + + +_reductions = 'sum', 'prod' +_mathops = ('sin', 'cos', 'exp', 'log', 'expm1', 'log1p', 'pow', 'div', 'sqrt', + 'inv', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', 'arctan', + 'arccosh', 'arcsinh', 'arctanh', 'arctan2', 'abs') + + +_LOCAL_TAG = '__pd_eval_local_' +_TAG_RE = re.compile('^{0}'.format(_LOCAL_TAG)) + + +class UndefinedVariableError(NameError): + """NameError subclass for local variables.""" + def __init__(self, *args): + msg = 'name {0!r} is not defined' + subbed = _TAG_RE.sub('', args[0]) + if subbed != args[0]: + subbed = '@' + subbed + msg = 'local variable {0!r} is not defined' + super(UndefinedVariableError, self).__init__(msg.format(subbed)) + + +def _possibly_update_key(d, value, old_key, new_key=None): + if new_key is None: + new_key = old_key + + try: + del d[old_key] + except KeyError: + return False + else: + d[new_key] = value + return True + + +class Term(StringMixin): + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, string_types) else cls + supr_new = super(Term, klass).__new__ + if PY3: + return supr_new(klass) + return supr_new(klass, name, env, side=side, encoding=encoding) + + def __init__(self, name, env, side=None, encoding=None): + self._name = name + self.env = env + self.side = side + self.local = _TAG_RE.search(text_type(name)) is not None + self._value = self._resolve_name() + self.encoding = encoding + + @property + def local_name(self): + return _TAG_RE.sub('', self.name) + + def __unicode__(self): + return com.pprint_thing(self.name) + + def __call__(self, *args, **kwargs): + return self.value + + def evaluate(self, *args, **kwargs): + return self + + def _resolve_name(self): + env = self.env + key = self.name + res = env.resolve(self.local_name, globally=not self.local) + self.update(res) + + if res is None: + if not isinstance(key, string_types): + return key + raise UndefinedVariableError(key) + + if hasattr(res, 'ndim') and res.ndim > 2: + raise NotImplementedError("N-dimensional objects, where N > 2, are" + " not supported with eval") + return res + + def update(self, value): + """ + search order for local (i.e., @variable) variables: + + scope, key_variable + [('locals', 'local_name'), + ('globals', 'local_name'), + ('locals', 'key'), + ('globals', 'key')] + """ + env = self.env + key = self.name + + # if it's a variable name (otherwise a constant) + if isinstance(key, string_types): + if self.local: + # get it's name WITHOUT the local tag (defined above) + local_name = self.local_name + + # search for the local in the above specified order + scope_pairs = product([env.locals, env.globals], + [local_name, key]) + + # a[::2] + a[1::2] but iterators + scope_iter = chain(islice(scope_pairs, None, None, 2), + islice(scope_pairs, 1, None, 2)) + for d, k in scope_iter: + if _possibly_update_key(d, value, k, key): + break + else: + raise UndefinedVariableError(key) + else: + # otherwise we look in resolvers -> locals -> globals + for r in (env.resolver_dict, env.locals, env.globals): + if _possibly_update_key(r, value, key): + break + else: + raise UndefinedVariableError(key) + + self.value = value + + @property + def isscalar(self): + return 
np.isscalar(self._value) + + @property + def type(self): + try: + # potentially very slow for large, mixed dtype frames + return self._value.values.dtype + except AttributeError: + try: + # ndarray + return self._value.dtype + except AttributeError: + # scalar + return type(self._value) + + return_type = type + + @property + def raw(self): + return com.pprint_thing('{0}(name={1!r}, type={2})' + ''.format(self.__class__.__name__, self.name, + self.type)) + + @property + def kind(self): + try: + return self.type.__name__ + except AttributeError: + return self.type.type.__name__ + + @property + def value(self): + kind = self.kind.lower() + if kind == 'datetime64': + try: + return self._value.asi8 + except AttributeError: + return self._value.view('i8') + elif kind == 'datetime': + return pd.Timestamp(self._value) + elif kind == 'timestamp': + return self._value.asm8.view('i8') + return self._value + + @value.setter + def value(self, new_value): + self._value = new_value + + @property + def name(self): + return self._name + + @name.setter + def name(self, new_name): + self._name = new_name + + @property + def ndim(self): + try: + return self._value.ndim + except AttributeError: + return 0 + + +class Constant(Term): + def __init__(self, value, env, side=None, encoding=None): + super(Constant, self).__init__(value, env, side=side, + encoding=encoding) + + def _resolve_name(self): + return self._name + + @property + def name(self): + return self.value + + + +_bool_op_map = {'not': '~', 'and': '&', 'or': '|'} + + +class Op(StringMixin): + """Hold an operator of unknown arity + """ + def __init__(self, op, operands, *args, **kwargs): + self.op = _bool_op_map.get(op, op) + self.operands = operands + self.encoding = kwargs.get('encoding', None) + + def __iter__(self): + return iter(self.operands) + + def __unicode__(self): + """Print a generic n-ary operator and its operands using infix + notation""" + # recurse over the operands + parened = ('({0})'.format(com.pprint_thing(opr)) + for opr in self.operands) + return com.pprint_thing(' {0} '.format(self.op).join(parened)) + + @property + def return_type(self): + # clobber types to bool if the op is a boolean operator + if self.op in (_cmp_ops_syms + _bool_ops_syms): + return np.bool_ + return np.result_type(*(term.type for term in com.flatten(self))) + + @property + def isscalar(self): + return all(operand.isscalar for operand in self.operands) + + +def _in(x, y): + """Compute the vectorized membership of ``x in y`` if possible, otherwise + use Python. + """ + try: + return x.isin(y) + except AttributeError: + if com.is_list_like(x): + try: + return y.isin(x) + except AttributeError: + pass + return x in y + + +def _not_in(x, y): + """Compute the vectorized membership of ``x not in y`` if possible, + otherwise use Python. 
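+
+    For example (illustrative)::
+
+        _not_in(Series([1, 2, 3]), [2, 3])  # -> Series([True, False, False])
+        _not_in(1, [1, 2])                  # falls back to Python: False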
+ """ + try: + return ~x.isin(y) + except AttributeError: + if com.is_list_like(x): + try: + return ~y.isin(x) + except AttributeError: + pass + return x not in y + + +_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', 'in', 'not in' +_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, _in, _not_in +_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) + +_bool_ops_syms = '&', '|', 'and', 'or' +_bool_ops_funcs = op.and_, op.or_, op.and_, op.or_ +_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) + +_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%' +_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv if PY3 else op.div, + op.pow, op.floordiv, op.mod) +_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) + +_special_case_arith_ops_syms = '**', '//', '%' +_special_case_arith_ops_funcs = op.pow, op.floordiv, op.mod +_special_case_arith_ops_dict = dict(zip(_special_case_arith_ops_syms, + _special_case_arith_ops_funcs)) + +_binary_ops_dict = {} + +for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): + _binary_ops_dict.update(d) + + +def _cast_inplace(terms, dtype): + """Cast an expression inplace. + + Parameters + ---------- + terms : Op + The expression that should cast. + dtype : str or numpy.dtype + The dtype to cast to. + """ + dt = np.dtype(dtype) + for term in terms: + try: + new_value = term.value.astype(dt) + except AttributeError: + new_value = dt.type(term.value) + term.update(new_value) + + +def is_term(obj): + return isinstance(obj, Term) + + +class BinOp(Op): + """Hold a binary operator and its operands + + Parameters + ---------- + op : str + left : Term or Op + right : Term or Op + """ + def __init__(self, op, lhs, rhs, **kwargs): + super(BinOp, self).__init__(op, (lhs, rhs)) + self.lhs = lhs + self.rhs = rhs + + self._disallow_scalar_only_bool_ops() + + self.convert_values() + + try: + self.func = _binary_ops_dict[op] + except KeyError: + # has to be made a list for python3 + keys = list(_binary_ops_dict.keys()) + raise ValueError('Invalid binary operator {0!r}, valid' + ' operators are {1}'.format(op, keys)) + + def __call__(self, env): + """Recursively evaluate an expression in Python space. + + Parameters + ---------- + env : Scope + + Returns + ------- + object + The result of an evaluated expression. + """ + # handle truediv + if self.op == '/' and env.locals['truediv']: + self.func = op.truediv + + # recurse over the left/right nodes + left = self.lhs(env) + right = self.rhs(env) + + return self.func(left, right) + + def evaluate(self, env, engine, parser, term_type, eval_in_python): + """Evaluate a binary operation *before* being passed to the engine. + + Parameters + ---------- + env : Scope + engine : str + parser : str + term_type : type + eval_in_python : list + + Returns + ------- + term_type + The "pre-evaluated" expression as an instance of ``term_type`` + """ + if engine == 'python': + res = self(env) + else: + # recurse over the left/right nodes + left = self.lhs.evaluate(env, engine=engine, parser=parser, + term_type=term_type, + eval_in_python=eval_in_python) + right = self.rhs.evaluate(env, engine=engine, parser=parser, + term_type=term_type, + eval_in_python=eval_in_python) + + # base cases + if self.op in eval_in_python: + res = self.func(left.value, right.value) + else: + res = pd.eval(self, local_dict=env, engine=engine, + parser=parser) + + name = env.add_tmp(res) + return term_type(name, env=env) + + def convert_values(self): + """Convert datetimes to a comparable value in an expression. 
+ """ + def stringify(value): + if self.encoding is not None: + encoder = partial(com.pprint_thing_encoded, + encoding=self.encoding) + else: + encoder = com.pprint_thing + return encoder(value) + + lhs, rhs = self.lhs, self.rhs + + if (is_term(lhs) and lhs.kind.startswith('datetime') and is_term(rhs) + and rhs.isscalar): + v = rhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = _ensure_decoded(v) + v = pd.Timestamp(v) + if v.tz is not None: + v = v.tz_convert('UTC') + self.rhs.update(v) + + if (is_term(rhs) and rhs.kind.startswith('datetime') and + is_term(lhs) and lhs.isscalar): + v = lhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = _ensure_decoded(v) + v = pd.Timestamp(v) + if v.tz is not None: + v = v.tz_convert('UTC') + self.lhs.update(v) + + def _disallow_scalar_only_bool_ops(self): + if ((self.lhs.isscalar or self.rhs.isscalar) and + self.op in _bool_ops_dict and + (not (issubclass(self.rhs.return_type, (bool, np.bool_)) and + issubclass(self.lhs.return_type, (bool, np.bool_))))): + raise NotImplementedError("cannot evaluate scalar only bool ops") + + +class Div(BinOp): + """Div operator to special case casting. + + Parameters + ---------- + lhs, rhs : Term or Op + The Terms or Ops in the ``/`` expression. + truediv : bool + Whether or not to use true division. With Python 3 this happens + regardless of the value of ``truediv``. + """ + def __init__(self, lhs, rhs, truediv=True, *args, **kwargs): + super(Div, self).__init__('/', lhs, rhs, *args, **kwargs) + + if truediv or PY3: + _cast_inplace(com.flatten(self), np.float_) + + +_unary_ops_syms = '+', '-', '~', 'not' +_unary_ops_funcs = op.pos, op.neg, op.invert, op.invert +_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) + + +class UnaryOp(Op): + """Hold a unary operator and its operands + + Parameters + ---------- + op : str + The token used to represent the operator. + operand : Term or Op + The Term or Op operand to the operator. + + Raises + ------ + ValueError + * If no function associated with the passed operator token is found. 
+ """ + def __init__(self, op, operand): + super(UnaryOp, self).__init__(op, (operand,)) + self.operand = operand + + try: + self.func = _unary_ops_dict[op] + except KeyError: + raise ValueError('Invalid unary operator {0!r}, valid operators ' + 'are {1}'.format(op, _unary_ops_syms)) + + def __call__(self, env): + operand = self.operand(env) + return self.func(operand) + + def __unicode__(self): + return com.pprint_thing('{0}({1})'.format(self.op, self.operand)) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py new file mode 100644 index 0000000000000..9ffae5edd93bc --- /dev/null +++ b/pandas/computation/pytables.py @@ -0,0 +1,573 @@ +""" manage PyTables query interface via Expressions """ + +import ast +import time +import warnings +from functools import partial +from datetime import datetime + +import pandas as pd +from pandas.compat import u, string_types, PY3 +from pandas.core.base import StringMixin +import pandas.core.common as com +from pandas.computation import expr, ops +from pandas.computation.ops import is_term +from pandas.computation.expr import BaseExprVisitor +from pandas.computation.common import _ensure_decoded +from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + +class Scope(expr.Scope): + __slots__ = 'globals', 'locals', 'queryables' + + def __init__(self, gbls=None, lcls=None, queryables=None, level=1): + super( + Scope, + self).__init__(gbls=gbls, + lcls=lcls, + level=level) + self.queryables = queryables or dict() + + +class Term(ops.Term): + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, string_types) else cls + supr_new = StringMixin.__new__ + if PY3: + return supr_new(klass) + return supr_new(klass, name, env, side=side, encoding=encoding) + + def __init__(self, name, env, side=None, encoding=None): + super(Term, self).__init__(name, env, side=side, encoding=encoding) + + def _resolve_name(self): + # must be a queryables + if self.side == 'left': + if self.name not in self.env.queryables: + raise NameError('name {0!r} is not defined'.format(self.name)) + return self.name + + # resolve the rhs (and allow to be None) + return self.env.locals.get(self.name, + self.env.globals.get(self.name, self.name)) + + @property + def value(self): + return self._value + + +class Constant(Term): + def __init__(self, value, env, side=None, encoding=None): + super(Constant, self).__init__(value, env, side=side, + encoding=encoding) + + def _resolve_name(self): + return self._name + + @property + def name(self): + return self._value + + +class BinOp(ops.BinOp): + + _max_selectors = 31 + + def __init__(self, op, lhs, rhs, queryables, encoding): + super(BinOp, self).__init__(op, lhs, rhs) + self.queryables = queryables + self.encoding = encoding + self.filter = None + self.condition = None + + def _disallow_scalar_only_bool_ops(self): + pass + + def prune(self, klass): + + def pr(left, right): + """ create and return a new specilized BinOp from myself """ + + if left is None: + return right + elif right is None: + return left + + k = klass + if isinstance(left, ConditionBinOp): + if (isinstance(left, ConditionBinOp) and + isinstance(right, ConditionBinOp)): + k = JointConditionBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + elif isinstance(left, FilterBinOp): + if (isinstance(left, FilterBinOp) and + isinstance(right, FilterBinOp)): + k = JointFilterBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + 
+ return k(self.op, left, right, queryables=self.queryables, + encoding=self.encoding).evaluate() + + left, right = self.lhs, self.rhs + + if is_term(left) and is_term(right): + res = pr(left.value, right.value) + elif not is_term(left) and is_term(right): + res = pr(left.prune(klass), right.value) + elif is_term(left) and not is_term(right): + res = pr(left.value, right.prune(klass)) + elif not (is_term(left) or is_term(right)): + res = pr(left.prune(klass), right.prune(klass)) + + return res + + def conform(self, rhs): + """ inplace conform rhs """ + if not com.is_list_like(rhs): + rhs = [rhs] + if hasattr(self.rhs, 'ravel'): + rhs = rhs.ravel() + return rhs + + @property + def is_valid(self): + """ return True if this is a valid field """ + return self.lhs in self.queryables + + @property + def is_in_table(self): + """ return True if this is a valid column name for generation (e.g. an + actual column in the table) """ + return self.queryables.get(self.lhs) is not None + + @property + def kind(self): + """ the kind of my field """ + return self.queryables.get(self.lhs) + + def generate(self, v): + """ create and return the op string for this TermValue """ + val = v.tostring(self.encoding) + return "(%s %s %s)" % (self.lhs, self.op, val) + + def convert_value(self, v): + """ convert the expression that is in the term to something that is + accepted by pytables """ + + def stringify(value): + if self.encoding is not None: + encoder = partial(com.pprint_thing_encoded, + encoding=self.encoding) + else: + encoder = com.pprint_thing + return encoder(value) + + kind = _ensure_decoded(self.kind) + if kind == u('datetime64') or kind == u('datetime'): + if isinstance(v, (int, float)): + v = stringify(v) + v = _ensure_decoded(v) + v = pd.Timestamp(v) + if v.tz is not None: + v = v.tz_convert('UTC') + return TermValue(v, v.value, kind) + elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u('date'): + v = time.mktime(v.timetuple()) + return TermValue(v, pd.Timestamp(v), kind) + elif kind == u('timedelta64') or kind == u('timedelta'): + v = _coerce_scalar_to_timedelta_type(v,unit='s').item() + return TermValue(int(v), v, kind) + elif kind == u('integer'): + v = int(float(v)) + return TermValue(v, v, kind) + elif kind == u('float'): + v = float(v) + return TermValue(v, v, kind) + elif kind == u('bool'): + if isinstance(v, string_types): + v = not v.strip().lower() in [u('false'), u('f'), u('no'), + u('n'), u('none'), u('0'), + u('[]'), u('{}'), u('')] + else: + v = bool(v) + return TermValue(v, v, kind) + elif not isinstance(v, string_types): + v = stringify(v) + return TermValue(v, stringify(v), u('string')) + + # string quoting + return TermValue(v, stringify(v), u('string')) + + def convert_values(self): + pass + + +class FilterBinOp(BinOp): + + def __unicode__(self): + return com.pprint_thing("[Filter : [{0}] -> " + "[{1}]".format(self.filter[0], self.filter[1])) + + def invert(self): + """ invert the filter """ + if self.filter is not None: + f = list(self.filter) + f[1] = self.generate_filter_op(invert=True) + self.filter = tuple(f) + return self + + def format(self): + """ return the actual filter format """ + return [self.filter] + + def evaluate(self): + + if not isinstance(self.lhs, string_types): + return self + + if not self.is_valid: + raise ValueError("query term is not valid [%s]" % self) + + rhs = self.conform(self.rhs) + values = [TermValue(v, v, self.kind) for v in rhs] + + if self.is_in_table: + + # if too many values to create the expression, use a filter instead + 
if self.op in ['==', '!='] and len(values) > self._max_selectors: + + filter_op = self.generate_filter_op() + self.filter = ( + self.lhs, + filter_op, + pd.Index([v.value for v in values])) + + return self + return None + + # equality conditions + if self.op in ['==', '!=']: + + filter_op = self.generate_filter_op() + self.filter = ( + self.lhs, + filter_op, + pd.Index([v.value for v in values])) + + else: + raise TypeError( + "passing a filterable condition to a non-table indexer [%s]" % + self) + + return self + + def generate_filter_op(self, invert=False): + if (self.op == '!=' and not invert) or (self.op == '==' and invert): + return lambda axis, vals: ~axis.isin(vals) + else: + return lambda axis, vals: axis.isin(vals) + + +class JointFilterBinOp(FilterBinOp): + + def format(self): + raise NotImplementedError("unable to collapse Joint Filters") + + def evaluate(self): + return self + + +class ConditionBinOp(BinOp): + + def __unicode__(self): + return com.pprint_thing("[Condition : [{0}]]".format(self.condition)) + + def invert(self): + """ invert the condition """ + #if self.condition is not None: + # self.condition = "~(%s)" % self.condition + #return self + raise NotImplementedError("cannot use an invert condition when passing to numexpr") + + def format(self): + """ return the actual ne format """ + return self.condition + + def evaluate(self): + + if not isinstance(self.lhs, string_types): + return self + + if not self.is_valid: + raise ValueError("query term is not valid [%s]" % self) + + # convert values if we are in the table + if not self.is_in_table: + return None + + rhs = self.conform(self.rhs) + values = [self.convert_value(v) for v in rhs] + + # equality conditions + if self.op in ['==', '!=']: + + # too many values to create the expression? 
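+            # if so, return None below and let Expr.evaluate fall back to a
+            # FilterBinOp applied after the table is read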
+ if len(values) <= self._max_selectors: + vs = [self.generate(v) for v in values] + self.condition = "(%s)" % ' | '.join(vs) + + # use a filter after reading + else: + return None + else: + self.condition = self.generate(values[0]) + + return self + + +class JointConditionBinOp(ConditionBinOp): + + def evaluate(self): + self.condition = "(%s %s %s)" % ( + self.lhs.condition, + self.op, + self.rhs.condition) + return self + + +class UnaryOp(ops.UnaryOp): + + def prune(self, klass): + + if self.op != '~': + raise NotImplementedError("UnaryOp only support invert type ops") + + operand = self.operand + operand = operand.prune(klass) + + if operand is not None: + if issubclass(klass,ConditionBinOp): + if operand.condition is not None: + return operand.invert() + elif issubclass(klass,FilterBinOp): + if operand.filter is not None: + return operand.invert() + + return None + + +_op_classes = {'unary': UnaryOp} + +class ExprVisitor(BaseExprVisitor): + const_type = Constant + term_type = Term + + def __init__(self, env, engine, parser, **kwargs): + super(ExprVisitor, self).__init__(env, engine, parser) + for bin_op in self.binary_ops: + setattr(self, 'visit_{0}'.format(self.binary_op_nodes_map[bin_op]), + lambda node, bin_op=bin_op: partial(BinOp, bin_op, + **kwargs)) + + def visit_UnaryOp(self, node, **kwargs): + if isinstance(node.op, (ast.Not, ast.Invert)): + return UnaryOp('~', self.visit(node.operand)) + elif isinstance(node.op, ast.USub): + return self.const_type(-self.visit(node.operand).value, self.env) + elif isinstance(node.op, ast.UAdd): + raise NotImplementedError('Unary addition not supported') + + def visit_USub(self, node, **kwargs): + return self.const_type(-self.visit(node.operand).value, self.env) + + def visit_Index(self, node, **kwargs): + return self.visit(node.value).value + + def visit_Subscript(self, node, **kwargs): + value = self.visit(node.value) + slobj = self.visit(node.slice) + try: + return self.const_type(value[slobj], self.env) + except TypeError: + raise ValueError("cannot subscript {0!r} with " + "{1!r}".format(value, slobj)) + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = node.ctx.__class__ + if ctx == ast.Load: + # resolve the value + resolved = self.visit(value).value + try: + return getattr(resolved, attr) + except AttributeError: + + # something like datetime.datetime where scope is overriden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + + raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) + + def translate_In(self, op): + return ast.Eq() if isinstance(op, ast.In) else op + + def _rewrite_membership_op(self, node, left, right): + return self.visit(node.op), node.op, left, right + + +class Expr(expr.Expr): + + """ hold a pytables like expression, comprised of possibly multiple 'terms' + + Parameters + ---------- + where : string term expression, Expr, or list-like of Exprs + queryables : a "kinds" map (dict of column name -> kind), or None if column is non-indexable + encoding : an encoding that will encode the query terms + + Returns + ------- + an Expr object + + Examples + -------- + + 'index>=date' + "columns=['A', 'D']" + 'columns=A' + 'columns==A' + "~(columns=['A','B'])" + 'index>df.index[3] & string="bar"' + '(index>df.index[3] & index<=df.index[6]) | string="bar"' + "ts>=Timestamp('2012-02-01')" + "major_axis>=20130101" + """ + + def __init__(self, where, op=None, value=None, queryables=None, + encoding=None, scope_level=None): + + # try to be back 
compat
+        where = self.parse_back_compat(where, op, value)
+
+        self.encoding = encoding
+        self.condition = None
+        self.filter = None
+        self.terms = None
+        self._visitor = None
+
+        # capture the environment if needed
+        lcls = dict()
+        if isinstance(where, Expr):
+
+            lcls.update(where.env.locals)
+            where = where.expr
+
+        elif isinstance(where, (list, tuple)):
+
+            where = list(where)
+            for idx, w in enumerate(where):
+                if isinstance(w, Expr):
+                    lcls.update(w.env.locals)
+                else:
+                    # store the parsed result back, otherwise the
+                    # back-compat conversion is silently dropped
+                    where[idx] = self.parse_back_compat(w)
+
+            where = ' & '.join(["(%s)" % w for w in where])
+
+        self.expr = where
+        self.env = Scope(lcls=lcls)
+        self.env.update(scope_level)
+
+        if queryables is not None and isinstance(self.expr, string_types):
+            self.env.queryables.update(queryables)
+            self._visitor = ExprVisitor(self.env, queryables=queryables,
+                                        parser='pytables', engine='pytables',
+                                        encoding=encoding)
+            self.terms = self.parse()
+
+    def parse_back_compat(self, w, op=None, value=None):
+        """ allow backward compatibility for passed arguments """
+
+        if isinstance(w, dict):
+            w, op, value = w.get('field'), w.get('op'), w.get('value')
+            if not isinstance(w, string_types):
+                raise TypeError(
+                    "where must be passed as a string if op/value are passed")
+            warnings.warn("passing a dict to Expr is deprecated, "
+                          "pass the where as a single string",
+                          DeprecationWarning)
+
+        if op is not None:
+            if not isinstance(w, string_types):
+                raise TypeError(
+                    "where must be passed as a string if op/value are passed")
+
+            if isinstance(op, Expr):
+                raise TypeError("invalid op passed, must be a string")
+            w = "{0}{1}".format(w, op)
+            if value is not None:
+                if isinstance(value, Expr):
+                    raise TypeError("invalid value passed, must be a string")
+                w = "{0}{1}".format(w, value)
+
+            warnings.warn("passing multiple values to Expr is deprecated, "
+                          "pass the where as a single string",
+                          DeprecationWarning)
+
+        return w
+
+    def __unicode__(self):
+        if self.terms is not None:
+            return com.pprint_thing(self.terms)
+        return com.pprint_thing(self.expr)
+
+    def evaluate(self):
+        """ create and return the numexpr condition and filter """
+
+        try:
+            self.condition = self.terms.prune(ConditionBinOp)
+        except AttributeError:
+            raise ValueError("cannot process expression [{0}], [{1}] is not"
+                             " a valid condition".format(self.expr, self))
+        try:
+            self.filter = self.terms.prune(FilterBinOp)
+        except AttributeError:
+            raise ValueError("cannot process expression [{0}], [{1}] is not"
+                             " a valid filter".format(self.expr, self))
+
+        return self.condition, self.filter
+
+
+class TermValue(object):
+
+    """ hold a term value that we use to construct a condition/filter """
+
+    def __init__(self, value, converted, kind):
+        self.value = value
+        self.converted = converted
+        self.kind = kind
+
+    def tostring(self, encoding):
+        """ quote the string if not encoded, else encode and return """
+        if self.kind == u('string'):
+            if encoding is not None:
+                return self.converted
+            return '"%s"' % self.converted
+        return self.converted
+
+
+def maybe_expression(s):
+    """ loose checking if s is a pytables-acceptable expression """
+    if not isinstance(s, string_types):
+        return False
+    operations = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ('=',)
+
+    # make sure we have an op at least
+    return any(op in s for op in operations)
diff --git a/pandas/computation/tests/__init__.py b/pandas/computation/tests/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py
new file mode 100755
index 0000000000000..d5bcf85d4de03
--- /dev/null
+++ b/pandas/computation/tests/test_eval.py @@ -0,0 +1,1473 @@ +#!/usr/bin/env python + +import unittest +import functools +import numbers +from itertools import product +import ast + +import nose +from nose.tools import assert_raises, assert_true, assert_false, assert_equal + +from numpy.random import randn, rand, randint +import numpy as np +from numpy.testing import assert_array_equal, assert_allclose +from numpy.testing.decorators import slow + +import pandas as pd +from pandas.core import common as com +from pandas import DataFrame, Series, Panel, date_range +from pandas.util.testing import makeCustomDataframe as mkdf + +from pandas.computation import pytables +from pandas.computation.expressions import _USE_NUMEXPR +from pandas.computation.engines import _engines +from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor +from pandas.computation.ops import (_binary_ops_dict, _unary_ops_dict, + _special_case_arith_ops_syms, + _arith_ops_syms, _bool_ops_syms) +import pandas.computation.expr as expr +import pandas.util.testing as tm +from pandas.util.testing import (assert_frame_equal, randbool, + assertRaisesRegexp, + assert_produces_warning, assert_series_equal) +from pandas.compat import PY3, u + +_series_frame_incompatible = _bool_ops_syms +_scalar_skip = 'in', 'not in' + +def skip_if_no_ne(engine='numexpr'): + if not _USE_NUMEXPR and engine == 'numexpr': + raise nose.SkipTest("numexpr engine not installed or disabled") + + +def engine_has_neg_frac(engine): + return _engines[engine].has_neg_frac + + +def _eval_single_bin(lhs, cmp1, rhs, engine): + c = _binary_ops_dict[cmp1] + if engine_has_neg_frac(engine): + try: + return c(lhs, rhs) + except ValueError as e: + try: + msg = e.message + except AttributeError: + msg = e + msg = u(msg) + if msg == u('negative number cannot be raised to a fractional' + ' power'): + return np.nan + raise + return c(lhs, rhs) + + +def _series_and_2d_ndarray(lhs, rhs): + return ((isinstance(lhs, Series) and + isinstance(rhs, np.ndarray) and rhs.ndim > 1) + or (isinstance(rhs, Series) and + isinstance(lhs, np.ndarray) and lhs.ndim > 1)) + + +def _series_and_frame(lhs, rhs): + return ((isinstance(lhs, Series) and isinstance(rhs, DataFrame)) + or (isinstance(rhs, Series) and isinstance(lhs, DataFrame))) + + +def _bool_and_frame(lhs, rhs): + return isinstance(lhs, bool) and isinstance(rhs, pd.core.generic.NDFrame) + + +def skip_incompatible_operand(f): + @functools.wraps(f) + def wrapper(self, lhs, arith1, rhs, *args, **kwargs): + if _series_and_2d_ndarray(lhs, rhs): + self.assertRaises(Exception, pd.eval, 'lhs {0} rhs'.format(arith1), + local_dict={'lhs': lhs, 'rhs': rhs}, + engine=self.engine, parser=self.parser) + elif (np.isscalar(lhs) and np.isscalar(rhs) and arith1 in + _bool_ops_syms): + with tm.assertRaises(NotImplementedError): + pd.eval('lhs {0} rhs'.format(arith1), engine=self.engine, + parser=self.parser) + else: + f(self, lhs, arith1, rhs, *args, **kwargs) + return wrapper + + +def _is_py3_complex_incompat(result, expected): + return (PY3 and isinstance(expected, (complex, np.complexfloating)) and + np.isnan(result)) + + +_good_arith_ops = com.difference(_arith_ops_syms, _special_case_arith_ops_syms) + + +class TestEvalNumexprPandas(unittest.TestCase): + @classmethod + def setUpClass(cls): + skip_if_no_ne() + import numexpr as ne + cls.ne = ne + cls.engine = 'numexpr' + cls.parser = 'pandas' + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + if hasattr(cls, 'ne'): + del cls.ne + + def setup_data(self): + 
nan_df1 = DataFrame(rand(10, 5)) + nan_df1[nan_df1 > 0.5] = np.nan + nan_df2 = DataFrame(rand(10, 5)) + nan_df2[nan_df2 > 0.5] = np.nan + + self.pandas_lhses = (DataFrame(randn(10, 5)), Series(randn(5)), + Series([1, 2, np.nan, np.nan, 5]), nan_df1) + self.pandas_rhses = (DataFrame(randn(10, 5)), Series(randn(5)), + Series([1, 2, np.nan, np.nan, 5]), nan_df2) + self.scalar_lhses = randn(), np.float64(randn()), np.nan + self.scalar_rhses = randn(), np.float64(randn()), np.nan + + self.lhses = self.pandas_lhses + self.scalar_lhses + self.rhses = self.pandas_rhses + self.scalar_rhses + + def setup_ops(self): + self.cmp_ops = expr._cmp_ops_syms + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = expr._bool_ops_syms + self.special_case_ops = _special_case_arith_ops_syms + self.arith_ops = _good_arith_ops + self.unary_ops = '-', '~', 'not ' + + def setUp(self): + self.setup_ops() + self.setup_data() + self.current_engines = filter(lambda x: x != self.engine, _engines) + + def tearDown(self): + del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses + del self.pandas_rhses, self.pandas_lhses, self.current_engines + + @slow + def test_complex_cmp_ops(self): + for lhs, cmp1, rhs, binop, cmp2 in product(self.lhses, self.cmp_ops, + self.rhses, self.bin_ops, + self.cmp2_ops): + self.check_complex_cmp_op(lhs, cmp1, rhs, binop, cmp2) + + def test_simple_cmp_ops(self): + bool_lhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + bool_rhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + for lhs, rhs, cmp_op in product(bool_lhses, bool_rhses, self.cmp_ops): + self.check_simple_cmp_op(lhs, cmp_op, rhs) + + @slow + def test_binary_arith_ops(self): + for lhs, op, rhs in product(self.lhses, self.arith_ops, self.rhses): + self.check_binary_arith_op(lhs, op, rhs) + + def test_modulus(self): + for lhs, rhs in product(self.lhses, self.rhses): + self.check_modulus(lhs, '%', rhs) + + def test_floor_division(self): + for lhs, rhs in product(self.lhses, self.rhses): + self.check_floor_division(lhs, '//', rhs) + + @slow + def test_pow(self): + for lhs, rhs in product(self.lhses, self.rhses): + self.check_pow(lhs, '**', rhs) + + @slow + def test_single_invert_op(self): + for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): + self.check_single_invert_op(lhs, op, rhs) + + @slow + def test_compound_invert_op(self): + for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): + self.check_compound_invert_op(lhs, op, rhs) + + @slow + def test_chained_cmp_op(self): + mids = self.lhses + cmp_ops = tuple(set(self.cmp_ops) - set(['==', '!=', '<=', '>='])) + for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, cmp_ops, + mids, cmp_ops, self.rhses): + self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs) + + def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): + skip_these = 'in', 'not in' + ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, + binop=binop, + cmp2=cmp2) + scalar_with_in_notin = (np.isscalar(rhs) and (cmp1 in skip_these or + cmp2 in skip_these)) + if scalar_with_in_notin: + self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) + elif (_series_and_frame(lhs, rhs) and (cmp1 in + _series_frame_incompatible or + cmp2 in _series_frame_incompatible)): + self.assertRaises(TypeError, pd.eval, ex, + local_dict={'lhs': lhs, 'rhs': rhs}, + engine=self.engine, parser=self.parser) + elif _bool_and_frame(lhs, rhs): + self.assertRaises(TypeError, 
_eval_single_bin, lhs_new, '&', + rhs_new, self.engine) + self.assertRaises(TypeError, pd.eval, ex, + local_dict={'lhs': lhs, 'rhs': rhs}, + engine=self.engine, parser=self.parser) + elif (np.isscalar(lhs) and np.isnan(lhs) and + not np.isscalar(rhs) and (cmp1 in skip_these or cmp2 in + skip_these)): + with tm.assertRaises(TypeError): + _eval_single_bin(lhs, binop, rhs, self.engine) + else: + lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) + rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) + if (isinstance(lhs_new, Series) and isinstance(rhs_new, DataFrame) + and binop in _series_frame_incompatible): + pass + # TODO: the code below should be added back when left and right + # hand side bool ops are fixed. + + #try: + #self.assertRaises(Exception, pd.eval, ex, + #local_dict={'lhs': lhs, 'rhs': rhs}, + #engine=self.engine, parser=self.parser) + #except AssertionError: + #import ipdb; ipdb.set_trace() + #raise + elif (np.isscalar(lhs_new) and np.isnan(lhs_new) and + not np.isscalar(rhs_new) and binop in skip_these): + with tm.assertRaises(TypeError): + _eval_single_bin(lhs_new, binop, rhs_new, self.engine) + elif _bool_and_frame(lhs_new, rhs_new): + with tm.assertRaises(TypeError): + _eval_single_bin(lhs_new, binop, rhs_new, self.engine) + with tm.assertRaises(TypeError): + pd.eval('lhs_new & rhs_new'.format(binop), + engine=self.engine, parser=self.parser) + else: + expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(result, expected) + + @skip_incompatible_operand + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + skip_these = 'in', 'not in' + + def check_operands(left, right, cmp_op): + if (np.isscalar(left) and np.isnan(left) and not np.isscalar(right) + and cmp_op in skip_these): + ex = 'left {0} right'.format(cmp_op) + with tm.assertRaises(ValueError): + pd.eval(ex, engine=self.engine, parser=self.parser) + return + if (np.isscalar(left) and np.isscalar(right) and + cmp_op in _bool_ops_syms): + ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) + ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) + for ex in (ex1, ex2, ex3): + with assertRaises(NotImplementedError): + pd.eval(ex, engine=self.engine, parser=self.parser) + return + if (np.isscalar(right) and not np.isscalar(left) and cmp_op in + skip_these): + self.assertRaises(Exception, _eval_single_bin, left, cmp_op, + right, self.engine) + elif _series_and_2d_ndarray(right, left): + self.assertRaises(Exception, _eval_single_bin, right, cmp_op, + left, self.engine) + elif (np.isscalar(right) and np.isscalar(left) and cmp_op in + skip_these): + self.assertRaises(Exception, _eval_single_bin, right, cmp_op, + left, self.engine) + else: + new = _eval_single_bin(left, cmp_op, right, self.engine) + return new + return + + lhs_new = check_operands(lhs, mid, cmp1) + rhs_new = check_operands(mid, rhs, cmp2) + + if lhs_new is not None and rhs_new is not None: + # these are not compatible operands + if isinstance(lhs_new, Series) and isinstance(rhs_new, DataFrame): + self.assertRaises(TypeError, _eval_single_bin, lhs_new, '&', + rhs_new, self.engine) + elif (_bool_and_frame(lhs_new, rhs_new)): + self.assertRaises(TypeError, _eval_single_bin, lhs_new, '&', + rhs_new, self.engine) + elif _series_and_2d_ndarray(lhs_new, rhs_new): + # TODO: once #4319 is fixed add this test back in + #self.assertRaises(Exception, _eval_single_bin, lhs_new, '&', + #rhs_new, self.engine) 
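+                # Series vs 2-D ndarray combinations are skipped here; see
+                # the TODO above (GH4319)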
+ pass + else: + ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) + ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) + try: + expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine) + except TypeError: + import ipdb; ipdb.set_trace() + raise + + for ex in (ex1, ex2, ex3): + result = pd.eval(ex, engine=self.engine, + parser=self.parser) + assert_array_equal(result, expected) + + @skip_incompatible_operand + def check_simple_cmp_op(self, lhs, cmp1, rhs): + ex = 'lhs {0} rhs'.format(cmp1) + if cmp1 in ('in', 'not in') and not com.is_list_like(rhs): + self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) + else: + expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(result, expected) + + @skip_incompatible_operand + def check_binary_arith_op(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = _eval_single_bin(lhs, arith1, rhs, self.engine) + assert_array_equal(result, expected) + ex = 'lhs {0} rhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + nlhs = _eval_single_bin(lhs, arith1, rhs, + self.engine) + self.check_alignment(result, nlhs, rhs, arith1) + + def check_alignment(self, result, nlhs, ghs, op): + try: + nlhs, ghs = nlhs.align(ghs) + except (ValueError, TypeError, AttributeError): + # ValueError: series frame or frame series align + # TypeError, AttributeError: series or frame with scalar align + pass + else: + expected = self.ne.evaluate('nlhs {0} ghs'.format(op)) + assert_array_equal(result, expected) + + # the following 3 tests require special casing + + @skip_incompatible_operand + def check_modulus(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = lhs % rhs + assert_allclose(result, expected) + expected = self.ne.evaluate('expected {0} rhs'.format(arith1)) + assert_allclose(result, expected) + + @skip_incompatible_operand + def check_floor_division(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + + if self.engine == 'python': + res = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = lhs // rhs + assert_array_equal(res, expected) + else: + self.assertRaises(TypeError, pd.eval, ex, local_dict={'lhs': lhs, + 'rhs': rhs}, + engine=self.engine, parser=self.parser) + + def get_expected_pow_result(self, lhs, rhs): + try: + expected = _eval_single_bin(lhs, '**', rhs, self.engine) + except ValueError as e: + msg = 'negative number cannot be raised to a fractional power' + try: + emsg = e.message + except AttributeError: + emsg = e + + emsg = u(emsg) + + if emsg == msg: + if self.engine == 'python': + raise nose.SkipTest(emsg) + else: + expected = np.nan + else: + raise + return expected + + @skip_incompatible_operand + def check_pow(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + expected = self.get_expected_pow_result(lhs, rhs) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + + if (np.isscalar(lhs) and np.isscalar(rhs) and + _is_py3_complex_incompat(result, expected)): + self.assertRaises(AssertionError, assert_array_equal, result, + expected) + else: + assert_array_equal(result, expected) + + ex = '(lhs {0} rhs) {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine, parser=self.parser) 
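+        # the nested expression applies ** twice, so fold the expected
+        # value through get_expected_pow_result a second time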
+ expected = self.get_expected_pow_result( + self.get_expected_pow_result(lhs, rhs), rhs) + assert_array_equal(result, expected) + + @skip_incompatible_operand + def check_single_invert_op(self, lhs, cmp1, rhs): + # simple + for el in (lhs, rhs): + try: + elb = el.astype(bool) + except AttributeError: + elb = np.array([bool(el)]) + expected = ~elb + result = pd.eval('~elb', engine=self.engine, parser=self.parser) + assert_array_equal(expected, result) + + for engine in self.current_engines: + skip_if_no_ne(engine) + assert_array_equal(result, pd.eval('~elb', engine=engine, + parser=self.parser)) + + @skip_incompatible_operand + def check_compound_invert_op(self, lhs, cmp1, rhs): + skip_these = 'in', 'not in' + ex = '~(lhs {0} rhs)'.format(cmp1) + + if np.isscalar(rhs) and cmp1 in skip_these: + self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) + elif (np.isscalar(lhs) and np.isnan(lhs) and not np.isscalar(rhs) + and cmp1 in skip_these): + with tm.assertRaises(ValueError): + pd.eval(ex, engine=self.engine, parser=self.parser) + else: + # compound + if np.isscalar(lhs) and np.isscalar(rhs): + lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) + expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) + if np.isscalar(expected): + expected = not expected + else: + expected = ~expected + result = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(expected, result) + + # make sure the other engines work the same as this one + for engine in self.current_engines: + skip_if_no_ne(engine) + ev = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(ev, result) + + def ex(self, op, var_name='lhs'): + return '{0}{1}'.format(op, var_name) + + def test_frame_invert(self): + expr = self.ex('~') + + ## ~ ## + # frame + ## float always raises + lhs = DataFrame(randn(5, 2)) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + ## int raises on numexpr + lhs = DataFrame(randint(5, size=(5, 2))) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + ## bool always works + lhs = DataFrame(rand(5, 2) > 0.5) + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + ## object raises + lhs = DataFrame({'b': ['a', 1, 2.0], 'c': rand(3) > 0.5}) + if self.engine == 'numexpr': + with tm.assertRaises(ValueError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + def test_series_invert(self): + #### ~ #### + expr = self.ex('~') + + # series + ## float raises + lhs = Series(randn(5)) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + ## int raises on numexpr + lhs = Series(randint(5, size=5)) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, 
parser=self.parser) + else: + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + ## bool + lhs = Series(rand(5) > 0.5) + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # float + # int + # bool + + # object + lhs = Series(['a', 1, 2.0]) + if self.engine == 'numexpr': + with tm.assertRaises(ValueError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + def test_frame_negate(self): + expr = self.ex('-') + + # float + lhs = DataFrame(randn(5, 2)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # int + lhs = DataFrame(randint(5, size=(5, 2))) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = DataFrame(rand(5, 2) > 0.5) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + def test_series_negate(self): + expr = self.ex('-') + + # float + lhs = Series(randn(5)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # int + lhs = Series(randint(5, size=5)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = Series(rand(5) > 0.5) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + def test_frame_pos(self): + expr = self.ex('+') + + # float + lhs = DataFrame(randn(5, 2)) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # int + lhs = DataFrame(randint(5, size=(5, 2))) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = DataFrame(rand(5, 2) > 0.5) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + def test_series_pos(self): + expr = self.ex('+') + + # float + lhs = Series(randn(5)) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # int + lhs = Series(randint(5, size=5)) + if self.engine == 'python': + with 
tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = Series(rand(5) > 0.5) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + def test_scalar_unary(self): + with tm.assertRaises(TypeError): + pd.eval('~1.0', engine=self.engine, parser=self.parser) + + self.assertEqual(pd.eval('-1.0', parser=self.parser, engine=self.engine), -1.0) + self.assertEqual(pd.eval('+1.0', parser=self.parser, engine=self.engine), +1.0) + + self.assertEqual(pd.eval('~1', parser=self.parser, engine=self.engine), ~1) + self.assertEqual(pd.eval('-1', parser=self.parser, engine=self.engine), -1) + self.assertEqual(pd.eval('+1', parser=self.parser, engine=self.engine), +1) + + self.assertEqual(pd.eval('~True', parser=self.parser, engine=self.engine), ~True) + self.assertEqual(pd.eval('~False', parser=self.parser, engine=self.engine), ~False) + self.assertEqual(pd.eval('-True', parser=self.parser, engine=self.engine), -True) + self.assertEqual(pd.eval('-False', parser=self.parser, engine=self.engine), -False) + self.assertEqual(pd.eval('+True', parser=self.parser, engine=self.engine), +True) + self.assertEqual(pd.eval('+False', parser=self.parser, engine=self.engine), +False) + + def test_disallow_scalar_bool_ops(self): + exprs = '1 or 2', '1 and 2' + exprs += 'a and b', 'a or b' + exprs += '1 or 2 and (3 + 2) > 3', + exprs += '2 * x > 2 or 1 and 2', + exprs += '2 * df > 3 and 1 or a', + + x, a, b, df = np.random.randn(3), 1, 2, DataFrame(randn(3, 2)) + for ex in exprs: + with tm.assertRaises(NotImplementedError): + pd.eval(ex, engine=self.engine, parser=self.parser) + + +class TestEvalNumexprPython(TestEvalNumexprPandas): + @classmethod + def setUpClass(cls): + skip_if_no_ne() + import numexpr as ne + cls.ne = ne + cls.engine = 'numexpr' + cls.parser = 'python' + + def setup_ops(self): + self.cmp_ops = list(filter(lambda x: x not in ('in', 'not in'), + expr._cmp_ops_syms)) + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = [s for s in expr._bool_ops_syms + if s not in ('and', 'or')] + self.special_case_ops = _special_case_arith_ops_syms + self.arith_ops = _good_arith_ops + self.unary_ops = '+', '-', '~' + + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + self.assertRaises(NotImplementedError, pd.eval, ex1, + local_dict={'lhs': lhs, 'mid': mid, 'rhs': rhs}, + engine=self.engine, parser=self.parser) + + +class TestEvalPythonPython(TestEvalNumexprPython): + @classmethod + def setUpClass(cls): + cls.engine = 'python' + cls.parser = 'python' + + @skip_incompatible_operand + def check_modulus(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine) + expected = lhs % rhs + assert_allclose(result, expected) + expected = eval('expected {0} rhs'.format(arith1)) + assert_allclose(result, expected) + + def check_alignment(self, result, nlhs, ghs, op): + try: + nlhs, ghs = nlhs.align(ghs) + except (ValueError, TypeError, AttributeError): + # ValueError: series frame or frame series align + # TypeError, AttributeError: series or frame with scalar align + pass + else: + expected = eval('nlhs 
{0} ghs'.format(op)) + assert_array_equal(result, expected) + + +class TestEvalPythonPandas(TestEvalPythonPython): + @classmethod + def setUpClass(cls): + cls.engine = 'python' + cls.parser = 'pandas' + + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, + rhs) + + +f = lambda *args, **kwargs: np.random.randn() + + +ENGINES_PARSERS = list(product(_engines, expr._parsers)) + + +#------------------------------------- +# basic and complex alignment + +class TestAlignment(object): + + index_types = 'i', 'f', 's', 'u', 'dt', # 'p' + + def check_align_nested_unary_op(self, engine, parser): + skip_if_no_ne(engine) + s = 'df * ~2' + df = mkdf(5, 3, data_gen_f=f) + res = pd.eval(s, engine=engine, parser=parser) + assert_frame_equal(res, df * ~2) + + def test_align_nested_unary_op(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_align_nested_unary_op, engine, parser + + def check_basic_frame_alignment(self, engine, parser): + skip_if_no_ne(engine) + args = product(self.index_types, repeat=2) + for r_idx_type, c_idx_type in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + res = pd.eval('df + df2', engine=engine, parser=parser) + assert_frame_equal(res, df + df2) + + @slow + def test_basic_frame_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_basic_frame_alignment, engine, parser + + def check_frame_comparison(self, engine, parser): + skip_if_no_ne(engine) + args = product(self.index_types, repeat=2) + for r_idx_type, c_idx_type in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + res = pd.eval('df < 2', engine=engine, parser=parser) + assert_frame_equal(res, df < 2) + + df3 = DataFrame(randn(*df.shape), index=df.index, + columns=df.columns) + res = pd.eval('df < df3', engine=engine, parser=parser) + assert_frame_equal(res, df < df3) + + @slow + def test_frame_comparison(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_frame_comparison, engine, parser + + def check_medium_complex_frame_alignment(self, engine, parser): + skip_if_no_ne(engine) + args = product(self.index_types, repeat=4) + for r1, c1, r2, c2 in args: + df = mkdf(5, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(10, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + df3 = mkdf(15, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + res = pd.eval('df + df2 + df3', engine=engine, parser=parser) + assert_frame_equal(res, df + df2 + df3) + + @slow + def test_medium_complex_frame_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_medium_complex_frame_alignment, engine, parser + + def check_basic_frame_series_alignment(self, engine, parser): + skip_if_no_ne(engine) + def testit(r_idx_type, c_idx_type, index_name): + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + res = pd.eval('df + s', engine=engine, parser=parser) + if r_idx_type == 'dt' or c_idx_type == 'dt': + if engine == 'numexpr': + expected = df.add(s) + else: + expected = df + s + else: + expected = df + s + assert_frame_equal(res, expected) + + args = product(self.index_types, self.index_types, ('index', + 'columns')) + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) + + @slow + def 
test_basic_frame_series_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_basic_frame_series_alignment, engine, parser + + def check_basic_series_frame_alignment(self, engine, parser): + skip_if_no_ne(engine) + def testit(r_idx_type, c_idx_type, index_name): + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + res = pd.eval('s + df', engine=engine, parser=parser) + if r_idx_type == 'dt' or c_idx_type == 'dt': + if engine == 'numexpr': + expected = df.add(s) + else: + expected = s + df + else: + expected = s + df + assert_frame_equal(res, expected) + + args = product(self.index_types, self.index_types, ('index', + 'columns')) + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) + + @slow + def test_basic_series_frame_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_basic_series_frame_alignment, engine, parser + + def check_series_frame_commutativity(self, engine, parser): + skip_if_no_ne(engine) + args = product(self.index_types, self.index_types, ('+', '*'), + ('index', 'columns')) + for r_idx_type, c_idx_type, op, index_name in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + lhs = 's {0} df'.format(op) + rhs = 'df {0} s'.format(op) + a = pd.eval(lhs, engine=engine, parser=parser) + b = pd.eval(rhs, engine=engine, parser=parser) + + if r_idx_type != 'dt' and c_idx_type != 'dt': + if engine == 'numexpr': + assert_frame_equal(a, b) + + @slow + def test_series_frame_commutativity(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_series_frame_commutativity, engine, parser + + def check_complex_series_frame_alignment(self, engine, parser): + skip_if_no_ne(engine) + index_types = [self.index_types] * 4 + args = product(('index', 'columns'), ('df', 'df2'), *index_types) + for index_name, obj, r1, r2, c1, c2 in args: + df = mkdf(10, 5, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(20, 5, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + index = getattr(locals()[obj], index_name) + s = Series(np.random.randn(5), index[:5]) + + if r2 == 'dt' or c2 == 'dt': + if engine == 'numexpr': + expected2 = df2.add(s) + else: + expected2 = df2 + s + else: + expected2 = df2 + s + + if r1 == 'dt' or c1 == 'dt': + if engine == 'numexpr': + expected = expected2.add(df) + else: + expected = expected2 + df + else: + expected = expected2 + df + + res = pd.eval('df2 + s + df', engine=engine, parser=parser) + assert_equal(res.shape, expected.shape) + assert_frame_equal(res, expected) + + @slow + def test_complex_series_frame_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_complex_series_frame_alignment, engine, parser + + def check_performance_warning_for_poor_alignment(self, engine, parser): + skip_if_no_ne(engine) + df = DataFrame(randn(1000, 10)) + s = Series(randn(10000)) + if engine == 'numexpr': + seen = pd.io.common.PerformanceWarning + else: + seen = False + + with assert_produces_warning(seen): + pd.eval('df + s', engine=engine, parser=parser) + + s = Series(randn(1000)) + with assert_produces_warning(False): + pd.eval('df + s', engine=engine, parser=parser) + + df = DataFrame(randn(10, 10000)) + s = Series(randn(10000)) + with assert_produces_warning(False): + pd.eval('df + s', engine=engine, parser=parser) + + df = 
DataFrame(randn(10, 10)) + s = Series(randn(10000)) + + is_python_engine = engine == 'python' + + if not is_python_engine: + wrn = pd.io.common.PerformanceWarning + else: + wrn = False + + with assert_produces_warning(wrn) as w: + pd.eval('df + s', engine=engine, parser=parser) + + if not is_python_engine: + assert_equal(len(w), 1) + msg = str(w[0].message) + expected = ("Alignment difference on axis {0} is larger" + " than an order of magnitude on term {1!r}, " + "by more than {2:.4g}; performance may suffer" + "".format(1, 's', np.log10(s.size - df.shape[1]))) + assert_equal(msg, expected) + + + def test_performance_warning_for_poor_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_performance_warning_for_poor_alignment, engine, parser + + +#------------------------------------ +# slightly more complex ops + +class TestOperationsNumExprPandas(unittest.TestCase): + @classmethod + def setUpClass(cls): + skip_if_no_ne() + cls.engine = 'numexpr' + cls.parser = 'pandas' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + + def eval(self, *args, **kwargs): + kwargs['engine'] = self.engine + kwargs['parser'] = self.parser + return pd.eval(*args, **kwargs) + + def test_simple_arith_ops(self): + ops = self.arith_ops + + for op in filter(lambda x: x != '//', ops): + ex = '1 {0} 1'.format(op) + ex2 = 'x {0} 1'.format(op) + ex3 = '1 {0} (x + 1)'.format(op) + + if op in ('in', 'not in'): + self.assertRaises(TypeError, pd.eval, ex, + engine=self.engine, parser=self.parser) + else: + expec = _eval_single_bin(1, op, 1, self.engine) + x = self.eval(ex, engine=self.engine, parser=self.parser) + assert_equal(x, expec) + + expec = _eval_single_bin(x, op, 1, self.engine) + y = self.eval(ex2, local_dict={'x': x}, engine=self.engine, + parser=self.parser) + assert_equal(y, expec) + + expec = _eval_single_bin(1, op, x + 1, self.engine) + y = self.eval(ex3, local_dict={'x': x}, + engine=self.engine, parser=self.parser) + assert_equal(y, expec) + + def test_simple_bool_ops(self): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), + (True, False)): + ex = '{0} {1} {2}'.format(lhs, op, rhs) + res = self.eval(ex) + exp = eval(ex) + self.assertEqual(res, exp) + + def test_bool_ops_with_constants(self): + for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), + ('True', 'False')): + ex = '{0} {1} {2}'.format(lhs, op, rhs) + res = self.eval(ex) + exp = eval(ex) + self.assertEqual(res, exp) + + def test_panel_fails(self): + x = Panel(randn(3, 4, 5)) + y = Series(randn(10)) + assert_raises(NotImplementedError, self.eval, 'x + y', + local_dict={'x': x, 'y': y}) + + def test_4d_ndarray_fails(self): + x = randn(3, 4, 5, 6) + y = Series(randn(10)) + assert_raises(NotImplementedError, self.eval, 'x + y', + local_dict={'x': x, 'y': y}) + + def test_constant(self): + x = self.eval('1') + assert_equal(x, 1) + + def test_single_variable(self): + df = DataFrame(randn(10, 2)) + df2 = self.eval('df', local_dict={'df': df}) + assert_frame_equal(df, df2) + + def test_truediv(self): + s = np.array([1]) + ex = 's / 1' + d = {'s': s} + + if PY3: + res = self.eval(ex, truediv=False, local_dict=d) + assert_array_equal(res, np.array([1.0])) + + res = self.eval(ex, truediv=True, local_dict=d) + assert_array_equal(res, np.array([1.0])) + + res = self.eval('1 / 2', truediv=True) + expec = 0.5 + self.assertEqual(res, expec) + + res = self.eval('1 / 2', truediv=False) + expec = 0.5 + self.assertEqual(res, 
expec) + + res = self.eval('s / 2', truediv=False, local_dict={'s': s}) + expec = 0.5 + self.assertEqual(res, expec) + + res = self.eval('s / 2', truediv=True, local_dict={'s': s}) + expec = 0.5 + self.assertEqual(res, expec) + else: + res = self.eval(ex, truediv=False, local_dict=d) + assert_array_equal(res, np.array([1])) + + res = self.eval(ex, truediv=True, local_dict=d) + assert_array_equal(res, np.array([1.0])) + + res = self.eval('1 / 2', truediv=True) + expec = 0.5 + self.assertEqual(res, expec) + + res = self.eval('1 / 2', truediv=False) + expec = 0 + self.assertEqual(res, expec) + + res = self.eval('s / 2', truediv=False, local_dict={'s': s}) + expec = 0 + self.assertEqual(res, expec) + + res = self.eval('s / 2', truediv=True, local_dict={'s': s}) + expec = 0.5 + self.assertEqual(res, expec) + + def test_failing_subscript_with_name_error(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NameError, self.eval, 'df[x > 2] > 2', + local_dict={'df': df}) + + def test_lhs_expression_subscript(self): + df = DataFrame(np.random.randn(5, 3)) + result = self.eval('(df + 1)[df > 2]', local_dict={'df': df}) + expected = (df + 1)[df > 2] + assert_frame_equal(result, expected) + + def test_attr_expression(self): + df = DataFrame(np.random.randn(5, 3), columns=list('abc')) + expr1 = 'df.a < df.b' + expec1 = df.a < df.b + expr2 = 'df.a + df.b + df.c' + expec2 = df.a + df.b + df.c + expr3 = 'df.a + df.b + df.c[df.b < 0]' + expec3 = df.a + df.b + df.c[df.b < 0] + exprs = expr1, expr2, expr3 + expecs = expec1, expec2, expec3 + for e, expec in zip(exprs, expecs): + assert_series_equal(expec, self.eval(e, local_dict={'df': df})) + + def test_assignment_fails(self): + df = DataFrame(np.random.randn(5, 3), columns=list('abc')) + df2 = DataFrame(np.random.randn(5, 3)) + expr1 = 'df = df2' + self.assertRaises(NotImplementedError, self.eval, expr1, + local_dict={'df': df, 'df2': df2}) + + def test_basic_period_index_boolean_expression(self): + df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + + e = df < 2 + r = self.eval('df < 2', local_dict={'df': df}) + x = df < 2 + + assert_frame_equal(r, e) + assert_frame_equal(x, e) + + def test_basic_period_index_subscript_expression(self): + df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + r = self.eval('df[df < 2 + 3]', local_dict={'df': df}) + e = df[df < 2 + 3] + assert_frame_equal(r, e) + + def test_nested_period_index_subscript_expression(self): + df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + r = self.eval('df[df[df < 2] < 2] + df * 2', local_dict={'df': df}) + e = df[df[df < 2] < 2] + df * 2 + assert_frame_equal(r, e) + + def test_date_boolean(self): + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + res = self.eval('df.dates1 < 20130101', local_dict={'df': df}, + engine=self.engine, parser=self.parser) + expec = df.dates1 < '20130101' + assert_series_equal(res, expec) + + def test_simple_in_ops(self): + if self.parser != 'python': + res = pd.eval('1 in [1, 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('2 in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('3 in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertFalse(res) + + res = pd.eval('3 not in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('[3] not in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('[3] in ([3], 2)', 
engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('[[3]] in [[[3]], 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('(3,) in [(3,), 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('(3,) not in [(3,), 2]', engine=self.engine, + parser=self.parser) + self.assertFalse(res) + + res = pd.eval('[(3,)] in [[(3,)], 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + else: + with tm.assertRaises(NotImplementedError): + pd.eval('1 in [1, 2]', engine=self.engine, parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('2 in (1, 2)', engine=self.engine, parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('3 in (1, 2)', engine=self.engine, parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('3 not in (1, 2)', engine=self.engine, + parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('[(3,)] in (1, 2, [(3,)])', engine=self.engine, + parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('[3] not in (1, 2, [[3]])', engine=self.engine, + parser=self.parser) + + +class TestOperationsNumExprPython(TestOperationsNumExprPandas): + @classmethod + def setUpClass(cls): + if not _USE_NUMEXPR: + raise nose.SkipTest("numexpr engine not installed") + cls.engine = 'numexpr' + cls.parser = 'python' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), + cls.arith_ops) + + def test_fails_and(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'df > 2 and df > 3', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_fails_or(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'df > 2 or df > 3', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_fails_not(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'not df > 2', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_fails_ampersand(self): + df = DataFrame(np.random.randn(5, 3)) + ex = '(df + 2)[df > 1] > 0 & (df > 0)' + with tm.assertRaises(NotImplementedError): + pd.eval(ex, parser=self.parser, engine=self.engine) + + def test_fails_pipe(self): + df = DataFrame(np.random.randn(5, 3)) + ex = '(df + 2)[df > 1] > 0 | (df > 0)' + with tm.assertRaises(NotImplementedError): + pd.eval(ex, parser=self.parser, engine=self.engine) + + def test_bool_ops_with_constants(self): + for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), + ('True', 'False')): + ex = '{0} {1} {2}'.format(lhs, op, rhs) + if op in ('and', 'or'): + with tm.assertRaises(NotImplementedError): + self.eval(ex) + else: + res = self.eval(ex) + exp = eval(ex) + self.assertEqual(res, exp) + + def test_simple_bool_ops(self): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), + (True, False)): + ex = 'lhs {0} rhs'.format(op) + if op in ('and', 'or'): + with tm.assertRaises(NotImplementedError): + pd.eval(ex, engine=self.engine, parser=self.parser) + else: + res = pd.eval(ex, engine=self.engine, parser=self.parser) + exp = eval(ex) + self.assertEqual(res, exp) + + +class TestOperationsPythonPython(TestOperationsNumExprPython): + @classmethod + def setUpClass(cls): + cls.engine = cls.parser = 'python' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + 
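# Note: 'in' and 'not in' are only supported by the 'pandas' parser; the
# plain 'python' parser raises NotImplementedError for them (see
# test_simple_in_ops above), which is why they are filtered out of the
# operator list exercised by this class.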
cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), + cls.arith_ops) + + +class TestOperationsPythonPandas(TestOperationsNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine = 'python' + cls.parser = 'pandas' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + + +_var_s = randn(10) + + +class TestScope(object): + def check_global_scope(self, e, engine, parser): + skip_if_no_ne(engine) + assert_array_equal(_var_s * 2, pd.eval(e, engine=engine, + parser=parser)) + + def test_global_scope(self): + e = '_var_s * 2' + for engine, parser in product(_engines, expr._parsers): + yield self.check_global_scope, e, engine, parser + + def check_no_new_locals(self, engine, parser): + skip_if_no_ne(engine) + x = 1 + lcls = locals().copy() + pd.eval('x + 1', local_dict=lcls, engine=engine, parser=parser) + lcls2 = locals().copy() + lcls2.pop('lcls') + assert_equal(lcls, lcls2) + + def test_no_new_locals(self): + for engine, parser in product(_engines, expr._parsers): + yield self.check_no_new_locals, engine, parser + + def check_no_new_globals(self, engine, parser): + skip_if_no_ne(engine) + x = 1 + gbls = globals().copy() + pd.eval('x + 1', engine=engine, parser=parser) + gbls2 = globals().copy() + assert_equal(gbls, gbls2) + + def test_no_new_globals(self): + for engine, parser in product(_engines, expr._parsers): + yield self.check_no_new_globals, engine, parser + + +def test_invalid_engine(): + skip_if_no_ne() + assertRaisesRegexp(KeyError, 'Invalid engine \'asdf\' passed', + pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, + engine='asdf') + + +def test_invalid_parser(): + skip_if_no_ne() + assertRaisesRegexp(KeyError, 'Invalid parser \'asdf\' passed', + pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, + parser='asdf') + + +def check_is_expr_syntax(engine): + skip_if_no_ne(engine) + s = 1 + valid1 = 's + 1' + valid2 = '__y + _xx' + assert_true(expr.isexpr(valid1, check_names=False)) + assert_true(expr.isexpr(valid2, check_names=False)) + + +def check_is_expr_names(engine): + skip_if_no_ne(engine) + r, s = 1, 2 + valid = 's + r' + invalid = '__y + __x' + assert_true(expr.isexpr(valid, check_names=True)) + assert_false(expr.isexpr(invalid, check_names=True)) + + +def test_is_expr_syntax(): + for engine in _engines: + yield check_is_expr_syntax, engine + + +def test_is_expr_names(): + for engine in _engines: + yield check_is_expr_names, engine + + +_parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, + 'pandas': PandasExprVisitor} + +def check_disallowed_nodes(engine, parser): + skip_if_no_ne(engine) + VisitorClass = _parsers[parser] + uns_ops = VisitorClass.unsupported_nodes + inst = VisitorClass('x + 1', engine, parser) + + for ops in uns_ops: + assert_raises(NotImplementedError, getattr(inst, ops)) + + +def test_disallowed_nodes(): + for engine, visitor in product(_parsers, repeat=2): + yield check_disallowed_nodes, engine, visitor + + +def check_syntax_error_exprs(engine, parser): + skip_if_no_ne(engine) + e = 's +' + assert_raises(SyntaxError, pd.eval, e, engine=engine, parser=parser) + + +def test_syntax_error_exprs(): + for engine, parser in ENGINES_PARSERS: + yield check_syntax_error_exprs, engine, parser + + +def check_name_error_exprs(engine, parser): + skip_if_no_ne(engine) + e = 's + t' + assert_raises(NameError, pd.eval, e, engine=engine, parser=parser) + + +def test_name_error_exprs(): + for engine, parser in ENGINES_PARSERS: + yield check_name_error_exprs, engine, parser + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, 
'-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/core/base.py b/pandas/core/base.py index a2f7f04053b9f..fb0d56113ede9 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -48,6 +48,7 @@ def __repr__(self): """ return str(self) + class PandasObject(StringMixin): """baseclass for various pandas objects""" diff --git a/pandas/core/common.py b/pandas/core/common.py index 34aaa08b57171..d3fa10abc7681 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -3,17 +3,25 @@ """ import re +import collections +import numbers import codecs import csv import sys +from datetime import timedelta + +from distutils.version import LooseVersion + from numpy.lib.format import read_array, write_array import numpy as np + import pandas.algos as algos import pandas.lib as lib import pandas.tslib as tslib from pandas import compat -from pandas.compat import StringIO, BytesIO, range, long, u, zip, map +from pandas.compat import (StringIO, BytesIO, range, long, u, zip, map, + string_types) from datetime import timedelta from pandas.core.config import get_option @@ -27,14 +35,18 @@ class AmbiguousIndexError(PandasError, KeyError): pass _POSSIBLY_CAST_DTYPES = set([np.dtype(t) - for t in ['M8[ns]', 'm8[ns]', 'O', 'int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64']]) + for t in ['M8[ns]', 'm8[ns]', 'O', 'int8', + 'uint8', 'int16', 'uint16', 'int32', + 'uint32', 'int64', 'uint64']]) _NS_DTYPE = np.dtype('M8[ns]') _TD_DTYPE = np.dtype('m8[ns]') _INT64_DTYPE = np.dtype(np.int64) _DATELIKE_DTYPES = set([np.dtype(t) for t in ['M8[ns]', 'm8[ns]']]) -# define abstract base classes to enable isinstance type checking on our objects + +# define abstract base classes to enable isinstance type checking on our +# objects def create_pandas_abc_type(name, attr, comp): @classmethod def _check(cls, inst): @@ -44,15 +56,22 @@ def _check(cls, inst): meta = type("ABCBase", (type,), dct) return meta(name, tuple(), dct) + ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) -ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", ('sparse_series', 'sparse_time_series')) -ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp", ('sparse_array', 'sparse_series')) +ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", + ('sparse_series', + 'sparse_time_series')) +ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp", + ('sparse_array', 'sparse_series')) + class _ABCGeneric(type): def __instancecheck__(cls, inst): return hasattr(inst, "_data") + + ABCGeneric = _ABCGeneric("ABCGeneric", tuple(), {}) def isnull(obj): @@ -223,6 +242,35 @@ def notnull(obj): return -res +def _iterable_not_string(x): + return (isinstance(x, collections.Iterable) and + not isinstance(x, compat.string_types)) + + +def flatten(l): + """Flatten an arbitrarily nested sequence. + + Parameters + ---------- + l : sequence + The non-string sequence to flatten + + Notes + ----- + This doesn't treat strings as sequences.
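# A small usage sketch of the flatten() generator above (illustrative,
# not part of the patch). Strings are yielded whole rather than
# iterated, so:
#
#   >>> list(flatten([1, [2, [3, 'ab']]]))
#   [1, 2, 3, 'ab']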
+ + Returns + ------- + flattened : generator + """ + for el in l: + if _iterable_not_string(el): + for s in flatten(el): + yield s + else: + yield el + + def mask_missing(arr, values_to_mask): """ Return a masking array of same size/shape as arr @@ -1657,7 +1705,7 @@ def is_bool(obj): def is_integer(obj): - return isinstance(obj, (int, long, np.integer)) + return isinstance(obj, (numbers.Integral, np.integer)) def is_float(obj): @@ -1665,7 +1713,7 @@ def is_float(obj): def is_complex(obj): - return isinstance(obj, (complex, np.complexfloating)) + return isinstance(obj, (numbers.Complex, np.complexfloating)) def is_iterator(obj): @@ -1674,7 +1722,7 @@ def is_iterator(obj): def is_number(obj): - return isinstance(obj, (np.number, int, long, float, complex)) + return isinstance(obj, (numbers.Number, np.number)) def is_integer_dtype(arr_or_dtype): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f56b6bc00cf15..86565a3a1d9e5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -28,15 +28,16 @@ _coerce_to_dtypes, _DATELIKE_DTYPES, is_list_like) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, - _convert_to_index_sliceable, _check_bool_indexer, - _maybe_convert_indices) +from pandas.core.indexing import (_maybe_droplevels, + _convert_to_index_sliceable, + _check_bool_indexer, _maybe_convert_indices) from pandas.core.internals import (BlockManager, create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series, _radd_compat -import pandas.core.expressions as expressions -from pandas.sparse.array import SparseArray +import pandas.computation.expressions as expressions +from pandas.computation.eval import eval as _eval +from pandas.computation.expr import _ensure_scope from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -51,14 +52,12 @@ import pandas.core.datetools as datetools import pandas.core.common as com import pandas.core.format as fmt -import pandas.core.generic as generic import pandas.core.nanops as nanops import pandas.lib as lib -import pandas.tslib as tslib import pandas.algos as _algos -from pandas.core.config import get_option, set_option +from pandas.core.config import get_option #---------------------------------------------------------------------- # Docstring templates @@ -1898,6 +1897,155 @@ def _getitem_frame(self, key): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) + def _get_index_resolvers(self, axis): + # index or columns + axis_index = getattr(self, axis) + d = dict() + + for i, name in enumerate(axis_index.names): + if name is not None: + key = level = name + else: + # prefix with 'i' or 'c' depending on the input axis + # e.g., you must do ilevel_0 for the 0th level of an unnamed + # multiiindex + level_string = '{prefix}level_{i}'.format(prefix=axis[0], i=i) + key = level_string + level = i + + d[key] = Series(axis_index.get_level_values(level).values, + index=axis_index, name=level) + + # put the index/columns itself in the dict + d[axis] = axis_index + return d + + def query(self, expr, **kwargs): + """Query the columns of a frame with a boolean expression. + + Parameters + ---------- + expr : string + The query string to evaluate. 
The result of the evaluation of this + expression is first passed to :attr:`~pandas.DataFrame.loc` and if + that fails because of a multidimensional key (e.g., a DataFrame) + then the result will be passed to + :meth:`~pandas.DataFrame.__getitem__`. + kwargs : dict + See the documentation for :func:`~pandas.eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + q : DataFrame or Series + + Notes + ----- + This method uses the top-level :func:`~pandas.eval` function to + evaluate the passed query. + + The :meth:`~pandas.DataFrame.query` method uses a slightly + modified Python syntax by default. For example, the ``&`` and ``|`` + (bitwise) operators have the precedence of their boolean cousins, + :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python; + however, the semantics are different. + + You can change the semantics of the expression by passing the keyword + argument ``parser='python'``. This enforces the same semantics as + evaluation in Python space. Likewise, you can pass ``engine='python'`` + to evaluate an expression using Python itself as a backend. This is not + recommended as it is inefficient compared to using ``numexpr`` as the + engine. + + The :attr:`~pandas.DataFrame.index` and + :attr:`~pandas.DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the namespace by + default, which allows you to treat both the index and columns of the + frame as columns in the frame. + The identifier ``index`` is used for this variable, and you can also + use the name of the index to identify it in a query. + + For further details and examples see the ``query`` documentation in + :ref:`indexing `. + + See Also + -------- + pandas.eval + DataFrame.eval + + Examples + -------- + >>> from numpy.random import randn + >>> from pandas import DataFrame + >>> df = DataFrame(randn(10, 2), columns=list('ab')) + >>> df.query('a > b') + >>> df[df.a > df.b] # same result as the previous expression + """ + # need to go up at least 4 stack frames + # 4 expr.Scope + # 3 expr._ensure_scope + # 2 self.eval + # 1 self.query + # 0 self.query caller (implicit) + level = kwargs.setdefault('level', 4) + if level < 4: + raise ValueError("Going up fewer than 4 stack frames will not" + " capture the necessary variable scope for a " + "query expression") + + res = self.eval(expr, **kwargs) + + try: + return self.loc[res] + except ValueError: + # when res is multi-dimensional loc raises, but this is sometimes a + # valid query + return self[res] + + def eval(self, expr, **kwargs): + """Evaluate an expression in the context of the calling DataFrame + instance. + + Parameters + ---------- + expr : string + The expression string to evaluate. + kwargs : dict + See the documentation for :func:`~pandas.eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.eval`. + + Returns + ------- + ret : ndarray, scalar, or pandas object + + See Also + -------- + pandas.DataFrame.query + pandas.eval + + Notes + ----- + For more details see the API documentation for :func:`~pandas.eval`. + For detailed examples see :ref:`enhancing performance with eval + `.
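# A short, self-contained sketch of how DataFrame.eval and
# DataFrame.query compose (illustrative, not part of the patch; assumes
# a numexpr-enabled build, and the column names are arbitrary):

import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.randn(10, 2), columns=list('ab'))

total = df.eval('a + b')    # equivalent to df.a + df.b
hits = df.query('a > b')    # equivalent to df[df.a > df.b]

# the index is in scope as well, under the identifier 'index'
first_half = df.query('index < 5')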
+ + Examples + -------- + >>> from numpy.random import randn + >>> from pandas import DataFrame + >>> df = DataFrame(randn(10, 2), columns=list('ab')) + >>> df.eval('a + b') + """ + resolvers = kwargs.pop('resolvers', None) + if resolvers is None: + index_resolvers = self._get_index_resolvers('index') + index_resolvers.update(self._get_index_resolvers('columns')) + resolvers = [self, index_resolvers] + kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs) + return _eval(expr, **kwargs) + def _slice(self, slobj, axis=0, raise_on_error=False): axis = self._get_block_manager_axis(axis) new_data = self._data.get_slice( @@ -4599,6 +4747,7 @@ def combineMult(self, other): DataFrame._setup_axes( ['index', 'columns'], info_axis=1, stat_axis=0, axes_are_reversed=True) + _EMPTY_SERIES = Series([]) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c265d1590af95..11ce27b078b18 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -18,8 +18,7 @@ from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib import pandas.tslib as tslib -import pandas.core.expressions as expressions -from pandas.util.decorators import cache_readonly +import pandas.computation.expressions as expressions from pandas.tslib import Timestamp from pandas import compat diff --git a/pandas/core/series.py b/pandas/core/series.py index 893483f0f2636..beb398dfe6fd0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -8,7 +8,6 @@ import operator from distutils.version import LooseVersion import types -import warnings from numpy import nan, ndarray import numpy as np @@ -18,8 +17,10 @@ _default_index, _maybe_promote, _maybe_upcast, _asarray_tuplesafe, is_integer_dtype, _NS_DTYPE, _TD_DTYPE, - _infer_dtype_from_scalar, is_list_like, _values_from_object, - _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, + _infer_dtype_from_scalar, is_list_like, + _values_from_object, + _possibly_cast_to_datetime, _possibly_castable, + _possibly_convert_platform, ABCSparseArray) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) @@ -29,7 +30,6 @@ from pandas.core import generic from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical -import pandas.core.expressions as expressions from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex, Period from pandas.tseries.offsets import DateOffset @@ -775,12 +775,9 @@ def put(self, *args, **kwargs): def __len__(self): return len(self._data) - @property - def size(self): - return self.__len__() - def view(self, dtype=None): - return self._constructor(self.values.view(dtype), index=self.index, name=self.name) + return self._constructor(self.values.view(dtype), index=self.index, + name=self.name) def __array__(self, result=None): """ the array interface, return my values """ @@ -790,7 +787,8 @@ def __array_wrap__(self, result): """ Gets called prior to a ufunc (and after) """ - return self._constructor(result, index=self.index, name=self.name, copy=False) + return self._constructor(result, index=self.index, name=self.name, + copy=False) def __contains__(self, key): return key in self.index diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c8224f761ce17..b79408a1bf8d2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2,12 +2,9 @@ High level interface to PyTables for reading and writing pandas data structures to disk """ -from 
__future__ import print_function # pylint: disable-msg=E1101,W0613,W0603 from datetime import datetime, date -from pandas.compat import map, range, zip, lrange, lmap, u -from pandas import compat import time import re import copy @@ -15,14 +12,13 @@ import warnings import numpy as np -import pandas from pandas import (Series, TimeSeries, DataFrame, Panel, Panel4D, Index, MultiIndex, Int64Index, Timestamp, _np_version_under1p7) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.tseries.api import PeriodIndex, DatetimeIndex from pandas.core.base import StringMixin -from pandas.core.common import adjoin, is_list_like, pprint_thing +from pandas.core.common import adjoin, pprint_thing from pandas.core.algorithms import match, unique from pandas.core.categorical import Categorical from pandas.core.common import _asarray_tuplesafe @@ -33,8 +29,10 @@ import pandas.core.common as com from pandas.tools.merge import concat from pandas import compat +from pandas.compat import u, PY3, range, lrange from pandas.io.common import PerformanceWarning from pandas.core.config import get_option +from pandas.computation.pytables import Expr, maybe_expression import pandas.lib as lib import pandas.algos as algos @@ -59,11 +57,31 @@ def _ensure_decoded(s): def _ensure_encoding(encoding): # set the encoding if we need if encoding is None: - if compat.PY3: + if PY3: encoding = _default_encoding return encoding +Term = Expr + + +def _ensure_term(where): + """ + Ensure that where is a Term or a list of Terms. + This makes sure that we capture the scope of any variables that + are passed; the terms are created here with frame_level=2 + (we are two levels down the call stack). + """ + + # only consider list/tuple here, as an ndarray is automatically a + # coordinate list + if isinstance(where, (list, tuple)): + where = [w if not maybe_expression(w) else Term(w, scope_level=2) + for w in where if w is not None] + elif maybe_expression(where): + where = Term(where, scope_level=2) + return where + + class PossibleDataLossError(Exception): pass @@ -222,9 +240,12 @@ def get_store(path, **kwargs): Examples -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) >>> with get_store('test.h5') as store: - >>> store['foo'] = bar # write to HDF5 - >>> bar = store['foo'] # retrieve + ... store['foo'] = bar # write to HDF5 + ...
bar = store['foo'] # retrieve """ store = None try: @@ -237,7 +258,8 @@ def get_store(path, **kwargs): # interface to/from ### -def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs): +def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, + append=None, **kwargs): """ store this object, close it if we opened it """ if append: f = lambda store: store.append(key, value, **kwargs) @@ -245,7 +267,8 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, app f = lambda store: store.put(key, value, **kwargs) if isinstance(path_or_buf, compat.string_types): - with get_store(path_or_buf, mode=mode, complevel=complevel, complib=complib) as store: + with get_store(path_or_buf, mode=mode, complevel=complevel, + complib=complib) as store: f(store) else: f(path_or_buf) @@ -332,6 +355,9 @@ class HDFStore(StringMixin): Examples -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) >>> store = HDFStore('test.h5') >>> store['foo'] = bar # write to HDF5 >>> bar = store['foo'] # retrieve @@ -341,9 +367,9 @@ class HDFStore(StringMixin): def __init__(self, path, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs): try: - import tables as _ + import tables except ImportError: # pragma: no cover - raise Exception('HDFStore requires PyTables') + raise ImportError('HDFStore requires PyTables') self._path = path if mode is None: @@ -477,7 +503,7 @@ def open(self, mode='a'): self._handle = h5_open(self._path, self._mode) except IOError as e: # pragma: no cover if 'can not be written' in str(e): - print('Opening %s in read-only mode' % self._path) + print ('Opening %s in read-only mode' % self._path) self._handle = h5_open(self._path, 'r') else: raise @@ -523,7 +549,8 @@ def get(self, key): raise KeyError('No object named %s in the file' % key) return self._read_group(group) - def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + def select(self, key, where=None, start=None, stop=None, columns=None, + iterator=False, chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -549,22 +576,28 @@ def select(self, key, where=None, start=None, stop=None, columns=None, iterator= raise KeyError('No object named %s in the file' % key) # create the storer and axes + where = _ensure_term(where) s = self._create_storer(group) s.infer_axes() # what we are actually going to do for a chunk def func(_start, _stop): - return s.read(where=where, start=_start, stop=_stop, columns=columns, **kwargs) + return s.read(where=where, start=_start, stop=_stop, + columns=columns, **kwargs) if iterator or chunksize is not None: if not s.is_table: raise TypeError( "can only use an iterator or chunksize on a table") - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close) + return TableIterator(self, func, nrows=s.nrows, start=start, + stop=stop, chunksize=chunksize, + auto_close=auto_close) - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, auto_close=auto_close).get_values() + return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, + auto_close=auto_close).get_values() - def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): + def select_as_coordinates( + self, key, where=None, start=None, stop=None, 
**kwargs): """ return the selection as an Index @@ -575,6 +608,7 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection """ + where = _ensure_term(where) return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs) def unique(self, key, column, **kwargs): @@ -599,7 +633,9 @@ def select_column(self, key, column, **kwargs): """ return self.get_storer(key).read_column(column=column, **kwargs) - def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + def select_as_multiple(self, keys, where=None, selector=None, columns=None, + start=None, stop=None, iterator=False, + chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas objects from multiple tables Parameters @@ -618,16 +654,19 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star """ # default to single select + where = _ensure_term(where) if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, compat.string_types): - return self.select(key=keys, where=where, columns=columns, start=start, stop=stop, iterator=iterator, chunksize=chunksize, **kwargs) + return self.select(key=keys, where=where, columns=columns, + start=start, stop=stop, iterator=iterator, + chunksize=chunksize, **kwargs) if not isinstance(keys, (list, tuple)): - raise Exception("keys must be a list/tuple") + raise TypeError("keys must be a list/tuple") - if len(keys) == 0: - raise Exception("keys must have a non-zero length") + if not len(keys): + raise ValueError("keys must have a non-zero length") if selector is None: selector = keys[0] @@ -642,7 +681,8 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star raise TypeError("Invalid table [%s]" % k) if not t.is_table: raise TypeError( - "object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname) + "object [%s] is not a table, and cannot be used in all select as multiple" % + t.pathname) if nrows is None: nrows = t.nrows @@ -655,7 +695,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star c = self.select_as_coordinates( selector, where, start=start, stop=stop) nrows = len(c) - except (Exception) as detail: + except Exception: raise ValueError("invalid selector [%s]" % selector) def func(_start, _stop): @@ -720,6 +760,7 @@ def remove(self, key, where=None, start=None, stop=None): raises KeyError if key is not a valid store """ + where = _ensure_term(where) try: s = self.get_storer(key) except: @@ -777,8 +818,8 @@ def append(self, key, value, format=None, append=True, columns=None, dropna=None data in the table, so be careful """ if columns is not None: - raise Exception( - "columns is not a supported keyword in append, try data_columns") + raise TypeError("columns is not a supported keyword in append, " + "try data_columns") if dropna is None: dropna = get_option("io.hdf.dropna_table") @@ -809,8 +850,9 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, d """ if axes is not None: - raise Exception( - "axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead") + raise TypeError("axes is currently not accepted as a parameter to" + " append_to_multiple; you can create the " + "tables independently
instead") if not isinstance(d, dict): raise ValueError( @@ -876,7 +918,7 @@ def create_table_index(self, key, **kwargs): # version requirements _tables() if not _table_supports_index: - raise Exception("PyTables >= 2.3 is required for table indexing") + raise ValueError("PyTables >= 2.3 is required for table indexing") s = self.get_storer(key) if s is None: @@ -930,7 +972,11 @@ def copy( """ new_store = HDFStore( - file, mode=mode, complib=complib, complevel=complevel, fletcher32 = fletcher32) + file, + mode=mode, + complib=complib, + complevel=complevel, + fletcher32=fletcher32) if keys is None: keys = list(self.keys()) if not isinstance(keys, (tuple, list)): @@ -1142,7 +1188,8 @@ class TableIterator(object): kwargs : the passed kwargs """ - def __init__(self, store, func, nrows, start=None, stop=None, chunksize=None, auto_close=False): + def __init__(self, store, func, nrows, start=None, stop=None, + chunksize=None, auto_close=False): self.store = store self.func = func self.nrows = nrows or 0 @@ -1251,7 +1298,12 @@ def set_table(self, table): def __unicode__(self): temp = tuple( - map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))) + map(pprint_thing, + (self.name, + self.cname, + self.axis, + self.pos, + self.kind))) return "name->%s,cname->%s,axis->%s,pos->%s,kind->%s" % temp def __eq__(self, other): @@ -1361,9 +1413,7 @@ def validate_col(self, itemsize=None): """ validate this column: return the compared against itemsize """ # validate this column for string truncation (or reset to the max size) - dtype = getattr(self, 'dtype', None) if _ensure_decoded(self.kind) == u('string'): - c = self.col if c is not None: if itemsize is None: @@ -1467,7 +1517,8 @@ class DataCol(IndexCol): _info_fields = ['tz'] @classmethod - def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs): + def create_for_block( + cls, i=None, name=None, cname=None, version=None, **kwargs): """ return a new datacol with the block i """ if cname is None: @@ -1487,11 +1538,12 @@ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs) return cls(name=name, cname=cname, **kwargs) - def __init__(self, values=None, kind=None, typ=None, cname=None, data=None, block=None, **kwargs): + def __init__(self, values=None, kind=None, typ=None, + cname=None, data=None, block=None, **kwargs): super(DataCol, self).__init__( values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None - self.dtype_attr = u("%s_dtype") % self.name + self.dtype_attr = u("%s_dtype" % self.name) self.set_data(data) def __unicode__(self): @@ -1540,7 +1592,8 @@ def set_kind(self): if self.typ is None: self.typ = getattr(self.description, self.cname, None) - def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=None, **kwargs): + def set_atom(self, block, existing_col, min_itemsize, + nan_rep, info, encoding=None, **kwargs): """ create and setup my atom from the block b """ self.values = list(block.items) @@ -1596,7 +1649,11 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No # end up here ### elif inferred_type == 'string' or dtype == 'object': self.set_atom_string( - block, existing_col, min_itemsize, nan_rep, encoding) + block, + existing_col, + min_itemsize, + nan_rep, + encoding) else: self.set_atom_data(block) @@ -1605,7 +1662,8 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, 
shape=block.shape[0]) - def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): + def set_atom_string( + self, block, existing_col, min_itemsize, nan_rep, encoding): # fill nan items with myself block = block.fillna(nan_rep)[0] data = block.values @@ -1701,13 +1759,13 @@ def validate_attr(self, append): if (existing_fields is not None and existing_fields != list(self.values)): raise ValueError("appended items do not match existing items" - " in table!") + " in table!") existing_dtype = getattr(self.attrs, self.dtype_attr, None) if (existing_dtype is not None and existing_dtype != self.dtype): raise ValueError("appended items dtype do not match existing items dtype" - " in table!") + " in table!") def convert(self, values, nan_rep, encoding): """ set the data from this selection (and convert to the correct dtype if we can) """ @@ -1855,6 +1913,9 @@ def __unicode__(self): return "%-12.12s (shape->%s)" % (self.pandas_type, s) return self.pandas_type + def __str__(self): + return self.__repr__() + def set_object_info(self): """ set my pandas type & version """ self.attrs.pandas_type = str(self.pandas_kind) @@ -2058,7 +2119,7 @@ def read_index(self, key): _, index = self.read_index_node(getattr(self.group, key)) return index else: # pragma: no cover - raise Exception('unrecognized index variety: %s' % variety) + raise TypeError('unrecognized index variety: %s' % variety) def write_index(self, key, index): if isinstance(index, MultiIndex): @@ -2241,7 +2302,7 @@ def write_array(self, key, value, items=None): warnings.warn(ws, PerformanceWarning) vlarr = self._handle.createVLArray(self.group, key, - _tables().ObjectAtom()) + _tables().ObjectAtom()) vlarr.append(value) elif value.dtype.type == np.datetime64: self._handle.createArray(self.group, key, value.view('i8')) @@ -2381,7 +2442,6 @@ def read(self, **kwargs): sdict = {} for name in items: key = 'sparse_frame_%s' % name - node = getattr(self.group, key) s = SparseFrameFixed(self.parent, getattr(self.group, key)) s.infer_axes() sdict[name] = s.read() @@ -2574,7 +2634,8 @@ def validate(self, other): oax = ov[i] if sax != oax: raise ValueError( - "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c, sax, oax)) + "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % + (c, sax, oax)) # should never get here raise Exception( @@ -2706,14 +2767,14 @@ def validate_min_itemsize(self, min_itemsize): continue if k not in q: raise ValueError( - "min_itemsize has the key [%s] which is not an axis or data_column" % k) + "min_itemsize has the key [%s] which is not an axis or data_column" % + k) @property def indexables(self): """ create/cache the indexables if they don't exist """ if self._indexables is None: - d = self.description self._indexables = [] # index columns @@ -2848,7 +2909,8 @@ def validate_data_columns(self, data_columns, min_itemsize): # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] - def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): + def create_axes(self, axes, obj, validate=True, nan_rep=None, + data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields @@ -2869,8 +2931,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, try: axes = _AXES_MAP[type(obj)] except: - raise TypeError( - "cannot properly create the storer for: 
[group->%s,value->%s]" % + raise TypeError("cannot properly create the storer for: [group->%s,value->%s]" % (self.group._v_name, type(obj))) # map axes to numbers @@ -2995,8 +3056,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, try: existing_col = existing_table.values_axes[i] except: - raise ValueError( - "Incompatible appended table [%s] with existing table [%s]" % + raise ValueError("Incompatible appended table [%s] with existing table [%s]" % (blocks, existing_table.values_axes)) else: existing_col = None @@ -3036,8 +3096,8 @@ def process_axes(self, obj, columns=None): obj = _reindex_axis(obj, axis, labels, columns) # apply the selection filters (but keep in the same order) - if self.selection.filter: - for field, op, filt in self.selection.filter: + if self.selection.filter is not None: + for field, op, filt in self.selection.filter.format(): def process_filter(field, filt): @@ -3070,7 +3130,8 @@ def process_filter(field, filt): return obj - def create_description(self, complib=None, complevel=None, fletcher32=False, expectedrows=None): + def create_description( + self, complib=None, complevel=None, fletcher32=False, expectedrows=None): """ create the description of the table from the axes & values """ # expected rows estimate @@ -3119,8 +3180,8 @@ def read_column(self, column, where=None, **kwargs): return False if where is not None: - raise Exception( - "read_column does not currently accept a where clause") + raise TypeError("read_column does not currently accept a where " + "clause") # find the axes for a in self.axes: @@ -3128,7 +3189,8 @@ def read_column(self, column, where=None, **kwargs): if not a.is_data_indexable: raise ValueError( - "column [%s] can not be extracted individually; it is not data indexable" % column) + "column [%s] can not be extracted individually; it is not data indexable" % + column) # column must be an indexable or a data column c = getattr(self.table.cols, column) @@ -3174,7 +3236,7 @@ class LegacyTable(Table): ndim = 3 def write(self, **kwargs): - raise Exception("write operations are not allowed on legacy tables!") + raise TypeError("write operations are not allowed on legacy tables!") def read(self, where=None, columns=None, **kwargs): """ we have n indexable columns, with an arbitrary number of data axes """ @@ -3418,15 +3480,14 @@ def write_data_chunk(self, indexes, mask, values): rows = rows[~mask.ravel().astype(bool)] except Exception as detail: - raise Exception("cannot create row-data -> %s" % str(detail)) + raise Exception("cannot create row-data -> %s" % detail) try: if len(rows): self.table.append(rows) self.table.flush() except Exception as detail: - raise Exception( - "tables cannot write this data -> %s" % str(detail)) + raise TypeError("tables cannot write this data -> %s" % detail) def delete(self, where=None, **kwargs): @@ -3626,9 +3687,9 @@ def get_attrs(self): self.levels = [] t = self.table self.index_axes = [a.infer(t) - for a in self.indexables if a.is_an_indexable] + for a in self.indexables if a.is_an_indexable] self.values_axes = [a.infer(t) - for a in self.indexables if not a.is_an_indexable] + for a in self.indexables if not a.is_an_indexable] self.data_columns = [a.name for a in self.values_axes] @property @@ -3755,7 +3816,7 @@ def _convert_index(index, encoding=None): index_name=index_name) if isinstance(index, MultiIndex): - raise Exception('MultiIndex not supported here!') + raise TypeError('MultiIndex not supported here!') inferred_type = lib.infer_dtype(index) @@ -3904,32 +3965,13 @@ 
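# The hunk here replaces the old argument-style Term with string
# expressions parsed by pandas.computation.pytables.Expr (Term is kept
# as an alias; see the pytables.py changes above). A rough
# before/after sketch, based on the test changes later in this patch:
#
#   Term('index', '>', '20121114')   ->  Term('index>20121114')
#   Term('minor_axis', ['A', 'D'])   ->  Term("minor_axis=['A', 'D']")
#   Term('major_axis', date4)        ->  Term('major_axis=date4')
#
# where, in the last form, 'date4' is resolved from the caller's scope.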
def _need_convert(kind): return False -class Term(StringMixin): - - """create a term object that holds a field, op, and value - - Parameters - ---------- - field : dict, string term expression, or the field to operate (must be a valid index/column type of DataFrame/Panel) - op : a valid op (defaults to '=') (optional) - >, >=, <, <=, =, != (not equal) are allowed - value : a value or list of values (required) - queryables : a kinds map (dict of column name -> kind), or None i column is non-indexable - encoding : an encoding that will encode the query terms +class Coordinates(object): - Returns - ------- - a Term object + """ holds a returned coordinates list, useful to select the same rows from different tables - Examples - -------- - >>> Term(dict(field = 'index', op = '>', value = '20121114')) - >>> Term('index', '20121114') - >>> Term('index', '>', '20121114') - >>> Term('index', ['20121114','20121114']) - >>> Term('index', datetime(2012,11,14)) - >>> Term('major_axis>20121114') - >>> Term('minor_axis', ['A','U']) + coordinates : holds the array of coordinates + group : the source group + where : the source where """ _ops = ['<=', '<', '>=', '>', '!=', '==', '='] @@ -4134,23 +4176,13 @@ def stringify(value): return TermValue(v, stringify(v), u('string')) -class TermValue(object): - - """ hold a term value the we use to construct a condition/filter """ - def __init__(self, value, converted, kind): - self.value = value - self.converted = converted - self.kind = kind + def __len__(self): + return len(self.values) - def tostring(self, encoding): - """ quote the string if not encoded - else encode and return """ - if self.kind == u('string'): - if encoding is not None: - return self.converted - return '"%s"' % self.converted - return self.converted + def __getitem__(self, key): + """ return a new coordinates object, sliced by the key """ + return Coordinates(self.values[key], self.group, self.where) class Selection(object): @@ -4204,41 +4236,32 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.terms = self.generate(where) # create the numexpr & the filter - if self.terms: - terms = [t for t in self.terms if t.condition is not None] - if len(terms): - self.condition = "(%s)" % ' & '.join( - [t.condition for t in terms]) - self.filter = [] - for t in self.terms: - if t.filter is not None: - self.filter.append(t.filter) + if self.terms is not None: + self.condition, self.filter = self.terms.evaluate() def generate(self, where): """ where can be a : dict,list,tuple,string """ if where is None: return None - if not isinstance(where, (list, tuple)): - where = [where] - else: - - # make this a list of we think that we only have a sigle term & no - # operands inside any terms - if not any([isinstance(w, (list, tuple, Term)) for w in where]): - - if not any([isinstance(w, compat.string_types) and Term._search.match(w) for w in where]): - where = [where] + q = self.table.queryables() + try: + return Expr(where, queryables=q, encoding=self.table.encoding) + except (NameError) as detail: - queryables = self.table.queryables() - return [Term(c, queryables=queryables, encoding=self.table.encoding) for c in where] + # raise a nice message, suggesting that the user should use data_columns + raise ValueError("The passed where expression: {0}\n" + " contains an invalid variable reference\n" + " all of the variable references must be a reference to\n" + " an axis (e.g.
'index' or 'columns'), or a data_column\n" + " The currently defined references are: {1}\n".format(where,','.join(q.keys()))) def select(self): """ generate the selection """ if self.condition is not None: - return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop) + return self.table.table.readWhere(self.condition.format(), start=self.start, stop=self.stop) elif self.coordinates is not None: return self.table.table.readCoordinates(self.coordinates) return self.table.table.read(start=self.start, stop=self.stop) @@ -4250,7 +4273,7 @@ def select_coords(self): if self.condition is None: return np.arange(self.table.nrows) - return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort=True) + return self.table.table.getWhereList(self.condition.format(), start=self.start, stop=self.stop, sort=True) # utilities ### diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index 34b2811876f30..1cffccea2289f 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -277,7 +277,7 @@ def setUpClass(cls): except ImportError: raise nose.SkipTest - with assert_produces_warning(): + with assert_produces_warning(FutureWarning): cls.aapl = web.Options('aapl') today = datetime.today() diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 861b4dd7567a0..322b626acc0ad 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1,10 +1,9 @@ -from __future__ import print_function -from pandas.compat import range, lrange, u import nose import unittest -import os import sys +import os import warnings +from contextlib import contextmanager import datetime import numpy as np @@ -23,9 +22,8 @@ assert_series_equal) from pandas import concat, Timestamp from pandas import compat, _np_version_under1p7 -from pandas.core import common as com - -from numpy.testing.decorators import slow +from pandas.compat import range, lrange, u +from pandas.util.testing import assert_produces_warning try: import tables @@ -42,12 +40,12 @@ # contextmanager to ensure the file cleanup def safe_remove(path): if path is not None: - import os try: os.remove(path) except: pass + def safe_close(store): try: if store is not None: @@ -55,7 +53,6 @@ def safe_close(store): except: pass -from contextlib import contextmanager @contextmanager def ensure_clean(path, mode='a', complevel=None, complib=None, @@ -82,6 +79,7 @@ def _maybe_remove(store, key): except: pass + def compat_assert_produces_warning(w,f): """ don't produce a warning under PY3 """ if compat.PY3: @@ -90,6 +88,7 @@ def compat_assert_produces_warning(w,f): with tm.assert_produces_warning(expected_warning=w): f() + class TestHDFStore(unittest.TestCase): def setUp(self): @@ -329,8 +328,8 @@ def test_contains(self): self.assert_('bar' not in store) # GH 2694 - with tm.assert_produces_warning(expected_warning=tables.NaturalNameWarning): - store['node())'] = tm.makeDataFrame() + warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) + store['node())'] = tm.makeDataFrame() self.assert_('node())' in store) def test_versioning(self): @@ -751,7 +750,7 @@ def test_encoding(self): raise nose.SkipTest('system byteorder is not little, skipping test_encoding!') with ensure_clean(self.path) as store: - df = DataFrame(dict(A='foo',B='bar'),index=lrange(5)) + df = DataFrame(dict(A='foo',B='bar'),index=range(5)) df.loc[2,'A'] = np.nan df.loc[3,'B'] = np.nan _maybe_remove(store, 'df') @@ -887,16 +886,16 @@ def 
test_append_frame_column_oriented(self): expected = df.reindex(columns=['A']) tm.assert_frame_equal(expected, result) - # this isn't supported - self.assertRaises(TypeError, store.select, 'df1', ( - 'columns=A', Term('index', '>', df.index[4]))) - # selection on the non-indexable result = store.select( - 'df1', ('columns=A', Term('index', '=', df.index[0:4]))) + 'df1', ('columns=A', Term('index=df.index[0:4]'))) expected = df.reindex(columns=['A'], index=df.index[0:4]) tm.assert_frame_equal(expected, result) + # this isn't supported + self.assertRaises(TypeError, store.select, 'df1', ( + 'columns=A', Term('index>df.index[4]'))) + def test_append_with_different_block_ordering(self): #GH 4096; using same frames, but different block orderings @@ -905,7 +904,7 @@ def test_append_with_different_block_ordering(self): for i in range(10): df = DataFrame(np.random.randn(10,2),columns=list('AB')) - df['index'] = lrange(10) + df['index'] = range(10) df['index'] += i*10 df['int64'] = Series([1]*len(df),dtype='int64') df['int16'] = Series([1]*len(df),dtype='int16') @@ -1081,7 +1080,7 @@ def check_col(key,name,size): def check_col(key,name,size): self.assert_(getattr(store.get_storer(key).table.description,name).itemsize == size) - df = DataFrame(dict(A = 'foo', B = 'bar'),index=lrange(10)) + df = DataFrame(dict(A = 'foo', B = 'bar'),index=range(10)) # a min_itemsize that creates a data_column _maybe_remove(store, 'df') @@ -1134,7 +1133,7 @@ def test_append_with_data_columns(self): # data column searching (with an indexable and a data_columns) result = store.select( - 'df', [Term('B>0'), Term('index', '>', df.index[3])]) + 'df', [Term('B>0'), Term('index>df.index[3]')]) df_new = df.reindex(index=df.index[4:]) expected = df_new[df_new.B > 0] tm.assert_frame_equal(result, expected) @@ -1146,7 +1145,7 @@ def test_append_with_data_columns(self): df_new['string'][5:6] = 'bar' _maybe_remove(store, 'df') store.append('df', df_new, data_columns=['string']) - result = store.select('df', [Term('string', '=', 'foo')]) + result = store.select('df', [Term('string=foo')]) expected = df_new[df_new.string == 'foo'] tm.assert_frame_equal(result, expected) @@ -1192,14 +1191,14 @@ def check_col(key,name,size): _maybe_remove(store, 'df') store.append( 'df', df_new, data_columns=['A', 'B', 'string', 'string2']) - result = store.select('df', [Term('string', '=', 'foo'), Term( + result = store.select('df', [Term('string=foo'), Term( 'string2=foo'), Term('A>0'), Term('B<0')]) expected = df_new[(df_new.string == 'foo') & ( df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] tm.assert_frame_equal(result, expected) # yield an empty frame - result = store.select('df', [Term('string', '=', 'foo'), Term( + result = store.select('df', [Term('string=foo'), Term( 'string2=cool')]) expected = df_new[(df_new.string == 'foo') & ( df_new.string2 == 'cool')] @@ -1316,9 +1315,8 @@ def test_big_table_frame(self): raise nose.SkipTest('no big table frame') # create and write a big table - df = DataFrame(np.random.randn(2000 * 100, 100), - index=lrange(2000 * 100), - columns=['E%03d' % i for i in range(100)]) + df = DataFrame(np.random.randn(2000 * 100, 100), index=range( + 2000 * 100), columns=['E%03d' % i for i in range(100)]) for x in range(20): df['String%03d' % x] = 'string%03d' % x @@ -1328,8 +1326,9 @@ def test_big_table_frame(self): store.append('df', df) rows = store.root.df.table.nrows recons = store.select('df') + assert isinstance(recons, DataFrame) - print("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x)) + print 
("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x)) def test_big_table2_frame(self): # this is a really big table: 1m rows x 60 float columns, 20 string, 20 datetime @@ -1340,15 +1339,14 @@ def test_big_table2_frame(self): print ("\nbig_table2 start") import time start_time = time.time() - df = DataFrame(np.random.randn(1000 * 1000, 60), - index=lrange(int(1000 * 1000)), - columns=['E%03d' % i for i in range(60)]) + df = DataFrame(np.random.randn(1000 * 1000, 60), index=range(int( + 1000 * 1000)), columns=['E%03d' % i for i in range(60)]) for x in range(20): df['String%03d' % x] = 'string%03d' % x for x in range(20): df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0) - print("\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" + print ("\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time)) def f(chunksize): @@ -1359,9 +1357,9 @@ def f(chunksize): for c in [10000, 50000, 250000]: start_time = time.time() - print("big_table2 frame [chunk->%s]" % c) + print ("big_table2 frame [chunk->%s]" % c) rows = f(c) - print("big_table2 frame [rows->%s,chunk->%s] -> %5.2f" + print ("big_table2 frame [rows->%s,chunk->%s] -> %5.2f" % (rows, c, time.time() - start_time)) def test_big_put_frame(self): @@ -1370,23 +1368,23 @@ def test_big_put_frame(self): print ("\nbig_put start") import time start_time = time.time() - df = DataFrame(np.random.randn(1000 * 1000, 60), index=lrange(int( + df = DataFrame(np.random.randn(1000 * 1000, 60), index=range(int( 1000 * 1000)), columns=['E%03d' % i for i in range(60)]) for x in range(20): df['String%03d' % x] = 'string%03d' % x for x in range(20): df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0) - print("\nbig_put frame (creation of df) [rows->%s] -> %5.2f" + print ("\nbig_put frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time)) with ensure_clean(self.path, mode='w') as store: start_time = time.time() - store = HDFStore(fn, mode='w') + store = HDFStore(self.path, mode='w') store.put('df', df) - print(df.get_dtype_counts()) - print("big_put frame [shape->%s] -> %5.2f" + print (df.get_dtype_counts()) + print ("big_put frame [shape->%s] -> %5.2f" % (df.shape, time.time() - start_time)) def test_big_table_panel(self): @@ -1410,8 +1408,9 @@ def test_big_table_panel(self): store.append('wp', wp) rows = store.root.wp.table.nrows recons = store.select('wp') + assert isinstance(recons, Panel) - print("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x)) + print ("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x)) def test_append_diff_item_order(self): @@ -1654,7 +1653,6 @@ def test_table_values_dtypes_roundtrip(self): expected.sort() tm.assert_series_equal(result,expected) - def test_table_mixed_dtypes(self): # frame @@ -1713,7 +1711,7 @@ def test_unimplemented_dtypes_table_columns(self): # py3 ok for unicode if not compat.PY3: - l.append(('unicode', u('\u03c3'))) + l.append(('unicode', u('\\u03c3'))) ### currently not supported dtypes #### for n, f in l: @@ -1759,17 +1757,17 @@ def compare(a,b): assert_frame_equal(result,df) # select with tz aware - compare(store.select('df_tz',where=Term('A','>=',df.A[3])),df[df.A>=df.A[3]]) + compare(store.select('df_tz',where=Term('A>=df.A[3]')),df[df.A>=df.A[3]]) _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130103',tz='US/Eastern')),index=lrange(5)) + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = 
Timestamp('20130103',tz='US/Eastern')),index=range(5)) store.append('df_tz',df) result = store['df_tz'] compare(result,df) assert_frame_equal(result,df) _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=lrange(5)) + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=range(5)) self.assertRaises(TypeError, store.append, 'df_tz', df) # this is ok @@ -1780,7 +1778,7 @@ def compare(a,b): assert_frame_equal(result,df) # can't append with diff timezone - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=lrange(5)) + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=range(5)) self.assertRaises(ValueError, store.append, 'df_tz', df) # as index @@ -1866,16 +1864,16 @@ def test_append_with_timedelta(self): result = store.select('df',Term("C","<",-3*86400)) assert_frame_equal(result,df.iloc[3:]) - result = store.select('df',Term("C","<",'-3D')) + result = store.select('df',"C<'-3D'") assert_frame_equal(result,df.iloc[3:]) # a bit hacky here as we don't really deal with the NaT properly - result = store.select('df',Term("C","<",'-500000s')) + result = store.select('df',"C<'-500000s'") result = result.dropna(subset=['C']) assert_frame_equal(result,df.iloc[6:]) - result = store.select('df',Term("C","<",'-3.5D')) + result = store.select('df',"C<'-3.5D'") result = result.iloc[1:] assert_frame_equal(result,df.iloc[4:]) @@ -1927,14 +1925,14 @@ def test_remove_where(self): with ensure_clean(self.path) as store: # non-existance - crit1 = Term('index', '>', 'foo') + crit1 = Term('index>foo') self.assertRaises(KeyError, store.remove, 'a', [crit1]) # try to remove non-table (with crit) # non-table ok (where = None) wp = tm.makePanel() - store.put('wp', wp, format='t') - store.remove('wp', [('minor_axis', ['A', 'D'])]) + store.put('wp', wp, format='table') + store.remove('wp', ["minor_axis=['A', 'D']"]) rs = store.select('wp') expected = wp.reindex(minor_axis=['B', 'C']) assert_panel_equal(rs, expected) @@ -1966,8 +1964,8 @@ def test_remove_crit(self): # group row removal date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10]) - crit4 = Term('major_axis', date4) - store.put('wp3', wp, format='table') + crit4 = Term('major_axis=date4') + store.put('wp3', wp, format='t') n = store.remove('wp3', where=[crit4]) assert(n == 36) result = store.select('wp3') @@ -1978,8 +1976,8 @@ def test_remove_crit(self): store.put('wp', wp, format='table') date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = Term('major_axis', '>', date) - crit2 = Term('minor_axis', ['A', 'D']) + crit1 = Term('major_axis>date') + crit2 = Term("minor_axis=['A', 'D']") n = store.remove('wp', where=[crit1]) assert(n == 56) @@ -1995,14 +1993,14 @@ def test_remove_crit(self): store.put('wp2', wp, format='table') date1 = wp.major_axis[1:3] - crit1 = Term('major_axis', date1) + crit1 = Term('major_axis=date1') store.remove('wp2', where=[crit1]) result = store.select('wp2') expected = wp.reindex(major_axis=wp.major_axis - date1) assert_panel_equal(result, expected) date2 = wp.major_axis[5] - crit2 = Term('major_axis', date2) + crit2 = Term('major_axis=date2') store.remove('wp2', where=[crit2]) result = store['wp2'] expected = wp.reindex( @@ -2010,7 +2008,7 @@ def test_remove_crit(self): assert_panel_equal(result, expected) date3 = [wp.major_axis[7], wp.major_axis[9]] - crit3 = Term('major_axis', date3) + 
crit3 = Term('major_axis=date3') store.remove('wp2', where=[crit3]) result = store['wp2'] expected = wp.reindex( @@ -2020,62 +2018,102 @@ def test_remove_crit(self): # corners store.put('wp4', wp, format='table') n = store.remove( - 'wp4', where=[Term('major_axis', '>', wp.major_axis[-1])]) + 'wp4', where=[Term('major_axis>wp.major_axis[-1]')]) result = store.select('wp4') assert_panel_equal(result, wp) - def test_terms(self): + def test_invalid_terms(self): with ensure_clean(self.path) as store: + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.ix[0:4,'string'] = 'bar' wp = tm.makePanel() p4d = tm.makePanel4D() + store.put('df', df, format='table') store.put('wp', wp, format='table') store.put('p4d', p4d, format='table') # some invalid terms - terms = [ - ['minor', ['A', 'B']], - ['index', ['20121114']], - ['index', ['20121114', '20121114']], - ] - for t in terms: - self.assertRaises(Exception, store.select, 'wp', t) + self.assertRaises(ValueError, store.select, 'wp', "minor=['A', 'B']") + self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114']"]) + self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114', '20121114']"]) + self.assertRaises(TypeError, Term) - self.assertRaises(Exception, Term.__init__) - self.assertRaises(Exception, Term.__init__, 'blah') - self.assertRaises(Exception, Term.__init__, 'index') - self.assertRaises(Exception, Term.__init__, 'index', '==') - self.assertRaises(Exception, Term.__init__, 'index', '>', 5) + # more invalid + self.assertRaises(ValueError, store.select, 'df','df.index[3]') + self.assertRaises(SyntaxError, store.select, 'df','index>') + self.assertRaises(ValueError, store.select, 'wp', "major_axis<'20000108' & minor_axis['A', 'B']") + + # from the docs + with tm.ensure_clean(self.path) as path: + dfq = DataFrame(np.random.randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + dfq.to_hdf(path,'dfq',format='table',data_columns=True) + + # check ok + read_hdf(path,'dfq',where="index>Timestamp('20130104') & columns=['A', 'B']") + read_hdf(path,'dfq',where="A>0 or C>0") + + # catch the invalid reference + with tm.ensure_clean(self.path) as path: + dfq = DataFrame(np.random.randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + dfq.to_hdf(path,'dfq',format='table') + + self.assertRaises(ValueError, read_hdf, path,'dfq',where="A>0 or C>0") + + def test_terms(self): + + with ensure_clean(self.path) as store: + + wp = tm.makePanel() + p4d = tm.makePanel4D() + store.put('wp', wp, table=True) + store.put('p4d', p4d, table=True) # panel result = store.select('wp', [Term( - 'major_axis<20000108'), Term('minor_axis', '=', ['A', 'B'])]) + 'major_axis<"20000108"'), Term("minor_axis=['A', 'B']")]) expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) assert_panel_equal(result, expected) + # with deprecation + result = store.select('wp', [Term( + 'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")]) + expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) + tm.assert_panel_equal(result, expected) + # p4d - result = store.select('p4d', [Term('major_axis<20000108'), - Term('minor_axis', '=', ['A', 'B']), - Term('items', '=', ['ItemA', 'ItemB'])]) + result = store.select('p4d', [Term('major_axis<"20000108"'), + Term("minor_axis=['A', 'B']"), + Term("items=['ItemA', 'ItemB']")]) expected = p4d.truncate(after='20000108').reindex( minor=['A', 'B'], items=['ItemA', 'ItemB']) assert_panel4d_equal(result, expected) - # valid terms + # back compat invalid terms terms = 
[ dict(field='major_axis', op='>', value='20121114'), - ('major_axis', '20121114'), - ('major_axis', '>', '20121114'), - (('major_axis', ['20121114', '20121114']),), - ('major_axis', datetime.datetime(2012, 11, 14)), + [ dict(field='major_axis', op='>', value='20121114') ], + [ "minor_axis=['A','B']", dict(field='major_axis', op='>', value='20121114') ] + ] + for t in terms: + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + Term(t) + + # valid terms + terms = [ + ('major_axis=20121114'), + ('major_axis>20121114'), + (("major_axis=['20121114', '20121114']"),), + ('major_axis=datetime.datetime(2012, 11, 14)'), 'major_axis> 20121114', 'major_axis >20121114', 'major_axis > 20121114', - (('minor_axis', ['A', 'B']),), - (('minor_axis', ['A', 'B']),), - ((('minor_axis', ['A', 'B']),),), - (('items', ['ItemA', 'ItemB']),), + (("minor_axis=['A', 'B']"),), + (("minor_axis=['A', 'B']"),), + ((("minor_axis==['A', 'B']"),),), + (("items=['ItemA', 'ItemB']"),), ('items=ItemA'), ] @@ -2085,13 +2123,53 @@ def test_terms(self): # valid for p4d only terms = [ - (('labels', '=', ['l1', 'l2']),), - Term('labels', '=', ['l1', 'l2']), + (("labels=['l1', 'l2']"),), + Term("labels=['l1', 'l2']"), ] for t in terms: store.select('p4d', t) + def test_term_compat(self): + with ensure_clean(self.path) as store: + + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp',wp) + + result = store.select('wp', [Term('major_axis>20000102'), + Term('minor_axis', '=', ['A','B']) ]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']] + assert_panel_equal(result, expected) + + store.remove('wp', Term('major_axis>20000103')) + result = store.select('wp') + expected = wp.loc[:,wp.major_axis<=Timestamp('20000103'),:] + assert_panel_equal(result, expected) + + def test_same_name_scoping(self): + + with ensure_clean(self.path) as store: + + import pandas as pd + df = DataFrame(np.random.randn(20, 2),index=pd.date_range('20130101',periods=20)) + store.put('df', df, table=True) + expected = df[df.index>pd.Timestamp('20130105')] + + import datetime + result = store.select('df','index>datetime.datetime(2013,1,5)') + assert_frame_equal(result,expected) + + from datetime import datetime + + # technically an error, but allow it + result = store.select('df','index>datetime.datetime(2013,1,5)') + assert_frame_equal(result,expected) + + result = store.select('df','index>datetime(2013,1,5)') + assert_frame_equal(result,expected) + def test_series(self): s = tm.makeStringSeries() @@ -2211,7 +2289,7 @@ def test_index_types(self): self._check_roundtrip(ser, func) ser = Series(values, [datetime.datetime( - 2012, 1, 1), datetime.datetime(2012, 1, 2)]) + 2012, 1, 1), datetime.datetime(2012, 1, 2)]) self._check_roundtrip(ser, func) def test_timeseries_preepoch(self): @@ -2525,7 +2603,7 @@ def test_select(self): _maybe_remove(store, 'wp') store.append('wp', wp) items = ['Item%03d' % i for i in range(80)] - result = store.select('wp', Term('items', items)) + result = store.select('wp', Term('items=items')) expected = wp.reindex(items=items) assert_panel_equal(expected, result) @@ -2542,7 +2620,7 @@ def test_select(self): tm.assert_frame_equal(expected, result) # equivalentsly - result = store.select('df', [('columns', ['A', 'B'])]) + result = store.select('df', [("columns=['A', 'B']")]) expected = df.reindex(columns=['A', 'B']) tm.assert_frame_equal(expected, result) @@ -2575,7 +2653,8 @@ def 
test_select_dtypes(self): df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300))) _maybe_remove(store, 'df') store.append('df', df, data_columns=['ts', 'A']) - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01'))]) + + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) expected = df[df.ts >= Timestamp('2012-02-01')] tm.assert_frame_equal(expected, result) @@ -2602,7 +2681,7 @@ def test_select_dtypes(self): _maybe_remove(store, 'df_int') store.append('df_int', df) result = store.select( - 'df_int', [Term("index<10"), Term("columns", "=", ["A"])]) + 'df_int', [Term("index<10"), Term("columns=['A']")]) expected = df.reindex(index=list(df.index)[0:10],columns=['A']) tm.assert_frame_equal(expected, result) @@ -2612,7 +2691,7 @@ def test_select_dtypes(self): _maybe_remove(store, 'df_float') store.append('df_float', df) result = store.select( - 'df_float', [Term("index<10.0"), Term("columns", "=", ["A"])]) + 'df_float', [Term("index<10.0"), Term("columns=['A']")]) expected = df.reindex(index=list(df.index)[0:10],columns=['A']) tm.assert_frame_equal(expected, result) @@ -2622,36 +2701,36 @@ def test_select_with_many_inputs(self): df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300), - B=lrange(300), + B=range(300), users = ['a']*50 + ['b']*50 + ['c']*100 + ['a%03d' % i for i in range(100)])) _maybe_remove(store, 'df') store.append('df', df, data_columns=['ts', 'A', 'B', 'users']) # regular select - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01'))]) + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) expected = df[df.ts >= Timestamp('2012-02-01')] tm.assert_frame_equal(expected, result) # small selector - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01')),Term('users',['a','b','c'])]) + result = store.select('df', [Term("ts>=Timestamp('2012-02-01') & users=['a','b','c']")]) expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(['a','b','c']) ] tm.assert_frame_equal(expected, result) # big selector along the columns selector = [ 'a','b','c' ] + [ 'a%03d' % i for i in range(60) ] - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01')),Term('users',selector)]) + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')"),Term('users=selector')]) expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(selector) ] tm.assert_frame_equal(expected, result) - selector = lrange(100,200) - result = store.select('df', [Term('B', selector)]) + selector = range(100,200) + result = store.select('df', [Term('B=selector')]) expected = df[ df.B.isin(selector) ] tm.assert_frame_equal(expected, result) self.assert_(len(result) == 100) # big selector along the index selector = Index(df.ts[0:100].values) - result = store.select('df', [Term('ts', selector)]) + result = store.select('df', [Term('ts=selector')]) expected = df[ df.ts.isin(selector.values) ] tm.assert_frame_equal(expected, result) self.assert_(len(result) == 100) @@ -2807,15 +2886,15 @@ def test_panel_select(self): store.put('wp', wp, format='table') date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = ('major_axis', '>=', date) - crit2 = ('minor_axis', '=', ['A', 'D']) + crit1 = ('major_axis>=date') + crit2 = ("minor_axis=['A', 'D']") result = store.select('wp', [crit1, crit2]) expected = wp.truncate(before=date).reindex(minor=['A', 'D']) assert_panel_equal(result, expected) result = store.select( - 'wp', ['major_axis>=20000124', ('minor_axis', '=', ['A', 
'B'])]) + 'wp', ['major_axis>="20000124"', ("minor_axis=['A', 'B']")]) expected = wp.truncate(before='20000124').reindex(minor=['A', 'B']) assert_panel_equal(result, expected) @@ -2827,9 +2906,9 @@ def test_frame_select(self): store.put('frame', df,format='table') date = df.index[len(df) // 2] - crit1 = ('index', '>=', date) - crit2 = ('columns', ['A', 'D']) - crit3 = ('columns', 'A') + crit1 = Term('index>=date') + crit2 = ("columns=['A', 'D']") + crit3 = ('columns=A') result = store.select('frame', [crit1, crit2]) expected = df.ix[date:, ['A', 'D']] @@ -2850,6 +2929,67 @@ def test_frame_select(self): # self.assertRaises(ValueError, store.select, # 'frame', [crit1, crit2]) + def test_frame_select_complex(self): + """ select via complex criteria """ + + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.loc[df.index[0:4],'string'] = 'bar' + + with ensure_clean(self.path) as store: + store.put('df', df, table=True, data_columns=['string']) + + # empty + result = store.select('df', 'index>df.index[3] & string="bar"') + expected = df.loc[(df.index>df.index[3]) & (df.string=='bar')] + tm.assert_frame_equal(result, expected) + + result = store.select('df', 'index>df.index[3] & string="foo"') + expected = df.loc[(df.index>df.index[3]) & (df.string=='foo')] + tm.assert_frame_equal(result, expected) + + # or + result = store.select('df', 'index>df.index[3] | string="bar"') + expected = df.loc[(df.index>df.index[3]) | (df.string=='bar')] + tm.assert_frame_equal(result, expected) + + result = store.select('df', '(index>df.index[3] & index<=df.index[6]) | string="bar"') + expected = df.loc[((df.index>df.index[3]) & (df.index<=df.index[6])) | (df.string=='bar')] + tm.assert_frame_equal(result, expected) + + # invert + result = store.select('df', 'string!="bar"') + expected = df.loc[df.string!='bar'] + tm.assert_frame_equal(result, expected) + + # invert not implemented in numexpr :( + self.assertRaises(NotImplementedError, store.select, 'df', '~(string="bar")') + + # invert ok for filters + result = store.select('df', "~(columns=['A','B'])") + expected = df.loc[:,df.columns-['A','B']] + tm.assert_frame_equal(result, expected) + + # in + result = store.select('df', "index>df.index[3] & columns in ['A','B']") + expected = df.loc[df.index>df.index[3]].reindex(columns=['A','B']) + tm.assert_frame_equal(result, expected) + + def test_invalid_filtering(self): + + # can't use more than one filter (atm) + + df = tm.makeTimeDataFrame() + + with ensure_clean(self.path) as store: + store.put('df', df, table=True) + + # not implemented + self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A'] | columns=['B']") + + # in theory we could deal with this + self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A','B'] & columns=['C']") + def test_string_select(self): # GH 2973 @@ -2898,7 +3038,6 @@ def test_string_select(self): expected = df[df.int!=2] assert_frame_equal(result,expected) - def test_read_column(self): df = tm.makeTimeDataFrame() @@ -2917,7 +3056,7 @@ def f(): # valid result = store.select_column('df', 'index') tm.assert_almost_equal(result.values, Series(df.index).values) - tm.assert_isinstance(result,Series) + self.assert_(isinstance(result,Series)) # not a data indexable column self.assertRaises( @@ -3116,18 +3255,11 @@ def test_select_as_multiple(self): tm.assert_frame_equal(result, expected) # multiple (diff selector) - try: - result = store.select_as_multiple(['df1', 'df2'], where=[Term( - 'index', '>', df2.index[4])], selector='df2') - expected = concat([df1, 
df2], axis=1) - expected = expected[5:] - tm.assert_frame_equal(result, expected) - except (Exception) as detail: - print("error in select_as_multiple %s" % str(detail)) - print("store: %s" % store) - print("df1: %s" % df1) - print("df2: %s" % df2) - + result = store.select_as_multiple(['df1', 'df2'], where=[Term( + 'index>df2.index[4]')], selector='df2') + expected = concat([df1, df2], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) # test excpection for diff rows store.append('df3', tm.makeTimeDataFrame(nper=50)) @@ -3142,15 +3274,15 @@ def test_start_stop(self): store.append('df', df) result = store.select( - 'df', [Term("columns", "=", ["A"])], start=0, stop=5) + 'df', [Term("columns=['A']")], start=0, stop=5) expected = df.ix[0:4, ['A']] tm.assert_frame_equal(result, expected) # out of range result = store.select( - 'df', [Term("columns", "=", ["A"])], start=30, stop=40) + 'df', [Term("columns=['A']")], start=30, stop=40) assert(len(result) == 0) - tm.assert_isinstance(result, DataFrame) + assert(type(result) == DataFrame) def test_select_filter_corner(self): @@ -3161,7 +3293,7 @@ def test_select_filter_corner(self): with ensure_clean(self.path) as store: store.put('frame', df, format='table') - crit = Term('columns', df.columns[:75]) + crit = Term('columns=df.columns[:75]') result = store.select('frame', [crit]) tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) @@ -3190,7 +3322,6 @@ def _check_double_roundtrip(self, obj, comparator, compression=False, again = store['obj'] comparator(again, obj, **kwargs) - def _check_roundtrip_table(self, obj, comparator, compression=False): options = {} if compression: @@ -3296,6 +3427,7 @@ def test_pytables_native_read(self): try: store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native.h5'), 'r') d2 = store['detector/readout'] + assert isinstance(d2, DataFrame) finally: safe_close(store) @@ -3303,6 +3435,7 @@ def test_pytables_native_read(self): store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native2.h5'), 'r') str(store) d1 = store['detector'] + assert isinstance(d1, DataFrame) finally: safe_close(store) @@ -3330,11 +3463,12 @@ def test_legacy_table_read(self): # old version warning with tm.assert_produces_warning(expected_warning=IncompatibilityWarning): self.assertRaises( - Exception, store.select, 'wp1', Term('minor_axis', '=', 'B')) + Exception, store.select, 'wp1', Term('minor_axis=B')) - with tm.assert_produces_warning(expected_warning=IncompatibilityWarning): df2 = store.select('df2') - store.select('df2', Term('index', '>', df2.index[2])) + result = store.select('df2', Term('index>df2.index[2]')) + expected = df2[df2.index > df2.index[2]] + assert_frame_equal(expected, result) finally: safe_close(store) @@ -3352,11 +3486,18 @@ def test_legacy_0_10_read(self): def test_legacy_0_11_read(self): # legacy from 0.11 try: - store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_0.11.h5'), 'r') + path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5') + store = HDFStore(tm.get_data_path(path), 'r') str(store) + assert 'df' in store + assert 'df1' in store + assert 'mi' in store df = store.select('df') df1 = store.select('df1') mi = store.select('mi') + assert isinstance(df, DataFrame) + assert isinstance(df1, DataFrame) + assert isinstance(mi, DataFrame) finally: safe_close(store) @@ -3364,10 +3505,9 @@ def test_copy(self): def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): try: - import os - if f is None: - f = tm.get_data_path('legacy_hdf/legacy_0.10.h5') + f = 
tm.get_data_path(os.path.join('legacy_hdf', + 'legacy_0.10.h5')) store = HDFStore(f, 'r') @@ -3380,7 +3520,7 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): # check keys if keys is None: - keys = list(store.keys()) + keys = store.keys() self.assert_(set(keys) == set(tstore.keys())) # check indicies & nrows @@ -3437,6 +3577,7 @@ def test_legacy_table_write(self): df = DataFrame(dict(A = 'foo', B = 'bar'),index=lrange(10)) store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 }) + store.append('wp', wp) store.close() @@ -3524,6 +3665,7 @@ def _test_sort(obj): else: raise ValueError('type not supported here') + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index e2051eba7f42a..8c5764a3f59a6 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -4,15 +4,13 @@ import nose from nose.tools import assert_equal -import unittest import numpy as np from pandas.tslib import iNaT from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp -import pandas.compat as compat +from pandas import compat from pandas.compat import range, long, lrange, lmap, u from pandas.core.common import notnull, isnull -import pandas.compat as compat import pandas.core.common as com import pandas.util.testing as tm import pandas.core.config as cf @@ -42,6 +40,7 @@ def __getitem__(self): assert(not is_seq(A())) + def test_notnull(): assert notnull(1.) assert not notnull(None) @@ -121,11 +120,13 @@ def test_isnull_datetime(): assert(mask[0]) assert(not mask[1:].any()) + def test_datetimeindex_from_empty_datetime64_array(): for unit in [ 'ms', 'us', 'ns' ]: idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) assert(len(idx) == 0) + def test_nan_to_nat_conversions(): df = DataFrame(dict({ @@ -144,6 +145,7 @@ def test_nan_to_nat_conversions(): if LooseVersion(np.__version__) >= '1.7.0': assert(s[8].value == np.datetime64('NaT').astype(np.int64)) + def test_any_none(): assert(com._any_none(1, 2, 3, None)) assert(not com._any_none(1, 2, 3, 4)) @@ -308,6 +310,7 @@ def test_ensure_int32(): result = com._ensure_int32(values) assert(result.dtype == np.int32) + def test_ensure_platform_int(): # verify that when we create certain types of indices diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index ff76c7c070946..f81620b897a4a 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -4,31 +4,25 @@ import unittest import nose -import operator -from numpy import random, nan from numpy.random import randn + +import operator import numpy as np from numpy.testing import assert_array_equal -import pandas as pan -from pandas.core.api import DataFrame, Series, notnull, isnull -from pandas.core import expressions as expr +from pandas.core.api import DataFrame +from pandas.computation import expressions as expr -from pandas.util.testing import (assert_almost_equal, - assert_series_equal, - assert_frame_equal) +from pandas.util.testing import assert_series_equal, assert_frame_equal from pandas import compat -import pandas.util.testing as tm -import pandas.lib as lib - -from numpy.testing.decorators import slow if not expr._USE_NUMEXPR: - raise nose.SkipTest + raise nose.SkipTest("numexpr not available") + -_frame = DataFrame(np.random.randn(10000, 4), columns = list('ABCD'), dtype='float64') -_frame2 = DataFrame(np.random.randn(100, 4), columns = 
list('ABCD'), dtype='float64') +_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64') +_frame2 = DataFrame(randn(100, 4), columns = list('ABCD'), dtype='float64') _mixed = DataFrame({ 'A' : _frame['A'].copy(), 'B' : _frame['B'].astype('float32'), 'C' : _frame['C'].astype('int64'), 'D' : _frame['D'].astype('int32') }) _mixed2 = DataFrame({ 'A' : _frame2['A'].copy(), 'B' : _frame2['B'].astype('float32'), 'C' : _frame2['C'].astype('int64'), 'D' : _frame2['D'].astype('int32') }) _integer = DataFrame(np.random.randint(1, 100, size=(10001, 4)), columns = list('ABCD'), dtype='int64') @@ -128,11 +122,11 @@ def testit(): result = expr.evaluate(op, op_str, f, f, use_numexpr=True) expected = expr.evaluate(op, op_str, f, f, use_numexpr=False) assert_array_equal(result,expected.values) - + result = expr._can_use_numexpr(op, op_str, f2, f2, 'evaluate') self.assert_(result == False) - + expr.set_use_numexpr(False) testit() expr.set_use_numexpr(True) @@ -149,7 +143,7 @@ def testit(): f11 = f f12 = f + 1 - + f21 = f2 f22 = f2 + 1 @@ -163,7 +157,7 @@ def testit(): result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True) expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False) assert_array_equal(result,expected.values) - + result = expr._can_use_numexpr(op, op_str, f21, f22, 'evaluate') self.assert_(result == False) @@ -180,7 +174,7 @@ def test_where(self): def testit(): for f in [ self.frame, self.frame2, self.mixed, self.mixed2 ]: - + for cond in [ True, False ]: c = np.empty(f.shape,dtype=np.bool_) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a5c1941a7f2d3..423707e0016d8 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11,6 +11,8 @@ import nose import functools import itertools +from itertools import product + from pandas.compat import( map, zip, range, long, lrange, lmap, lzip, OrderedDict, cPickle as pickle, u, StringIO @@ -18,7 +20,7 @@ from pandas import compat from numpy import random, nan -from numpy.random import randn +from numpy.random import randn, rand import numpy as np import numpy.ma as ma from numpy.testing import assert_array_equal @@ -30,7 +32,7 @@ import pandas.core.format as fmt import pandas.core.datetools as datetools from pandas.core.api import (DataFrame, Index, Series, notnull, isnull, - MultiIndex, DatetimeIndex, Timestamp, Period) + MultiIndex, DatetimeIndex, Timestamp) from pandas import date_range import pandas as pd from pandas.io.parsers import read_csv @@ -40,10 +42,14 @@ assert_series_equal, assert_frame_equal, assertRaisesRegexp, + assertRaises, makeCustomDataframe as mkdf, ensure_clean) from pandas.core.indexing import IndexingError from pandas.core.common import PandasError +from pandas.compat import OrderedDict +from pandas.computation.expr import Expr +import pandas.computation as comp import pandas.util.testing as tm import pandas.lib as lib @@ -81,6 +87,7 @@ def _check_mixed_float(df, dtype = None): if dtypes.get('D'): assert(df.dtypes['D'] == dtypes['D']) + def _check_mixed_int(df, dtype = None): dtypes = dict(A = 'int32', B = 'uint64', C = 'uint8', D = 'int64') if isinstance(dtype, compat.string_types): @@ -97,8 +104,6 @@ def _check_mixed_int(df, dtype = None): assert(df.dtypes['D'] == dtypes['D']) - - class CheckIndexing(object): _multiprocess_can_split_ = True @@ -122,6 +127,14 @@ def test_getitem(self): with assertRaisesRegexp(KeyError, 'no item named random'): self.frame['random'] + df = self.frame.copy() + df['$10'] = randn(len(df)) + ad = randn(len(df)) + 
df['@awesome_domain'] = ad + self.assertRaises(KeyError, df.__getitem__, 'df["$10"]') + res = df['@awesome_domain'] + assert_array_equal(ad, res.values) + def test_getitem_dupe_cols(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b']) try: @@ -2119,7 +2132,6 @@ def test_constructor_cast_failure(self): # this is ok df['foo2'] = np.ones((4,2)).tolist() - def test_constructor_dtype_nocast_view(self): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) @@ -3166,7 +3178,6 @@ def test_constructor_single_value(self): with tm.assertRaisesRegexp(TypeError, 'incompatible data and dtype'): DataFrame('a', [1, 2], ['a', 'c'], float) - def test_constructor_with_datetimes(self): intname = np.dtype(np.int_).name floatname = np.dtype(np.float_).name @@ -5238,8 +5249,6 @@ def make_dtnat_arr(n,nnat=None): _do_test(mkdf(nrows, ncols,r_idx_nlevels=2,c_idx_nlevels=2), path,rnlvl=2,cnlvl=2) - - def test_to_csv_from_csv_w_some_infs(self): # test roundtrip with inf, -inf, nan, as full columns and mix @@ -8100,6 +8109,7 @@ def test_mask_edge_case_1xN_frame(self): #---------------------------------------------------------------------- # Transposing + def test_transpose(self): frame = self.frame dft = frame.T @@ -8228,7 +8238,6 @@ def test_diff(self): assert_series_equal(the_diff['A'], tf['A'] - tf['A'].shift(1)) - def test_diff_mixed_dtype(self): df = DataFrame(np.random.randn(5, 3)) df['A'] = np.array([1, 2, 3, 4, 5], dtype=object) @@ -10137,7 +10146,6 @@ def test_unstack_dtypes(self): expected = Series({'float64' : 2, 'object' : 2}) assert_series_equal(result, expected) - def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) @@ -11106,10 +11114,632 @@ def test_isin_with_string_scalar(self): with tm.assertRaises(TypeError): df.isin('aaa') + +def skip_if_no_ne(engine='numexpr'): + if engine == 'numexpr': + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest("cannot query engine numexpr when numexpr not " + "installed") + + +def skip_if_no_pandas_parser(parser): + if parser != 'pandas': + raise nose.SkipTest("cannot evaluate with parser {0!r}".format(parser)) + + +class TestDataFrameQueryWithMultiIndex(object): + def check_query_with_named_multiindex(self, parser, engine): + skip_if_no_ne(engine) + a = tm.choice(['red', 'green'], size=10) + b = tm.choice(['eggs', 'ham'], size=10) + index = MultiIndex.from_arrays([a, b], names=['color', 'food']) + df = DataFrame(randn(10, 2), index=index) + ind = Series(df.index.get_level_values('color').values, index=index, + name='color') + + # equality + res1 = df.query('color == "red"', parser=parser, engine=engine) + res2 = df.query('"red" == color', parser=parser, engine=engine) + exp = df[ind == 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('color != "red"', parser=parser, engine=engine) + res2 = df.query('"red" != color', parser=parser, engine=engine) + exp = df[ind != 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('color == ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] == color', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('color != ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] != color', parser=parser, engine=engine) + exp =
df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["red"] in color', parser=parser, engine=engine) + res2 = df.query('"red" in color', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('["red"] not in color', parser=parser, engine=engine) + res2 = df.query('"red" not in color', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + def test_query_with_named_multiindex(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_with_named_multiindex, parser, engine + + def check_query_with_unnamed_multiindex(self, parser, engine): + skip_if_no_ne(engine) + a = tm.choice(['red', 'green'], size=10) + b = tm.choice(['eggs', 'ham'], size=10) + index = MultiIndex.from_arrays([a, b]) + df = DataFrame(randn(10, 2), index=index) + ind = Series(df.index.get_level_values(0).values, index=index) + + res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine) + res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine) + exp = df[ind == 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine) + res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine) + exp = df[ind != 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine) + res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine) + res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + #### LEVEL 1 #### + ind = Series(df.index.get_level_values(1).values, index=index) + res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine) + res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine) + exp = df[ind == 'eggs'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine) + res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine) + exp = df[ind != 'eggs'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine) + res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine) + exp = df[ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine) + res2 = df.query('["eggs"] != ilevel_1', parser=parser, 
engine=engine) + exp = df[~ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine) + res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine) + exp = df[ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine) + res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine) + exp = df[~ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + def test_query_with_unnamed_multiindex(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_with_unnamed_multiindex, parser, engine + + def check_query_with_partially_named_multiindex(self, parser, engine): + skip_if_no_ne(engine) + a = tm.choice(['red', 'green'], size=10) + b = np.arange(10) + index = MultiIndex.from_arrays([a, b]) + index.names = [None, 'rating'] + df = DataFrame(randn(10, 2), index=index) + res = df.query('rating == 1', parser=parser, engine=engine) + ind = Series(df.index.get_level_values('rating').values, index=index, + name='rating') + exp = df[ind == 1] + assert_frame_equal(res, exp) + + res = df.query('rating != 1', parser=parser, engine=engine) + ind = Series(df.index.get_level_values('rating').values, index=index, + name='rating') + exp = df[ind != 1] + assert_frame_equal(res, exp) + + res = df.query('ilevel_0 == "red"', parser=parser, engine=engine) + ind = Series(df.index.get_level_values(0).values, index=index) + exp = df[ind == "red"] + assert_frame_equal(res, exp) + + res = df.query('ilevel_0 != "red"', parser=parser, engine=engine) + ind = Series(df.index.get_level_values(0).values, index=index) + exp = df[ind != "red"] + assert_frame_equal(res, exp) + + def test_query_with_partially_named_multiindex(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_with_partially_named_multiindex, parser, engine + + +class TestDataFrameQueryNumExprPandas(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.engine = 'numexpr' + cls.parser = 'pandas' + skip_if_no_ne() + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + + def test_date_query_method(self): + engine, parser = self.engine, self.parser + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', periods=5) + res = df.query('dates1 < 20130101 < dates3', engine=engine, + parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_query_scope(self): + engine, parser = self.engine, self.parser + from pandas.computation.common import NameResolutionError + + df = DataFrame({"i": lrange(10), "+": lrange(3, 13), + "r": lrange(4, 14)}) + i, s = 5, 6 + self.assertRaises(NameResolutionError, df.query, 'i < 5', + engine=engine, parser=parser, local_dict={'i': i}) + self.assertRaises(SyntaxError, df.query, 'i - +', engine=engine, + parser=parser) + self.assertRaises(NameResolutionError, df.query, 'i == s', + engine=engine, parser=parser, local_dict={'i': i, + 's': s}) + + def test_query_scope_index(self): + engine, parser = self.engine, self.parser + from pandas.computation.common import NameResolutionError + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), + columns=['a', 'b', 'c']) + from numpy 
import sin + df.index.name = 'sin' + self.assertRaises(NameResolutionError, df.query, 'sin > 5', + engine=engine, parser=parser, local_dict={'sin': + sin}) + + def test_query(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) + + assert_frame_equal(df.query('a < b', engine=engine, parser=parser), + df[df.a < df.b]) + assert_frame_equal(df.query('a + b > b * c', engine=engine, + parser=parser), + df[df.a + df.b > df.b * df.c]) + + local_dict = dict(df.iteritems()) + local_dict.update({'df': df}) + self.assertRaises(NameError, df.query, 'a < d & b < f', + local_dict=local_dict, engine=engine, parser=parser) + + # make sure that it's not just because we didn't pass the locals in + self.assertRaises(AssertionError, self.assertRaises, NameError, + df.query, 'a < b', local_dict={'df': df}, + engine=engine, parser=parser) + + def test_query_index_with_name(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), + columns=['a', 'b', 'c']) + res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser) + expec = df[(df.index < 5) & (df.a < df.b)] + assert_frame_equal(res, expec) + + res = df.query('blob < b', engine=engine, parser=parser) + expec = df[df.index < df.b] + + assert_frame_equal(res, expec) + + def test_query_index_without_name(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=range(10), columns=['a', 'b', 'c']) + + # "index" should refer to the index + res = df.query('index < b', engine=engine, parser=parser) + expec = df[df.index < df.b] + assert_frame_equal(res, expec) + + # test against a scalar + res = df.query('index < 5', engine=engine, parser=parser) + expec = df[df.index < 5] + assert_frame_equal(res, expec) + + def test_nested_scope(self): + engine = self.engine + parser = self.parser + # smoke test + x = 1 + result = pd.eval('x + 1', engine=engine, parser=parser) + self.assertEqual(result, 2) + + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) + expected = df[(df>0) & (df2>0)] + + result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) + + result = pd.eval('df[(df > 0) and (df2 > 0)]', engine=engine, + parser=parser) + assert_frame_equal(result, expected) + + result = pd.eval('df[(df > 0) and (df2 > 0) and df[df > 0] > 0]', + engine=engine, parser=parser) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] + assert_frame_equal(result, expected) + + result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) + expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) + + def test_local_syntax(self): + skip_if_no_pandas_parser(self.parser) + + from pandas.computation.common import NameResolutionError + + engine, parser = self.engine, self.parser + df = DataFrame(randn(100, 10), columns=list('abcdefghij')) + b = 1 + expect = df[df.a < b] + result = df.query('a < @b', engine=engine, parser=parser) + assert_frame_equal(result, expect) + + # scope issue with self.assertRaises so just catch it and let it pass + try: + df.query('a < @b', engine=engine, parser=parser) + except NameResolutionError: + pass + + del b + expect = df[df.a < df.b] + result = df.query('a < b', engine=engine, parser=parser) + assert_frame_equal(result, expect) + + def test_chained_cmp_and_in(self): + skip_if_no_pandas_parser(self.parser) + 
engine, parser = self.engine, self.parser + cols = list('abc') + df = DataFrame(randn(100, len(cols)), columns=cols) + res = df.query('a < b < c and a not in b not in c', engine=engine, + parser=parser) + ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) + expec = df[ind] + assert_frame_equal(res, expec) + + +class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine = 'numexpr' + cls.parser = 'python' + skip_if_no_ne(cls.engine) + cls.frame = _frame.copy() + + @classmethod + def tearDownClass(cls): + del cls.frame, cls.engine, cls.parser + + def test_date_query_method(self): + engine, parser = self.engine, self.parser + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', periods=5) + res = df.query('(df.dates1 < 20130101) & (20130101 < df.dates3)', + engine=engine, parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_nested_scope(self): + engine = self.engine + parser = self.parser + # smoke test + x = 1 + result = pd.eval('x + 1', engine=engine, parser=parser) + self.assertEqual(result, 2) + + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) + expected = df[(df>0) & (df2>0)] + + result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) + + result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, + parser=parser) + assert_frame_equal(result, expected) + + result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', + engine=engine, parser=parser) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] + assert_frame_equal(result, expected) + + result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) + expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) + + +class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine = 'python' + cls.parser = 'pandas' + cls.frame = _frame.copy() + + @classmethod + def tearDownClass(cls): + del cls.frame, cls.engine, cls.parser + + +class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): + @classmethod + def setUpClass(cls): + cls.engine = cls.parser = 'python' + cls.frame = _frame.copy() + + @classmethod + def tearDownClass(cls): + del cls.frame, cls.engine, cls.parser + + +PARSERS = 'python', 'pandas' +ENGINES = 'python', 'numexpr' + + +class TestDataFrameQueryStrings(object): + def check_str_query_method(self, parser, engine): + skip_if_no_ne(engine) + df = DataFrame(randn(10, 1), columns=['b']) + df['strings'] = Series(list('aabbccddee')) + expect = df[df.strings == 'a'] + + if parser != 'pandas': + col = 'strings' + lst = '"a"' + + lhs = [col] * 2 + [lst] * 2 + rhs = lhs[::-1] + + eq, ne = '==', '!=' + ops = 2 * ([eq] + [ne]) + + for lhs, op, rhs in zip(lhs, ops, rhs): + ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) + assertRaises(NotImplementedError, df.query, ex, engine=engine, + parser=parser, local_dict={'strings': df.strings}) + else: + res = df.query('"a" == strings', engine=engine, parser=parser) + assert_frame_equal(res, expect) + + res = df.query('strings == "a"', engine=engine, parser=parser) + assert_frame_equal(res, expect) + assert_frame_equal(res, df[df.strings.isin(['a'])]) + + expect = df[df.strings != 'a'] + res = 
df.query('strings != "a"', engine=engine, parser=parser) + assert_frame_equal(res, expect) + + res = df.query('"a" != strings', engine=engine, parser=parser) + assert_frame_equal(res, expect) + assert_frame_equal(res, df[~df.strings.isin(['a'])]) + + def test_str_query_method(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_str_query_method, parser, engine + + def test_str_list_query_method(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_str_list_query_method, parser, engine + + def check_str_list_query_method(self, parser, engine): + skip_if_no_ne(engine) + df = DataFrame(randn(10, 1), columns=['b']) + df['strings'] = Series(list('aabbccddee')) + expect = df[df.strings.isin(['a', 'b'])] + + if parser != 'pandas': + col = 'strings' + lst = '["a", "b"]' + + lhs = [col] * 2 + [lst] * 2 + rhs = lhs[::-1] + + eq, ne = '==', '!=' + ops = 2 * ([eq] + [ne]) + + for lhs, op, rhs in zip(lhs, ops, rhs): + ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) + assertRaises(NotImplementedError, df.query, ex, engine=engine, + parser=parser, local_dict={'strings': df.strings}) + else: + res = df.query('strings == ["a", "b"]', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + res = df.query('["a", "b"] == strings', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + expect = df[~df.strings.isin(['a', 'b'])] + + res = df.query('strings != ["a", "b"]', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + res = df.query('["a", "b"] != strings', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + def check_query_with_string_columns(self, parser, engine): + skip_if_no_ne(engine) + df = DataFrame({'a': list('aaaabbbbcccc'), + 'b': list('aabbccddeeff'), + 'c': np.random.randint(5, size=12), + 'd': np.random.randint(9, size=12)}) + if parser == 'pandas': + res = df.query('a in b', parser=parser, engine=engine) + expec = df[df.a.isin(df.b)] + assert_frame_equal(res, expec) + + res = df.query('a in b and c < d', parser=parser, engine=engine) + expec = df[df.a.isin(df.b) & (df.c < df.d)] + assert_frame_equal(res, expec) + else: + with assertRaises(NotImplementedError): + df.query('a in b', parser=parser, engine=engine) + + with assertRaises(NotImplementedError): + df.query('a in b and c < d', parser=parser, engine=engine) + + def test_query_with_string_columns(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_query_with_string_columns, parser, engine + + def check_object_array_eq_ne(self, parser, engine): + skip_if_no_ne(engine) + df = DataFrame({'a': list('aaaabbbbcccc'), + 'b': list('aabbccddeeff'), + 'c': np.random.randint(5, size=12), + 'd': np.random.randint(9, size=12)}) + res = df.query('a == b', parser=parser, engine=engine) + exp = df[df.a == df.b] + assert_frame_equal(res, exp) + + res = df.query('a != b', parser=parser, engine=engine) + exp = df[df.a != df.b] + assert_frame_equal(res, exp) + + def test_object_array_eq_ne(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_object_array_eq_ne, parser, engine + + +class TestDataFrameEvalNumExprPandas(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.engine = 'numexpr' + cls.parser = 'pandas' + skip_if_no_ne() + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + + def setUp(self): + self.frame = DataFrame(randn(10, 3), columns=list('abc')) + + def tearDown(self): + del self.frame + + def test_simple_expr(self): + res = self.frame.eval('a + 
b', engine=self.engine, parser=self.parser) + expect = self.frame.a + self.frame.b + assert_series_equal(res, expect) + + def test_bool_arith_expr(self): + res = self.frame.eval('a[a < 1] + b', engine=self.engine, + parser=self.parser) + expect = self.frame.a[self.frame.a < 1] + self.frame.b + assert_series_equal(res, expect) + + +class TestDataFrameEvalNumExprPython(TestDataFrameEvalNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine = 'numexpr' + cls.parser = 'python' + skip_if_no_ne() + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + + +class TestDataFrameEvalPythonPandas(TestDataFrameEvalNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine = 'python' + cls.parser = 'pandas' + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + + +class TestDataFrameEvalPythonPython(TestDataFrameEvalNumExprPython): + @classmethod + def setUpClass(cls): + cls.engine = cls.parser = 'python' + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + + if __name__ == '__main__': - # unittest.main() - import nose - # nose.runmodule(argv=[__file__,'-vvs','-x', '--ipdb-failure'], - # exit=False) nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 1572ca481d8a4..8646d261306ca 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -926,7 +926,8 @@ def join(self, other, how='left', level=None, return_indexers=False): See Index.join """ if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type != 'mixed-integer'): + other.inferred_type not in ('floating', 'mixed-integer', + 'mixed-integer-float', 'mixed')): try: other = DatetimeIndex(other) except TypeError: diff --git a/pandas/util/testing.py b/pandas/util/testing.py index abc13fb2ad9ee..0718dc8926011 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -15,7 +15,7 @@ from contextlib import contextmanager from distutils.version import LooseVersion -from numpy.random import randn +from numpy.random import randn, rand import numpy as np from pandas.core.common import isnull, _is_sequence @@ -27,14 +27,14 @@ import pandas.compat as compat from pandas.compat import( map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter, - raise_with_traceback + raise_with_traceback, httplib ) from pandas import bdate_range from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex -from pandas.io.common import urlopen, HTTPException +from pandas.io.common import urlopen Index = index.Index MultiIndex = index.MultiIndex @@ -48,6 +48,10 @@ _RAISE_NETWORK_ERROR_DEFAULT = False +def randbool(size=(), p=0.5): + return rand(*size) <= p + + def rands(n): choices = string.ascii_letters + string.digits return ''.join(random.choice(choices) for _ in range(n)) @@ -58,10 +62,17 @@ def randu(n): choices += string.digits return ''.join([random.choice(choices) for _ in range(n)]) + +def choice(x, size=10): + """sample with replacement; uniform over the input""" + try: + return np.random.choice(x, size=size) + except AttributeError: + return np.random.randint(len(x), size=size).choose(x) + #------------------------------------------------------------------------------ # Console debugging tools - def debug(f, *args, **kwargs): from pdb import Pdb as OldPdb try: @@ -752,7 +763,7 @@ def dec(f): return wrapper -_network_error_classes = IOError, HTTPException +_network_error_classes = IOError, httplib.HTTPException 
@optional_args @@ -796,13 +807,13 @@ def network(t, raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, >>> import nose >>> @network ... def test_network(): - ... with urlopen("rabbit://bonanza.com") as f: - ... pass + ... with urlopen("rabbit://bonanza.com") as f: + ... pass ... >>> try: - ... test_network() + ... test_network() ... except nose.SkipTest: - ... print "SKIPPING!" + ... print("SKIPPING!") ... SKIPPING! @@ -811,8 +822,8 @@ def network(t, raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, >>> @network(raise_on_error=True) ... def test_network(): - ... with urlopen("complaint://deadparrot.com") as f: - ... pass + ... with urlopen("complaint://deadparrot.com") as f: + ... pass ... >>> test_network() Traceback (most recent call last): diff --git a/setup.py b/setup.py index b7df339daf75a..ffd6089bdc88d 100755 --- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ except ImportError: cython = False -from os.path import splitext, basename, join as pjoin +from os.path import join as pjoin class build_ext(_build_ext): @@ -506,6 +506,8 @@ def pxd(name): maintainer=AUTHOR, packages=['pandas', 'pandas.compat', + 'pandas.computation', + 'pandas.computation.tests', 'pandas.core', 'pandas.io', 'pandas.rpy', diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py index 54774344520c9..3f076f9f922a3 100644 --- a/vb_suite/binary_ops.py +++ b/vb_suite/binary_ops.py @@ -21,7 +21,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -32,7 +32,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) @@ -53,7 +53,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -63,7 +63,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) @@ -84,7 +84,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -94,7 +94,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) diff --git a/vb_suite/eval.py b/vb_suite/eval.py new file mode 100644 index 0000000000000..c666cd431cbb4 --- /dev/null +++ b/vb_suite/eval.py @@ -0,0 +1,114 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +import pandas as pd +df = DataFrame(np.random.randn(20000, 100)) +df2 = DataFrame(np.random.randn(20000, 100)) +df3 = DataFrame(np.random.randn(20000, 100)) +df4 = DataFrame(np.random.randn(20000, 100)) +""" 
+ +setup = common_setup + """ +import pandas.computation.expressions as expr +expr.set_numexpr_threads(1) +""" + +SECTION = 'Eval' + +#---------------------------------------------------------------------- +# binary ops + +#---------------------------------------------------------------------- +# add +eval_frame_add_all_threads = \ + Benchmark("pd.eval('df + df2 + df3 + df4')", common_setup, + name='eval_frame_add_all_threads', + start_date=datetime(2013, 7, 21)) + + + +eval_frame_add_one_thread = \ + Benchmark("pd.eval('df + df2 + df3 + df4')", setup, + name='eval_frame_add_one_thread', + start_date=datetime(2013, 7, 26)) + +eval_frame_add_python = \ + Benchmark("pd.eval('df + df2 + df3 + df4', engine='python')", common_setup, + name='eval_frame_add_python', start_date=datetime(2013, 7, 21)) + +eval_frame_add_python_one_thread = \ + Benchmark("pd.eval('df + df2 + df3 + df4', engine='python')", setup, + name='eval_frame_add_python_one_thread', + start_date=datetime(2013, 7, 26)) +#---------------------------------------------------------------------- +# mult + +eval_frame_mult_all_threads = \ + Benchmark("pd.eval('df * df2 * df3 * df4')", common_setup, + name='eval_frame_mult_all_threads', + start_date=datetime(2012, 7, 21)) + +eval_frame_mult_one_thread = \ + Benchmark("pd.eval('df * df2 * df3 * df4')", setup, + name='eval_frame_mult_one_thread', + start_date=datetime(2012, 7, 26)) + +eval_frame_mult_python = \ + Benchmark("pd.eval('df * df2 * df3 * df4', engine='python')", + common_setup, + name='eval_frame_mult_python', start_date=datetime(2013, 7, 21)) + +eval_frame_mult_python_one_thread = \ + Benchmark("pd.eval('df * df2 * df3 * df4', engine='python')", setup, + name='eval_frame_mult_python_one_thread', + start_date=datetime(2012, 7, 26)) + +#---------------------------------------------------------------------- +# multi and + +eval_frame_and_all_threads = \ + Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')", + common_setup, + name='eval_frame_and_all_threads', + start_date=datetime(2012, 7, 21)) + +eval_frame_and_one_thread = \ + Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')", setup, + name='eval_frame_and_one_thread', + start_date=datetime(2012, 7, 26)) + +eval_frame_and_python = \ + Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')", + common_setup, name='eval_frame_and_python', + start_date=datetime(2013, 7, 21)) + +eval_frame_and_python_one_thread = \ + Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')", + setup, + name='eval_frame_and_python_one_thread', + start_date=datetime(2012, 7, 26)) + +#-------------------------------------------------------------------- +# chained comp +eval_frame_chained_cmp_all_threads = \ + Benchmark("pd.eval('df < df2 < df3 < df4')", common_setup, + name='eval_frame_chained_cmp_all_threads', + start_date=datetime(2012, 7, 21)) + +eval_frame_chained_cmp_one_thread = \ + Benchmark("pd.eval('df < df2 < df3 < df4')", setup, + name='eval_frame_chained_cmp_one_thread', + start_date=datetime(2012, 7, 26)) + +eval_frame_chained_cmp_python = \ + Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')", + common_setup, name='eval_frame_chained_cmp_python', + start_date=datetime(2013, 7, 26)) + +eval_frame_chained_cmp_python_one_thread = \ + Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')", setup, + name='eval_frame_chained_cmp_python_one_thread', + start_date=datetime(2012, 7, 26)) diff --git
a/vb_suite/indexing.py b/vb_suite/indexing.py index 1264ae053ffca..beefec256ed81 100644 --- a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -106,7 +106,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(50000, 100)) df2 = DataFrame(np.random.randn(50000, 100)) expr.set_numexpr_threads(1) @@ -118,7 +118,7 @@ setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(50000, 100)) df2 = DataFrame(np.random.randn(50000, 100)) expr.set_use_numexpr(False) diff --git a/vb_suite/suite.py b/vb_suite/suite.py index ca83855c2a109..f3c8dfe3032e0 100644 --- a/vb_suite/suite.py +++ b/vb_suite/suite.py @@ -23,7 +23,8 @@ 'sparse', 'reshape', 'stat_ops', - 'timeseries'] + 'timeseries', + 'eval'] by_module = {} benchmarks = []
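A minimal usage sketch of the expression API that the tests and benchmarks above exercise (assumes this branch with numexpr installed; the frame, column names, and variables below are illustrative only and not part of the diff):

import numpy as np
import pandas as pd
from pandas import DataFrame

df = DataFrame(np.random.randn(10, 3), columns=list('abc'))

# top-level expression evaluation; engine='numexpr' is used by default
# when numexpr is installed, engine='python' falls back to plain Python
total = pd.eval('df.a + df.b * df.c')

# DataFrame.eval resolves column names directly inside the expression
col_sum = df.eval('a + b')

# DataFrame.query selects rows with a boolean expression; the pandas
# parser supports chained comparisons such as a < b < c
subset = df.query('a < b < c')
subset_py = df.query('a < b < c', engine='python')

# HDFStore where clauses accept the same string syntax for Terms, e.g.
# store.select('df', "ts >= Timestamp('2012-02-01') & columns=['A']")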