CLN: correct in and not in

cpcloud · cpcloud · commit b8a3ba3bcecd · 2013-09-16T10:38:19.000-04:00
Also added tests for NaN with ``in`` and ``not in``, and disallowed ops like

    pd.eval('1 or 2')

since those should be performed in regular Python.
diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
@@ -384,6 +384,14 @@ Now let's do the same thing but with comparisons:
 
    %timeit pd.eval('df1 + df2 + df3 + df4 + s')
 
+.. note::
+
+   Operations such as ``1 and 2`` should be performed in Python. An exception
+   will be raised if you try to perform any boolean or bitwise operations
+   with scalar operands that are not of type ``bool`` or ``np.bool_``. *This
+   includes bitwise operations on scalars.* You should perform these kinds of
+   operations in Python.
+
 The ``DataFrame.eval`` method
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -393,7 +401,7 @@ evaluate an expression in the "context" of a ``DataFrame``.
 
 .. ipython:: python
 
-   df = DataFrame(randn(10, 2), columns=['a', 'b'])
+   df = DataFrame(randn(5, 2), columns=['a', 'b'])
    df.eval('a + b')
 
 
@@ -410,7 +418,7 @@ You can refer to local variables the same way you would in vanilla Python
 
 .. ipython:: python
 
-   df = DataFrame(randn(10, 2), columns=['a', 'b'])
+   df = DataFrame(randn(5, 2), columns=['a', 'b'])
    newcol = randn(len(df))
    df.eval('b + newcol')
 
@@ -419,16 +427,22 @@ You can refer to local variables the same way you would in vanilla Python
    The one exception is when you have a local (or global) with the same name as
    a column in the ``DataFrame``
 
-    .. ipython:: python
-       :okexcept:
+    .. code-block:: python
 
-       df = DataFrame(randn(10, 2), columns=['a', 'b'])
+       df = DataFrame(randn(5, 2), columns=['a', 'b'])
        a = randn(len(df))
        df.eval('a + b')
+       NameResolutionError: resolvers and locals overlap on names ['a']
+
 
    To deal with these conflicts, a special syntax exists for referring
    variables with the same name as a column
 
+    .. ipython:: python
+       :suppress:
+
+       a = randn(len(df))
+
     .. ipython:: python
 
        df.eval('@a + b')
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -1014,8 +1014,7 @@ The :meth:`~pandas.DataFrame.query` Method
 .. versionadded:: 0.13
 
 :class:`~pandas.DataFrame` objects have a :meth:`~pandas.DataFrame.query`
-method that allows selection using a string consisting of columns of the
-calling :class:`~pandas.DataFrame`.
+method that allows selection using a boolean expression.
 
 You can get the value of the frame where column ``b`` has values
 between the values of columns ``a`` and ``c``.
@@ -1027,7 +1026,7 @@ between the values of columns ``a`` and ``c``.
 
 .. ipython:: python
 
-   n = 20
+   n = 10
    df = DataFrame(rand(n, 3), columns=list('abc'))
    df
    df[(df.a < df.b) & (df.b < df.c)]
@@ -1038,7 +1037,7 @@ with the name ``a``.
 
 .. ipython:: python
 
-   df = DataFrame(randint(n, size=(n, 2)), columns=list('bc'))
+   df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc'))
    df.index.name = 'a'
    df
    df.query('a < b and b < c')
@@ -1075,13 +1074,14 @@ You can also use the levels of a ``DataFrame`` with a
 
    import pandas.util.testing as tm
 
-   colors = tm.choice(['red', 'green'], size=10)
-   foods = tm.choice(['eggs', 'ham'], size=10)
+   n = 10
+   colors = tm.choice(['red', 'green'], size=n)
+   foods = tm.choice(['eggs', 'ham'], size=n)
    colors
    foods
 
    index = MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
-   df = DataFrame(randn(10, 2), index=index)
+   df = DataFrame(randn(n, 2), index=index)
    df
    df.query('color == "red"')
 
@@ -1091,8 +1091,7 @@ special names:
 
 .. ipython:: python
 
-   index.names = [None, None]
-   df = DataFrame(randn(10, 2), index=index)
+   df.index.names = [None, None]
    df
    df.query('ilevel_0 == "red"')
 
@@ -1111,9 +1110,9 @@ having to specify which frame you're interested in querying
 
 .. ipython:: python
 
-   df = DataFrame(randint(n, size=(n, 2)), columns=list('bc'))
+   df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc'))
    df.index.name = 'a'
-   df2 = DataFrame(randint(n + 10, size=(n + 10, 3)), columns=list('abc'))
+   df2 = DataFrame(randint(n + 5, size=(n + 5, 3)), columns=list('abc'))
    df2
    expr = 'a < b & b < c'
    map(lambda frame: frame.query(expr), [df, df2])
@@ -1141,7 +1140,7 @@ Full numpy-like syntax
 
 .. ipython:: python
 
-   df = DataFrame(randint(n, size=(n, 3)), columns=list('abc'))
+   df = DataFrame(randint(n / 2, size=(n, 3)), columns=list('abc'))
    df
    df['(a < b) & (b < c)']
    df[(df.a < df.b) & (df.b < df.c)]
@@ -1164,10 +1163,6 @@ Pretty close to how you might write it on paper
 
    df['a < b < c']
 
-As you can see, these are all equivalent ways to express the same operation (in
-fact, they are all ultimately parsed into something very similar to the first
-example of the indexing syntax above).
-
 The ``in`` and ``not in`` operators
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1184,7 +1179,7 @@ The ``in`` and ``not in`` operators
 .. ipython:: python
 
    # get all rows where columns "a" and "b" have overlapping values
-   df = DataFrame({'a': list('aaaabbbbcccc'), 'b': list('aabbccddeeff'),
+   df = DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
                    'c': randint(5, size=12), 'd': randint(9, size=12)})
    df
    df['a in b']
diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py
@@ -461,10 +461,10 @@ def _rewrite_membership_op(self, node, left, right):
                 name = self.env.add_tmp([right.value])
                 right = self.term_type(name, self.env)
 
-            # swap the operands so things like a == [1, 2] are translated to
-            # [1, 2] in a -> a.isin([1, 2])
-            if right_list or right_str:
-                left, right = right, left
+            if left_str:
+                self.env.remove_tmp(left.name)
+                name = self.env.add_tmp([left.value])
+                left = self.term_type(name, self.env)
 
         op = self.visit(op_instance)
         return op, op_instance, left, right
@@ -662,13 +662,14 @@ def visitor(x, y):
         return reduce(visitor, operands)
 
 
-_python_not_supported = frozenset(['Assign', 'Tuple', 'Dict', 'Call',
-                                   'BoolOp', 'In', 'NotIn'])
+_python_not_supported = frozenset(['Assign', 'Dict', 'Call', 'BoolOp',
+                                   'In', 'NotIn'])
 _numexpr_supported_calls = frozenset(_reductions + _mathops)
 
 
 @disallow((_unsupported_nodes | _python_not_supported) -
-          (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn'])))
+          (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn',
+                                      'Tuple'])))
 class PandasExprVisitor(BaseExprVisitor):
     def __init__(self, env, engine, parser,
                  preparser=lambda x: _replace_locals(_replace_booleans(x))):
diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py
@@ -193,6 +193,13 @@ def name(self):
     def name(self, new_name):
         self._name = new_name
 
+    @property
+    def ndim(self):
+        try:
+            return self._value.ndim
+        except AttributeError:
+            return 0
+
 
 class Constant(Term):
     def __init__(self, value, env, side=None, encoding=None):
@@ -207,6 +214,7 @@ def name(self):
         return self.value
 
 
+
 _bool_op_map = {'not': '~', 'and': '&', 'or': '|'}
 
 
@@ -236,29 +244,39 @@ def return_type(self):
             return np.bool_
         return np.result_type(*(term.type for term in com.flatten(self)))
 
+    @property
+    def isscalar(self):
+        return all(operand.isscalar for operand in self.operands)
+
 
 def _in(x, y):
     """Compute the vectorized membership of ``x in y`` if possible, otherwise
     use Python.
     """
     try:
-        return y.isin(x)
+        return x.isin(y)
     except AttributeError:
+        if com.is_list_like(x):
+            try:
+                return y.isin(x)
+            except AttributeError:
+                pass
         return x in y
-    except TypeError:
-        return y.isin([x])
 
 
 def _not_in(x, y):
     """Compute the vectorized membership of ``x not in y`` if possible,
     otherwise use Python.
     """
     try:
-        return ~y.isin(x)
+        return ~x.isin(y)
     except AttributeError:
+        if com.is_list_like(x):
+            try:
+                return ~y.isin(x)
+            except AttributeError:
+                pass
         return x not in y
-    except TypeError:
-        return ~y.isin([x])
 
 
 _cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', 'in', 'not in'
@@ -322,14 +340,17 @@ def __init__(self, op, lhs, rhs, **kwargs):
         self.lhs = lhs
         self.rhs = rhs
 
+        self._disallow_scalar_only_bool_ops()
+
         self.convert_values()
 
         try:
             self.func = _binary_ops_dict[op]
         except KeyError:
-            keys = _binary_ops_dict.keys()
+            # has to be made a list for python3
+            keys = list(_binary_ops_dict.keys())
             raise ValueError('Invalid binary operator {0!r}, valid'
-                                      ' operators are {1}'.format(op, keys))
+                             ' operators are {1}'.format(op, keys))
 
     def __call__(self, env):
         """Recursively evaluate an expression in Python space.
@@ -425,6 +446,13 @@ def stringify(value):
                 v = v.tz_convert('UTC')
             self.lhs.update(v)
 
+    def _disallow_scalar_only_bool_ops(self):
+        if ((self.lhs.isscalar or self.rhs.isscalar) and
+            self.op in _bool_ops_dict and
+            (not (issubclass(self.rhs.return_type, (bool, np.bool_)) and
+                  issubclass(self.lhs.return_type, (bool, np.bool_))))):
+            raise NotImplementedError("cannot evaluate scalar only bool ops")
+
 
 class Div(BinOp):
     """Div operator to special case casting.
diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py