diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 946256d585c49..9503675af8681 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -570,18 +570,51 @@ prefix the name of the :class:`~pandas.DataFrame` to the column(s) you're interested in evaluating. In addition, you can perform assignment of columns within an expression. -This allows for *formulaic evaluation*. Only a single assignment is permitted. -The assignment target can be a new column name or an existing column name, and -it must be a valid Python identifier. +This allows for *formulaic evaluation*. The assignment target can be a +new column name or an existing column name, and it must be a valid Python +identifier. + +.. versionadded:: 0.18.0 + +The ``inplace`` keyword determines whether this assignment will be performed +on the original ``DataFrame`` or return a copy with the new column. + +.. warning:: + + For backwards compatibility, ``inplace`` defaults to ``True`` if not + specified. This will change in a future version of pandas - if your + code depends on an inplace assignment you should update to explicitly + set ``inplace=True`` .. ipython:: python df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) - df.eval('c = a + b') - df.eval('d = a + b + c') - df.eval('a = 1') + df.eval('c = a + b', inplace=True) + df.eval('d = a + b + c', inplace=True) + df.eval('a = 1', inplace=True) df +When ``inplace`` is set to ``False``, a copy of the ``DataFrame`` with the +new or modified columns is returned and the original frame is unchanged. + +.. ipython:: python + + df + df.eval('e = a - c', inplace=False) + df + +.. versionadded:: 0.18.0 + +As a convenience, multiple assignments can be performed by using a +multi-line string. + +.. ipython:: python + + df.eval(""" + c = a + b + d = a + b + c + a = 1""", inplace=False) + The equivalent in standard Python would be ..
ipython:: python @@ -592,6 +625,23 @@ The equivalent in standard Python would be df['a'] = 1 df +.. versionadded:: 0.18.0 + +The ``query`` method gained the ``inplace`` keyword which determines +whether the query modifies the original frame. + +.. ipython:: python + + df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) + df.query('a > 2') + df.query('a > 2', inplace=True) + df + +.. warning:: + + Unlike with ``eval``, the default value for ``inplace`` for ``query`` + is ``False``. This is consistent with prior versions of pandas. + Local Variables ~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 94c2dddbe1ef0..5c21e04251f44 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -295,15 +295,60 @@ Other API Changes - ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`) +Changes to eval +^^^^^^^^^^^^^^^ +In prior versions, new column assignments in an ``eval`` expression resulted +in an inplace change to the ``DataFrame``. (:issue:`9297`) +.. ipython:: python + df = pd.DataFrame({'a': np.linspace(0, 10, 5), 'b': range(5)}) + df.eval('c = a + b') + df +In version 0.18.0, a new ``inplace`` keyword was added to choose whether the +assignment should be done inplace or return a copy. +.. ipython:: python + df + df.eval('d = c - b', inplace=False) + df + df.eval('d = c - b', inplace=True) + df +.. warning:: + + For backwards compatibility, ``inplace`` defaults to ``True`` if not specified. + This will change in a future version of pandas - if your code depends on an + inplace assignment you should update to explicitly set ``inplace=True`` +The ``inplace`` keyword parameter was also added to the ``query`` method. +.. ipython:: python + + df.query('a > 5') + df.query('a > 5', inplace=True) + df + +.. warning:: + + Note that the default value for ``inplace`` in a ``query`` + is ``False``, which is consistent with prior versions.
+ +``eval`` has also been updated to allow multi-line expressions for multiple +assignments. These expressions will be evaluated one at a time in order. Only +assignments are valid for multi-line expressions. + +.. ipython:: python + + df + df.eval(""" + e = d + a + f = e - 22 + g = f / 2.0""", inplace=True) + df .. _whatsnew_0180.deprecations: @@ -410,7 +455,7 @@ Bug Fixes - Bug in ``pd.read_clipboard`` and ``pd.to_clipboard`` functions not supporting Unicode; upgrade included ``pyperclip`` to v1.5.15 (:issue:`9263`) - +- Bug in ``DataFrame.query`` containing an assignment (:issue:`8664`) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index e3096a85ca7d7..d2d16acc27fb6 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -3,11 +3,12 @@ """Top level ``eval`` module. """ +import warnings import tokenize from pandas.core import common as com from pandas.computation.expr import Expr, _parsers, tokenize_string from pandas.computation.scope import _ensure_scope -from pandas.compat import DeepChainMap, builtins +from pandas.compat import string_types from pandas.computation.engines import _engines from distutils.version import LooseVersion @@ -138,7 +139,7 @@ def _check_for_locals(expr, stack_level, parser): def eval(expr, parser='pandas', engine='numexpr', truediv=True, local_dict=None, global_dict=None, resolvers=(), level=0, - target=None): + target=None, inplace=None): """Evaluate a Python expression as a string using various backends. The following arithmetic operations are supported: ``+``, ``-``, ``*``, @@ -196,6 +197,13 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, scope. Most users will **not** need to change this parameter. target : a target object for assignment, optional, default is None essentially this is a passed in resolver + inplace : bool, default True + If expression mutates, whether to modify object inplace or return + copy with mutation.
+ + WARNING: inplace=None currently falls back to True, but + in a future version, will default to False. Use inplace=True + explicitly rather than relying on the default. Returns ------- @@ -214,29 +222,78 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, pandas.DataFrame.query pandas.DataFrame.eval """ - expr = _convert_expression(expr) - _check_engine(engine) - _check_parser(parser) - _check_resolvers(resolvers) - _check_for_locals(expr, level, parser) - - # get our (possibly passed-in) scope - level += 1 - env = _ensure_scope(level, global_dict=global_dict, - local_dict=local_dict, resolvers=resolvers, - target=target) - - parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, - truediv=truediv) - - # construct the engine and evaluate the parsed expression - eng = _engines[engine] - eng_inst = eng(parsed_expr) - ret = eng_inst.evaluate() - - # assign if needed - if env.target is not None and parsed_expr.assigner is not None: - env.target[parsed_expr.assigner] = ret - return None + first_expr = True + if isinstance(expr, string_types): + exprs = [e for e in expr.splitlines() if e != ''] + else: + exprs = [expr] + multi_line = len(exprs) > 1 + + if multi_line and target is None: + raise ValueError("multi-line expressions are only valid in the " + "context of data, use DataFrame.eval") + + first_expr = True + for expr in exprs: + expr = _convert_expression(expr) + _check_engine(engine) + _check_parser(parser) + _check_resolvers(resolvers) + _check_for_locals(expr, level, parser) + + # get our (possibly passed-in) scope + level += 1 + env = _ensure_scope(level, global_dict=global_dict, + local_dict=local_dict, resolvers=resolvers, + target=target) + + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, + truediv=truediv) + + # construct the engine and evaluate the parsed expression + eng = _engines[engine] + eng_inst = eng(parsed_expr) + ret = eng_inst.evaluate() + + if parsed_expr.assigner is None and multi_line: + raise
ValueError("Multi-line expressions are only valid" + " if all expressions contain an assignment") + + # assign if needed + if env.target is not None and parsed_expr.assigner is not None: + if inplace is None: + warnings.warn( + "eval expressions containing an assignment currently" + "default to operating inplace.\nThis will change in " + "a future version of pandas, use inplace=True to " + "avoid this warning.", + FutureWarning, stacklevel=3) + inplace = True + + # if returning a copy, copy only on the first assignment + if not inplace and first_expr: + target = env.target.copy() + else: + target = env.target + + target[parsed_expr.assigner] = ret + + if not resolvers: + resolvers = ({parsed_expr.assigner: ret},) + else: + # existing resolver needs updated to handle + # case of mutating existing column in copy + for resolver in resolvers: + if parsed_expr.assigner in resolver: + resolver[parsed_expr.assigner] = ret + break + else: + resolvers += ({parsed_expr.assigner: ret},) + + ret = None + first_expr = False + + if not inplace and inplace is not None: + return target return ret diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 7474c0d118612..3c529c26b453d 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -663,6 +663,13 @@ def test_identical(self): tm.assert_numpy_array_equal(result, np.array([False])) self.assertEqual(result.shape, (1, )) + def test_line_continuation(self): + # GH 11149 + exp = """1 + 2 * \ + 5 - 1 + 2 """ + result = pd.eval(exp, engine=self.engine, parser=self.parser) + self.assertEqual(result, 12) + class TestEvalNumexprPython(TestEvalNumexprPandas): @@ -1220,21 +1227,21 @@ def test_assignment_column(self): expected = orig_df.copy() expected['a'] = expected['a'] + expected['b'] df = orig_df.copy() - df.eval('a = a + b') + df.eval('a = a + b', inplace=True) assert_frame_equal(df, expected) # single assignment - new variable expected = orig_df.copy() 
expected['c'] = expected['a'] + expected['b'] df = orig_df.copy() - df.eval('c = a + b') + df.eval('c = a + b', inplace=True) assert_frame_equal(df, expected) # with a local name overlap def f(): df = orig_df.copy() - a = 1 - df.eval('a = 1 + b') + a = 1 # noqa + df.eval('a = 1 + b', inplace=True) return df df = f() @@ -1245,9 +1252,9 @@ def f(): df = orig_df.copy() def f(): - a = 1 + a = 1 # noqa old_a = df.a.copy() - df.eval('a = a + b') + df.eval('a = a + b', inplace=True) result = old_a + df.b assert_series_equal(result, df.a, check_names=False) self.assertTrue(result.name is None) @@ -1256,12 +1263,13 @@ def f(): # multiple assignment df = orig_df.copy() - df.eval('c = a + b') + df.eval('c = a + b', inplace=True) self.assertRaises(SyntaxError, df.eval, 'c = a = b') # explicit targets df = orig_df.copy() - self.eval('c = df.a + df.b', local_dict={'df': df}, target=df) + self.eval('c = df.a + df.b', local_dict={'df': df}, + target=df, inplace=True) expected = orig_df.copy() expected['c'] = expected['a'] + expected['b'] assert_frame_equal(df, expected) @@ -1273,6 +1281,88 @@ def test_column_in(self): expected = Series([True]) assert_series_equal(result, expected) + def assignment_not_inplace(self): + # GH 9297 + tm.skip_if_no_ne('numexpr') + df = DataFrame(np.random.randn(5, 2), columns=list('ab')) + + actual = df.eval('c = a + b', inplace=False) # noqa + expected = df.copy() + expected['c'] = expected['a'] + expected['b'] + assert_frame_equal(df, expected) + + # default for inplace will change + with tm.assert_produces_warnings(FutureWarning): + df.eval('c = a + b') + + # but don't warn without assignment + with tm.assert_produces_warnings(None): + df.eval('a + b') + + def test_multi_line_expression(self): + # GH 11149 + tm.skip_if_no_ne('numexpr') + df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + expected = df.copy() + + expected['c'] = expected['a'] + expected['b'] + expected['d'] = expected['c'] + expected['b'] + ans = df.eval(""" + c = a + b + d = c + 
b""", inplace=True) + assert_frame_equal(expected, df) + self.assertIsNone(ans) + + expected['a'] = expected['a'] - 1 + expected['e'] = expected['a'] + 2 + ans = df.eval(""" + a = a - 1 + e = a + 2""", inplace=True) + assert_frame_equal(expected, df) + self.assertIsNone(ans) + + # multi-line not valid if not all assignments + with tm.assertRaises(ValueError): + df.eval(""" + a = b + 2 + b - 2""", inplace=False) + + def test_multi_line_expression_not_inplace(self): + # GH 11149 + tm.skip_if_no_ne('numexpr') + df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + expected = df.copy() + + expected['c'] = expected['a'] + expected['b'] + expected['d'] = expected['c'] + expected['b'] + df = df.eval(""" + c = a + b + d = c + b""", inplace=False) + assert_frame_equal(expected, df) + + expected['a'] = expected['a'] - 1 + expected['e'] = expected['a'] + 2 + df = df.eval(""" + a = a - 1 + e = a + 2""", inplace=False) + assert_frame_equal(expected, df) + + def test_assignment_in_query(self): + # GH 8664 + df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df_orig = df.copy() + with tm.assertRaises(ValueError): + df.query('a = 1') + assert_frame_equal(df, df_orig) + + def query_inplace(self): + # GH 11149 + df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + expected = df.copy() + expected = expected[expected['a'] == 2] + df.query('a == 2', inplace=True) + assert_frame_equal(expected, df) + def test_basic_period_index_boolean_expression(self): df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') @@ -1502,7 +1592,7 @@ def test_df_use_case(self): 'b': np.random.randn(10)}) df.eval("e = arctan2(sin(a), b)", engine=self.engine, - parser=self.parser) + parser=self.parser, inplace=True) got = df.e expect = np.arctan2(np.sin(df.a), df.b) pd.util.testing.assert_almost_equal(got, expect) @@ -1512,7 +1602,7 @@ def test_df_arithmetic_subexpression(self): 'b': np.random.randn(10)}) df.eval("e = sin(a + b)", engine=self.engine, - parser=self.parser) + parser=self.parser, 
inplace=True) got = df.e expect = np.sin(df.a + df.b) pd.util.testing.assert_almost_equal(got, expect) @@ -1522,7 +1612,7 @@ def check_result_type(self, dtype, expect_dtype): self.assertEqual(df.a.dtype, dtype) df.eval("b = sin(a)", engine=self.engine, - parser=self.parser) + parser=self.parser, inplace=True) got = df.b expect = np.sin(df.a) self.assertEqual(expect.dtype, got.dtype) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b66c51bc4411e..6207ac5dc5c12 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2051,7 +2051,7 @@ def _getitem_frame(self, key): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) - def query(self, expr, **kwargs): + def query(self, expr, inplace=False, **kwargs): """Query the columns of a frame with a boolean expression. .. versionadded:: 0.13 @@ -2062,6 +2062,12 @@ def query(self, expr, **kwargs): The query string to evaluate. You can refer to variables in the environment by prefixing them with an '@' character like ``@a + b``. + inplace : bool + Whether the query should modify the data in place or return + a modified copy + + .. versionadded:: 0.18.0 + kwargs : dict See the documentation for :func:`pandas.eval` for complete details on the keyword arguments accepted by :meth:`DataFrame.query`. @@ -2118,16 +2124,22 @@ def query(self, expr, **kwargs): >>> df[df.a > df.b] # same result as the previous expression """ kwargs['level'] = kwargs.pop('level', 0) + 1 + kwargs['target'] = None res = self.eval(expr, **kwargs) try: - return self.loc[res] + new_data = self.loc[res] except ValueError: # when res is multi-dimensional loc raises, but this is sometimes a # valid query - return self[res] + new_data = self[res] - def eval(self, expr, **kwargs): + if inplace: + self._update_inplace(new_data) + else: + return new_data + + def eval(self, expr, inplace=None, **kwargs): """Evaluate an expression in the context of the calling DataFrame instance. 
@@ -2135,6 +2147,16 @@ def eval(self, expr, **kwargs): ---------- expr : string The expression string to evaluate. + inplace : bool + If the expression contains an assignment, whether to return a new + DataFrame or mutate the existing. + + WARNING: inplace=None currently falls back to True, but + in a future version, will default to False. Use inplace=True + explicitly rather than relying on the default. + + .. versionadded:: 0.18.0 + kwargs : dict See the documentation for :func:`~pandas.eval` for complete details on the keyword arguments accepted by @@ -2147,6 +2169,7 @@ def eval(self, expr, **kwargs): See Also -------- pandas.DataFrame.query + pandas.DataFrame.assign pandas.eval Notes @@ -2168,9 +2191,10 @@ def eval(self, expr, **kwargs): if resolvers is None: index_resolvers = self._get_index_resolvers() resolvers = dict(self.iteritems()), index_resolvers - kwargs['target'] = self + if 'target' not in kwargs: + kwargs['target'] = self kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers - return _eval(expr, inplace=inplace, **kwargs) def select_dtypes(self, include=None, exclude=None): """Return a subset of a DataFrame including/excluding columns based on