Skip to content

ENH: Quoting column names containing spaces with backticks to use them in query and eval. #24955

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Mar 20, 2019
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Other Enhancements
- :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`)
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now support quoting column names with backticks to refer to names with spaces (:issue:`6508`)
-

.. _whatsnew_0250.api_breaking:
Expand Down
25 changes: 24 additions & 1 deletion pandas/core/computation/common.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import numpy as np

from pandas.compat import reduce
from pandas.compat import reduce, string_types

import pandas as pd

# Token number used to tag backtick-quoted names during tokenization.
# A token value Python's tokenizer probably will never use.
_BACKTICK_QUOTED_STRING = 100


def _ensure_decoded(s):
""" if we have bytes, decode them to unicode """
Expand All @@ -22,5 +25,25 @@ def _result_type_many(*arrays_and_dtypes):
return reduce(np.result_type, arrays_and_dtypes)


def _clean_column_name_with_spaces(name):
    """Rewrite a column name that contains spaces into a space-free name.

    Parameters
    ----------
    name : object
        Column label to clean.

    Returns
    -------
    object
        ``name`` itself when it is not a string or holds no space.
        Otherwise a new string in which every space is replaced by an
        underscore and which is prefixed with ``_BACKTICK_QUOTED_STRING_``.
    """
    needs_cleaning = isinstance(name, string_types) and " " in name
    if needs_cleaning:
        # Splitting on a literal space and re-joining with underscores swaps
        # each space for exactly one underscore.
        return "_BACKTICK_QUOTED_STRING_" + "_".join(name.split(" "))
    return name


def _get_column_resolvers(dataFrame):
    """Build the column resolvers of a dataframe.

    Column names with spaces are 'cleaned up' so that they can be referred to
    by backtick quoting. See also :func:`_clean_spaces_backtick_quoted_names`
    from :mod:`pandas.core.computation.expr`.

    Parameters
    ----------
    dataFrame : object
        Object exposing ``iteritems()`` that yields ``(name, value)`` pairs.

    Returns
    -------
    dict
        Maps each name, after :func:`_clean_column_name_with_spaces`, to the
        value yielded alongside it.
    """
    resolvers = {}
    for label, column in dataFrame.iteritems():
        # Labels that clean to the same key overwrite earlier entries.
        resolvers[_clean_column_name_with_spaces(label)] = column
    return resolvers


class NameResolutionError(NameError):
pass
47 changes: 42 additions & 5 deletions pandas/core/computation/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,20 @@

import ast
from functools import partial
import itertools as it
import operator
import tokenize

import numpy as np

from pandas.compat import StringIO, lmap, reduce, string_types, zip
from pandas.compat import StringIO, lmap, map, reduce, string_types, zip

import pandas as pd
from pandas import compat
from pandas.core import common as com
from pandas.core.base import StringMixin
from pandas.core.computation.common import (
_BACKTICK_QUOTED_STRING, _clean_column_name_with_spaces)
from pandas.core.computation.ops import (
_LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
Expand All @@ -31,7 +35,13 @@ def tokenize_string(source):
A Python source code string
"""
line_reader = StringIO(source).readline
for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
token_generator = tokenize.generate_tokens(line_reader)
for toknum, tokval, _, _, _ in token_generator:
if tokval == '`':
tokval = " ".join(it.takewhile(
lambda tokval: tokval != '`',
map(operator.itemgetter(1), token_generator)))
toknum = _BACKTICK_QUOTED_STRING
yield toknum, tokval


Expand Down Expand Up @@ -102,6 +112,31 @@ def _replace_locals(tok):
return toknum, tokval


def _clean_spaces_backtick_quoted_names(tok):
    """Clean up a column name if it was surrounded by backticks.

    Backtick quoted strings are marked by the token number
    ``_BACKTICK_QUOTED_STRING``. Such a token is passed through
    :func:`_clean_column_name_with_spaces` and re-tagged as a ``NAME`` token
    so that the parser can find the name when the query is executed. See also
    :func:`_get_column_resolvers` used in :meth:`DataFrame.eval`.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    t : tuple of int, str
        Either the input token or the replacement values
    """
    toknum, tokval = tok
    # Anything that is not a backtick-quoted token passes through untouched.
    if toknum != _BACKTICK_QUOTED_STRING:
        return toknum, tokval
    return tokenize.NAME, _clean_column_name_with_spaces(tokval)


def _compose2(f, g):
"""Compose 2 callables"""
return lambda *args, **kwargs: f(g(*args, **kwargs))
Expand All @@ -114,7 +149,8 @@ def _compose(*funcs):


def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
_rewrite_assign)):
_rewrite_assign,
_clean_spaces_backtick_quoted_names)):
"""Compose a collection of tokenization functions

Parameters
Expand Down Expand Up @@ -711,8 +747,9 @@ def visitor(x, y):
class PandasExprVisitor(BaseExprVisitor):

def __init__(self, env, engine, parser,
preparser=partial(_preparse, f=_compose(_replace_locals,
_replace_booleans))):
preparser=partial(_preparse, f=_compose(
_replace_locals, _replace_booleans,
_clean_spaces_backtick_quoted_names))):
super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)


Expand Down
11 changes: 10 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2967,6 +2967,12 @@ def query(self, expr, inplace=False, **kwargs):
The query string to evaluate. You can refer to variables
in the environment by prefixing them with an '@' character like
``@a + b``.

.. versionadded:: 0.25.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add an example in the Examples section as well

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, but don't know what this means:

1 Warnings found:
No extended summary found
Docstring for "pandas.DataFrame.query" correct. :)


You can refer to column names that contain spaces by surrounding
them in backticks like ```a a` + b``.

inplace : bool
Whether the query should modify the data in place or return
a modified copy.
Expand Down Expand Up @@ -3159,8 +3165,11 @@ def eval(self, expr, inplace=False, **kwargs):
resolvers = kwargs.pop('resolvers', None)
kwargs['level'] = kwargs.pop('level', 0) + 1
if resolvers is None:
from pandas.core.computation.common import _get_column_resolvers

index_resolvers = self._get_index_resolvers()
resolvers = dict(self.iteritems()), index_resolvers
column_resolvers = _get_column_resolvers(self)
resolvers = column_resolvers, index_resolvers
if 'target' not in kwargs:
kwargs['target'] = self
kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,3 +1030,54 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op):

with pytest.raises(TypeError, match=msg):
df.eval('a {0} b'.format(op), engine=engine, parser=parser)


class TestDataFrameQueryBacktickQuoting(object):
    """Tests for backtick-quoted column names in ``query`` and ``eval``."""

    @pytest.fixture(scope='class')
    def df(self):
        # Mix of plain names, names with spaces, and a pair ('C C' / 'C_C')
        # that only differs by space versus underscore.
        yield DataFrame({'A': [1, 2, 3],
                         'B B': [3, 2, 1],
                         'C C': [4, 5, 6],
                         'C_C': [8, 9, 10],
                         'D_D D': [11, 1, 101]})

    def test_single_backtick_variable_query(self, df):
        res = df.query('1 < `B B`')
        expect = df[1 < df['B B']]
        assert_frame_equal(res, expect)

    def test_two_backtick_variables_query(self, df):
        res = df.query('1 < `B B` and 4 < `C C`')
        expect = df[(1 < df['B B']) & (4 < df['C C'])]
        assert_frame_equal(res, expect)

    def test_single_backtick_variable_expr(self, df):
        res = df.eval('A + `B B`')
        expect = df['A'] + df['B B']
        assert_series_equal(res, expect)

    def test_two_backtick_variables_expr(self, df):
        res = df.eval('`B B` + `C C`')
        expect = df['B B'] + df['C C']
        assert_series_equal(res, expect)

    def test_already_underscore_variable(self, df):
        res = df.eval('`C_C` + A')
        expect = df['C_C'] + df['A']
        assert_series_equal(res, expect)

    def test_same_name_but_underscores(self, df):
        res = df.eval('C_C + `C C`')
        expect = df['C_C'] + df['C C']
        assert_series_equal(res, expect)

    def test_mixed_underscores_and_spaces(self, df):
        res = df.eval('A + `D_D D`')
        expect = df['A'] + df['D_D D']
        assert_series_equal(res, expect)

    # Named with the ``test_`` prefix so that pytest collects and runs it.
    def test_backtick_quote_name_with_no_spaces(self, df):
        res = df.eval('A + `C_C`')
        expect = df['A'] + df['C_C']
        assert_series_equal(res, expect)