Skip to content

Commit bfebb9d

Browse files
committed
Backtick quotes are now tokenized. More tests and pytest fixtures
1 parent 22686fd commit bfebb9d

File tree

4 files changed

+101
-26
lines changed

4 files changed

+101
-26
lines changed

pandas/core/computation/common.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
import numpy as np
22

3-
from pandas.compat import reduce
3+
from pandas.compat import reduce, string_types
44

55
import pandas as pd
66

77

8+
# A token value Python's tokenizer probably will never use.
9+
_BACKTICK_QUOTED_STRING = 100
10+
11+
812
def _ensure_decoded(s):
913
""" if we have bytes, decode them to unicode """
1014
if isinstance(s, (np.bytes_, bytes)):
@@ -22,5 +26,13 @@ def _result_type_many(*arrays_and_dtypes):
2226
return reduce(np.result_type, arrays_and_dtypes)
2327

2428

29+
def clean_column_name_with_spaces(name):
30+
"""If name is a string containing spaces, replace each space with an
31+
underscore and add the "_BACKTICK_QUOTED_STRING_" prefix; else return it."""
32+
if not isinstance(name, string_types) or " " not in name:
33+
return name
34+
return "_BACKTICK_QUOTED_STRING_" + name.replace(" ", "_")
35+
36+
2537
class NameResolutionError(NameError):
2638
pass

pandas/core/computation/expr.py

+40-4
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
import ast
55
from functools import partial
6+
import itertools as it
7+
import operator as op
68
import tokenize
79

810
import numpy as np
@@ -13,6 +15,8 @@
1315
from pandas import compat
1416
from pandas.core import common as com
1517
from pandas.core.base import StringMixin
18+
from pandas.core.computation.common import (
19+
_BACKTICK_QUOTED_STRING, clean_column_name_with_spaces)
1620
from pandas.core.computation.ops import (
1721
_LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
1822
UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
@@ -31,7 +35,13 @@ def tokenize_string(source):
3135
A Python source code string
3236
"""
3337
line_reader = StringIO(source).readline
34-
for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
38+
token_generator = tokenize.generate_tokens(line_reader)
39+
for toknum, tokval, _, _, _ in token_generator:
40+
if tokval == '`':
41+
tokval = " ".join(it.takewhile(
42+
lambda tokval: tokval != '`',
43+
map(op.itemgetter(1), token_generator)))
44+
toknum = _BACKTICK_QUOTED_STRING
3545
yield toknum, tokval
3646

3747

@@ -102,6 +112,30 @@ def _replace_locals(tok):
102112
return toknum, tokval
103113

104114

115+
def _clean_spaces_backtick_quoted_names(tok):
116+
"""Clean up a column name if surrounded by backticks.
117+
118+
Backtick quoted strings are marked by a dedicated toknum value. If a token
119+
is a backtick quoted string it will be processed by
120+
:func:`clean_column_name_with_spaces` so that the parser can find this
121+
string when the query is executed. See also :meth:`DataFrame.eval`.
122+
123+
Parameters
124+
----------
125+
tok : tuple of int, str
126+
ints correspond to the all caps constants in the tokenize module
127+
128+
Returns
129+
-------
130+
t : tuple of int, str
131+
Either the input token or the replacement values
132+
"""
133+
toknum, tokval = tok
134+
if toknum == _BACKTICK_QUOTED_STRING:
135+
return tokenize.NAME, clean_column_name_with_spaces(tokval)
136+
return toknum, tokval
137+
138+
105139
def _compose2(f, g):
106140
"""Compose 2 callables: the result calls g, then passes its result to f"""
107141
return lambda *args, **kwargs: f(g(*args, **kwargs))
@@ -114,7 +148,8 @@ def _compose(*funcs):
114148

115149

116150
def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
117-
_rewrite_assign)):
151+
_rewrite_assign,
152+
_clean_spaces_backtick_quoted_names)):
118153
"""Compose a collection of tokenization functions
119154
120155
Parameters
@@ -711,8 +746,9 @@ def visitor(x, y):
711746
class PandasExprVisitor(BaseExprVisitor):
712747

713748
def __init__(self, env, engine, parser,
714-
preparser=partial(_preparse, f=_compose(_replace_locals,
715-
_replace_booleans))):
749+
preparser=partial(_preparse, f=_compose(
750+
_replace_locals, _replace_booleans,
751+
_clean_spaces_backtick_quoted_names))):
716752
super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)
717753

718754

pandas/core/frame.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
PY36, raise_with_traceback,
3737
string_and_binary_types)
3838
from pandas.compat.numpy import function as nv
39+
from pandas.core.computation.common import clean_column_name_with_spaces
3940
from pandas.core.dtypes.cast import (
4041
maybe_upcast,
4142
cast_scalar_to_array,
@@ -3160,7 +3161,13 @@ def eval(self, expr, inplace=False, **kwargs):
31603161
kwargs['level'] = kwargs.pop('level', 0) + 1
31613162
if resolvers is None:
31623163
index_resolvers = self._get_index_resolvers()
3163-
resolvers = dict(self.iteritems()), index_resolvers
3164+
# column names with spaces are altered so that they can be referred
3165+
# to by backtick quoting.
3166+
# Also see _clean_spaces_backtick_quoted_names from
3167+
# pandas/core/computation/expr.py
3168+
column_resolvers = {clean_column_name_with_spaces(k): v
3169+
for k, v in self.iteritems()}
3170+
resolvers = column_resolvers, index_resolvers
31643171
if 'target' not in kwargs:
31653172
kwargs['target'] = self
31663173
kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)

pandas/tests/frame/test_query_eval.py

+40-20
Original file line numberDiff line numberDiff line change
@@ -1034,30 +1034,50 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op):
10341034

10351035
class TestDataFrameQueryBacktickQuoting(object):
10361036

1037-
def setup_method(self, method):
1038-
self.df = DataFrame({'A': [1, 2, 3],
1039-
'B B': [3, 2, 1],
1040-
'C C': [4, 5, 6]})
1041-
1042-
def teardown_method(self, method):
1043-
del self.df
1044-
1045-
def test_single_backtick_variable_query(self):
1046-
res = self.df.query('1 < `B B`')
1047-
expect = self.df[1 < self.df['B B']]
1037+
@pytest.fixture(scope='class')
1038+
def df(self):
1039+
yield DataFrame({'A': [1, 2, 3],
1040+
'B B': [3, 2, 1],
1041+
'C C': [4, 5, 6],
1042+
'C_C': [8, 9, 10],
1043+
'D_D D': [11, 1, 101]})
1044+
1045+
def test_single_backtick_variable_query(self, df):
1046+
res = df.query('1 < `B B`')
1047+
expect = df[1 < df['B B']]
10481048
assert_frame_equal(res, expect)
10491049

1050-
def test_two_backtick_variables_query(self):
1051-
res = self.df.query('1 < `B B` and 4 < `C C`')
1052-
expect = self.df[(1 < self.df['B B']) & (4 < self.df['C C'])]
1050+
def test_two_backtick_variables_query(self, df):
1051+
res = df.query('1 < `B B` and 4 < `C C`')
1052+
expect = df[(1 < df['B B']) & (4 < df['C C'])]
10531053
assert_frame_equal(res, expect)
10541054

1055-
def test_single_backtick_variable_expr(self):
1056-
res = self.df.eval('A + `B B`')
1057-
expect = self.df['A'] + self.df['B B']
1055+
def test_single_backtick_variable_expr(self, df):
1056+
res = df.eval('A + `B B`')
1057+
expect = df['A'] + df['B B']
1058+
assert_series_equal(res, expect)
1059+
1060+
def test_two_backtick_variables_expr(self, df):
1061+
res = df.eval('`B B` + `C C`')
1062+
expect = df['B B'] + df['C C']
1063+
assert_series_equal(res, expect)
1064+
1065+
def test_already_underscore_variable(self, df):
1066+
res = df.eval('`C_C` + A')
1067+
expect = df['C_C'] + df['A']
1068+
assert_series_equal(res, expect)
1069+
1070+
def test_same_name_but_underscores(self, df):
1071+
res = df.eval('C_C + `C C`')
1072+
expect = df['C_C'] + df['C C']
1073+
assert_series_equal(res, expect)
1074+
1075+
def test_mixed_underscores_and_spaces(self, df):
1076+
res = df.eval('A + `D_D D`')
1077+
expect = df['A'] + df['D_D D']
10581078
assert_series_equal(res, expect)
10591079

1060-
def test_two_backtick_variables_expr(self):
1061-
res = self.df.eval('`B B` + `C C`')
1062-
expect = self.df['B B'] + self.df['C C']
1080+
def backtick_quote_name_with_no_spaces(self, df):
1081+
res = df.eval('A + `C_C`')
1082+
expect = df['A'] + df['C_C']
10631083
assert_series_equal(res, expect)

0 commit comments

Comments
 (0)