Skip to content

Commit 02ada08

Browse files
hwalingajreback
authored andcommitted
ENH: Quoting column names containing spaces with backticks to use them in query and eval. (#24955)
1 parent 6e979d8 commit 02ada08

File tree

6 files changed

+160
-18
lines changed

6 files changed

+160
-18
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Other Enhancements
2929
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
3030
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
3131
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
32+
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now support quoting column names with backticks to refer to names with spaces (:issue:`6508`)
3233

3334
.. _whatsnew_0250.api_breaking:
3435

pandas/core/computation/common.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
import numpy as np
22

3-
from pandas.compat import reduce
3+
from pandas.compat import reduce, string_types
44

55
import pandas as pd
66

7+
# A token value Python's tokenizer probably will never use.
8+
_BACKTICK_QUOTED_STRING = 100
9+
710

811
def _ensure_decoded(s):
912
""" if we have bytes, decode them to unicode """
@@ -22,5 +25,14 @@ def _result_type_many(*arrays_and_dtypes):
2225
return reduce(np.result_type, arrays_and_dtypes)
2326

2427

28+
def _remove_spaces_column_name(name):
    """Return a space-free stand-in for a column name.

    Parameters
    ----------
    name : object
        A column label. Only string labels that contain at least one space
        character are rewritten.

    Returns
    -------
    object
        ``name`` itself when it is not a string or contains no space.
        Otherwise a new string in which every space is replaced by an
        underscore and the literal suffix ``_BACKTICK_QUOTED_STRING`` is
        appended.
    """
    if isinstance(name, string_types) and " " in name:
        # The suffix keeps the rewritten name distinct from a label that
        # already uses underscores where this one has spaces
        # (e.g. 'C C' vs 'C_C').
        underscored = name.replace(" ", "_")
        return underscored + "_BACKTICK_QUOTED_STRING"

    return name
35+
36+
2537
class NameResolutionError(NameError):
2638
pass

pandas/core/computation/expr.py

+46-5
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,20 @@
33

44
import ast
55
from functools import partial
6+
import itertools as it
7+
import operator
68
import tokenize
79

810
import numpy as np
911

10-
from pandas.compat import StringIO, lmap, reduce, string_types, zip
12+
from pandas.compat import StringIO, lmap, map, reduce, string_types, zip
1113

1214
import pandas as pd
1315
from pandas import compat
1416
from pandas.core import common as com
1517
from pandas.core.base import StringMixin
18+
from pandas.core.computation.common import (
19+
_BACKTICK_QUOTED_STRING, _remove_spaces_column_name)
1620
from pandas.core.computation.ops import (
1721
_LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
1822
UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
@@ -31,7 +35,17 @@ def tokenize_string(source):
3135
A Python source code string
3236
"""
3337
line_reader = StringIO(source).readline
34-
for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
38+
token_generator = tokenize.generate_tokens(line_reader)
39+
40+
# Loop over all tokens till a backtick (`) is found.
41+
# Then, take all tokens till the next backtick to form a backtick quoted
42+
# string.
43+
for toknum, tokval, _, _, _ in token_generator:
44+
if tokval == '`':
45+
tokval = " ".join(it.takewhile(
46+
lambda tokval: tokval != '`',
47+
map(operator.itemgetter(1), token_generator)))
48+
toknum = _BACKTICK_QUOTED_STRING
3549
yield toknum, tokval
3650

3751

@@ -102,6 +116,31 @@ def _replace_locals(tok):
102116
return toknum, tokval
103117

104118

119+
def _clean_spaces_backtick_quoted_names(tok):
    """Turn a backtick quoted token into an ordinary name token.

    Backtick quoted strings are marked by the token number
    ``_BACKTICK_QUOTED_STRING``. Such a token is passed through
    :func:`_remove_spaces_column_name` and re-tagged as ``tokenize.NAME`` so
    that the parser can find the name when the query is executed.
    See also :meth:`NDFrame._get_space_character_free_column_resolvers`.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    t : tuple of int, str
        Either the input token or the replacement values
    """
    kind, text = tok
    if kind != _BACKTICK_QUOTED_STRING:
        # Not a backtick quoted string: pass the token through unchanged.
        return kind, text
    return tokenize.NAME, _remove_spaces_column_name(text)
142+
143+
105144
def _compose2(f, g):
106145
"""Compose 2 callables"""
107146
return lambda *args, **kwargs: f(g(*args, **kwargs))
@@ -114,7 +153,8 @@ def _compose(*funcs):
114153

115154

116155
def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
117-
_rewrite_assign)):
156+
_rewrite_assign,
157+
_clean_spaces_backtick_quoted_names)):
118158
"""Compose a collection of tokenization functions
119159
120160
Parameters
@@ -711,8 +751,9 @@ def visitor(x, y):
711751
class PandasExprVisitor(BaseExprVisitor):
712752

713753
def __init__(self, env, engine, parser,
714-
preparser=partial(_preparse, f=_compose(_replace_locals,
715-
_replace_booleans))):
754+
preparser=partial(_preparse, f=_compose(
755+
_replace_locals, _replace_booleans,
756+
_clean_spaces_backtick_quoted_names))):
716757
super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)
717758

718759

pandas/core/frame.py

+37-12
Original file line numberDiff line numberDiff line change
@@ -2967,6 +2967,15 @@ def query(self, expr, inplace=False, **kwargs):
29672967
The query string to evaluate. You can refer to variables
29682968
in the environment by prefixing them with an '@' character like
29692969
``@a + b``.
2970+
2971+
.. versionadded:: 0.25.0
2972+
2973+
You can refer to column names that contain spaces by surrounding
2974+
them in backticks.
2975+
2976+
For example, if one of your columns is called ``a a`` and you want
2977+
to sum it with ``b``, your query should be ```a a` + b``.
2978+
29702979
inplace : bool
29712980
Whether the query should modify the data in place or return
29722981
a modified copy.
@@ -3025,23 +3034,37 @@ def query(self, expr, inplace=False, **kwargs):
30253034
30263035
Examples
30273036
--------
3028-
>>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
3037+
>>> df = pd.DataFrame({'A': range(1, 6),
3038+
... 'B': range(10, 0, -2),
3039+
... 'C C': range(10, 5, -1)})
30293040
>>> df
3030-
A B
3031-
0 1 10
3032-
1 2 8
3033-
2 3 6
3034-
3 4 4
3035-
4 5 2
3041+
A B C C
3042+
0 1 10 10
3043+
1 2 8 9
3044+
2 3 6 8
3045+
3 4 4 7
3046+
4 5 2 6
30363047
>>> df.query('A > B')
3037-
A B
3038-
4 5 2
3048+
A B C C
3049+
4 5 2 6
30393050
30403051
The previous expression is equivalent to
30413052
30423053
>>> df[df.A > df.B]
3043-
A B
3044-
4 5 2
3054+
A B C C
3055+
4 5 2 6
3056+
3057+
For columns with spaces in their name, you can use backtick quoting.
3058+
3059+
>>> df.query('B == `C C`')
3060+
A B C C
3061+
0 1 10 10
3062+
3063+
The previous expression is equivalent to
3064+
3065+
>>> df[df.B == df['C C']]
3066+
A B C C
3067+
0 1 10 10
30453068
"""
30463069
inplace = validate_bool_kwarg(inplace, 'inplace')
30473070
if not isinstance(expr, compat.string_types):
@@ -3160,7 +3183,9 @@ def eval(self, expr, inplace=False, **kwargs):
31603183
kwargs['level'] = kwargs.pop('level', 0) + 1
31613184
if resolvers is None:
31623185
index_resolvers = self._get_index_resolvers()
3163-
resolvers = dict(self.iteritems()), index_resolvers
3186+
column_resolvers = \
3187+
self._get_space_character_free_column_resolvers()
3188+
resolvers = column_resolvers, index_resolvers
31643189
if 'target' not in kwargs:
31653190
kwargs['target'] = self
31663191
kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)

pandas/core/generic.py

+12
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,18 @@ def _get_index_resolvers(self):
423423
d.update(self._get_axis_resolvers(axis_name))
424424
return d
425425

426+
def _get_space_character_free_column_resolvers(self):
    """Build the column resolvers with space-free keys.

    Every ``(label, values)`` pair produced by ``self.iteritems()`` is
    stored under ``_remove_spaces_column_name(label)``, so that labels
    containing spaces can be referred to by backtick quoting.
    Used in :meth:`DataFrame.eval`.

    Returns
    -------
    dict
        Maps each cleaned-up label to the corresponding item.
    """
    # Imported inside the method, as in the original code.
    from pandas.core.computation.common import _remove_spaces_column_name

    resolvers = {}
    for label, values in self.iteritems():
        resolvers[_remove_spaces_column_name(label)] = values
    return resolvers
437+
426438
@property
427439
def _info_axis(self):
428440
return getattr(self, self._info_axis_name)

pandas/tests/frame/test_query_eval.py

+51
Original file line numberDiff line numberDiff line change
@@ -1031,3 +1031,54 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op):
10311031

10321032
with pytest.raises(TypeError, match=msg):
10331033
df.eval('a {0} b'.format(op), engine=engine, parser=parser)
1034+
1035+
1036+
class TestDataFrameQueryBacktickQuoting(object):
    """Tests for backtick quoted column names in ``query`` and ``eval``."""

    @pytest.fixture(scope='class')
    def df(self):
        # The labels cover: no space ('A'), spaces ('B B', 'C C'), an
        # underscore twin of a spaced label ('C_C' vs 'C C'), and a label
        # mixing underscores and spaces ('D_D D').
        yield DataFrame({'A': [1, 2, 3],
                         'B B': [3, 2, 1],
                         'C C': [4, 5, 6],
                         'C_C': [8, 9, 10],
                         'D_D D': [11, 1, 101]})

    def test_single_backtick_variable_query(self, df):
        res = df.query('1 < `B B`')
        expect = df[1 < df['B B']]
        assert_frame_equal(res, expect)

    def test_two_backtick_variables_query(self, df):
        res = df.query('1 < `B B` and 4 < `C C`')
        expect = df[(1 < df['B B']) & (4 < df['C C'])]
        assert_frame_equal(res, expect)

    def test_single_backtick_variable_expr(self, df):
        res = df.eval('A + `B B`')
        expect = df['A'] + df['B B']
        assert_series_equal(res, expect)

    def test_two_backtick_variables_expr(self, df):
        res = df.eval('`B B` + `C C`')
        expect = df['B B'] + df['C C']
        assert_series_equal(res, expect)

    def test_already_underscore_variable(self, df):
        res = df.eval('`C_C` + A')
        expect = df['C_C'] + df['A']
        assert_series_equal(res, expect)

    def test_same_name_but_underscores(self, df):
        # 'C_C' and 'C C' must resolve to two different columns.
        res = df.eval('C_C + `C C`')
        expect = df['C_C'] + df['C C']
        assert_series_equal(res, expect)

    def test_mixed_underscores_and_spaces(self, df):
        res = df.eval('A + `D_D D`')
        expect = df['A'] + df['D_D D']
        assert_series_equal(res, expect)

    def test_backtick_quote_name_with_no_spaces(self, df):
        # Previously named without the ``test_`` prefix, so pytest never
        # collected or ran this check.
        res = df.eval('A + `C_C`')
        expect = df['A'] + df['C_C']
        assert_series_equal(res, expect)

0 commit comments

Comments
 (0)