ENH: Quoting column names containing spaces with backticks to use them in query and eval. (#24955)

hwalinga · jreback · commit 02ada08e0cb0 · 2019-03-20T08:24:17.000-04:00
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -29,6 +29,7 @@ Other Enhancements
 - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
 - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
 - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
+- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)
 
 .. _whatsnew_0250.api_breaking:
 
diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py
@@ -1,9 +1,12 @@
 import numpy as np
 
-from pandas.compat import reduce
+from pandas.compat import reduce, string_types
 
 import pandas as pd
 
+# A token value Python's tokenizer probably will never use.
+_BACKTICK_QUOTED_STRING = 100
+
 
 def _ensure_decoded(s):
     """ if we have bytes, decode them to unicode """
@@ -22,5 +25,14 @@ def _result_type_many(*arrays_and_dtypes):
         return reduce(np.result_type, arrays_and_dtypes)
 
 
+def _remove_spaces_column_name(name):
+    """Check if name contains any spaces, if it contains any spaces
+    the spaces will be removed and an underscore suffix is added."""
+    if not isinstance(name, string_types) or " " not in name:
+        return name
+
+    return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"
+
+
 class NameResolutionError(NameError):
     pass
diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py
@@ -3,16 +3,20 @@
 
 import ast
 from functools import partial
+import itertools as it
+import operator
 import tokenize
 
 import numpy as np
 
-from pandas.compat import StringIO, lmap, reduce, string_types, zip
+from pandas.compat import StringIO, lmap, map, reduce, string_types, zip
 
 import pandas as pd
 from pandas import compat
 from pandas.core import common as com
 from pandas.core.base import StringMixin
+from pandas.core.computation.common import (
+    _BACKTICK_QUOTED_STRING, _remove_spaces_column_name)
 from pandas.core.computation.ops import (
     _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
     UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
@@ -31,7 +35,17 @@ def tokenize_string(source):
         A Python source code string
     """
     line_reader = StringIO(source).readline
-    for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
+    token_generator = tokenize.generate_tokens(line_reader)
+
+    # Loop over all tokens till a backtick (`) is found.
+    # Then, take all tokens till the next backtick to form a backtick quoted
+    # string.
+    for toknum, tokval, _, _, _ in token_generator:
+        if tokval == '`':
+            tokval = " ".join(it.takewhile(
+                lambda tokval: tokval != '`',
+                map(operator.itemgetter(1), token_generator)))
+            toknum = _BACKTICK_QUOTED_STRING
         yield toknum, tokval
 
 
@@ -102,6 +116,31 @@ def _replace_locals(tok):
     return toknum, tokval
 
 
+def _clean_spaces_backtick_quoted_names(tok):
+    """Clean up a column name if surrounded by backticks.
+
+    Backtick quoted string are indicated by a certain tokval value. If a string
+    is a backtick quoted token it will processed by
+    :func:`_remove_spaces_column_name` so that the parser can find this
+    string when the query is executed.
+    See also :meth:`NDFrame._get_space_character_free_column_resolver`.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input or token or the replacement values
+    """
+    toknum, tokval = tok
+    if toknum == _BACKTICK_QUOTED_STRING:
+        return tokenize.NAME, _remove_spaces_column_name(tokval)
+    return toknum, tokval
+
+
 def _compose2(f, g):
     """Compose 2 callables"""
     return lambda *args, **kwargs: f(g(*args, **kwargs))
@@ -114,7 +153,8 @@ def _compose(*funcs):
 
 
 def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
-                                 _rewrite_assign)):
+                                 _rewrite_assign,
+                                 _clean_spaces_backtick_quoted_names)):
     """Compose a collection of tokenization functions
 
     Parameters
@@ -711,8 +751,9 @@ def visitor(x, y):
 class PandasExprVisitor(BaseExprVisitor):
 
     def __init__(self, env, engine, parser,
-                 preparser=partial(_preparse, f=_compose(_replace_locals,
-                                                         _replace_booleans))):
+                 preparser=partial(_preparse, f=_compose(
+                     _replace_locals, _replace_booleans,
+                     _clean_spaces_backtick_quoted_names))):
         super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)
 
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2967,6 +2967,15 @@ def query(self, expr, inplace=False, **kwargs):
             The query string to evaluate.  You can refer to variables
             in the environment by prefixing them with an '@' character like
             ``@a + b``.
+
+            .. versionadded:: 0.25.0
+
+            You can refer to column names that contain spaces by surrounding
+            them in backticks.
+
+            For example, if one of your columns is called ``a a`` and you want
+            to sum it with ``b``, your query should be ```a a` + b``.
+
         inplace : bool
             Whether the query should modify the data in place or return
             a modified copy.
@@ -3025,23 +3034,37 @@ def query(self, expr, inplace=False, **kwargs):
 
         Examples
         --------
-        >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
+        >>> df = pd.DataFrame({'A': range(1, 6),
+        ...                    'B': range(10, 0, -2),
+        ...                    'C C': range(10, 5, -1)})
         >>> df
-           A   B
-        0  1  10
-        1  2   8
-        2  3   6
-        3  4   4
-        4  5   2
+           A   B  C C
+        0  1  10   10
+        1  2   8    9
+        2  3   6    8
+        3  4   4    7
+        4  5   2    6
         >>> df.query('A > B')
-           A  B
-        4  5  2
+           A  B  C C
+        4  5  2    6
 
         The previous expression is equivalent to
 
         >>> df[df.A > df.B]
-           A  B
-        4  5  2
+           A  B  C C
+        4  5  2    6
+
+        For columns with spaces in their name, you can use backtick quoting.
+
+        >>> df.query('B == `C C`')
+           A   B  C C
+        0  1  10   10
+
+        The previous expression is equivalent to
+
+        >>> df[df.B == df['C C']]
+           A   B  C C
+        0  1  10   10
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
         if not isinstance(expr, compat.string_types):
@@ -3160,7 +3183,9 @@ def eval(self, expr, inplace=False, **kwargs):
         kwargs['level'] = kwargs.pop('level', 0) + 1
         if resolvers is None:
             index_resolvers = self._get_index_resolvers()
-            resolvers = dict(self.iteritems()), index_resolvers
+            column_resolvers = \
+                self._get_space_character_free_column_resolvers()
+            resolvers = column_resolvers, index_resolvers
         if 'target' not in kwargs:
             kwargs['target'] = self
         kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -423,6 +423,18 @@ def _get_index_resolvers(self):
             d.update(self._get_axis_resolvers(axis_name))
         return d
 
+    def _get_space_character_free_column_resolvers(self):
+        """Return the space character free column resolvers of a dataframe.
+
+        Column names with spaces are 'cleaned up' so that they can be referred
+        to by backtick quoting.
+        Used in :meth:`DataFrame.eval`.
+        """
+        from pandas.core.computation.common import _remove_spaces_column_name
+
+        return {_remove_spaces_column_name(k): v for k, v
+                in self.iteritems()}
+
     @property
     def _info_axis(self):
         return getattr(self, self._info_axis_name)
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
@@ -1031,3 +1031,54 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op):
 
         with pytest.raises(TypeError, match=msg):
             df.eval('a {0} b'.format(op), engine=engine, parser=parser)
+
+
+class TestDataFrameQueryBacktickQuoting(object):
+
+    @pytest.fixture(scope='class')
+    def df(self):
+        yield DataFrame({'A': [1, 2, 3],
+                         'B B': [3, 2, 1],
+                         'C C': [4, 5, 6],
+                         'C_C': [8, 9, 10],
+                         'D_D D': [11, 1, 101]})
+
+    def test_single_backtick_variable_query(self, df):
+        res = df.query('1 < `B B`')
+        expect = df[1 < df['B B']]
+        assert_frame_equal(res, expect)
+
+    def test_two_backtick_variables_query(self, df):
+        res = df.query('1 < `B B` and 4 < `C C`')
+        expect = df[(1 < df['B B']) & (4 < df['C C'])]
+        assert_frame_equal(res, expect)
+
+    def test_single_backtick_variable_expr(self, df):
+        res = df.eval('A + `B B`')
+        expect = df['A'] + df['B B']
+        assert_series_equal(res, expect)
+
+    def test_two_backtick_variables_expr(self, df):
+        res = df.eval('`B B` + `C C`')
+        expect = df['B B'] + df['C C']
+        assert_series_equal(res, expect)
+
+    def test_already_underscore_variable(self, df):
+        res = df.eval('`C_C` + A')
+        expect = df['C_C'] + df['A']
+        assert_series_equal(res, expect)
+
+    def test_same_name_but_underscores(self, df):
+        res = df.eval('C_C + `C C`')
+        expect = df['C_C'] + df['C C']
+        assert_series_equal(res, expect)
+
+    def test_mixed_underscores_and_spaces(self, df):
+        res = df.eval('A + `D_D D`')
+        expect = df['A'] + df['D_D D']
+        assert_series_equal(res, expect)
+
+    def backtick_quote_name_with_no_spaces(self, df):
+        res = df.eval('A + `C_C`')
+        expect = df['A'] + df['C_C']
+        assert_series_equal(res, expect)