
Add function to clean up column names with special characters #28215

Merged
merged 1 commit on Jan 4, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
@@ -1016,6 +1016,7 @@ Other
- Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
- :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
- Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used with invalid identifiers, such as names that start with a digit, are Python keywords, or contain single-character operators (:issue:`27017`)
- Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`)
- Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
- Fix :class:`AbstractHolidayCalendar` to return correct results for
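A short, hedged illustration of the new backtick behavior described in the entry above (the frame and column names are made up for this example):

import pandas as pd

df = pd.DataFrame({"1st place": [1, 2], "class": [3, 4], "a+b": [5, 6]})
# Names that start with a digit, are Python keywords, or contain
# single-character operators can now all be backtick-quoted:
df.query("`1st place` == 1 and `class` == 3 and `a+b` == 5")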
14 changes: 0 additions & 14 deletions pandas/core/computation/common.py
@@ -4,9 +4,6 @@

from pandas._config import get_option

# A token value Python's tokenizer probably will never use.
_BACKTICK_QUOTED_STRING = 100


def _ensure_decoded(s):
"""
@@ -29,16 +26,5 @@ def result_type_many(*arrays_and_dtypes):
return reduce(np.result_type, arrays_and_dtypes)


def _remove_spaces_column_name(name):
"""
Check if name contains any spaces, if it contains any spaces
the spaces will be removed and an underscore suffix is added.
"""
if not isinstance(name, str) or " " not in name:
return name

return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"


class NameResolutionError(NameError):
pass
3 changes: 2 additions & 1 deletion pandas/core/computation/eval.py
@@ -12,7 +12,8 @@
from pandas.util._validators import validate_bool_kwarg

from pandas.core.computation.engines import _engines
from pandas.core.computation.expr import Expr, _parsers, tokenize_string
from pandas.core.computation.expr import Expr, _parsers
from pandas.core.computation.parsing import tokenize_string
from pandas.core.computation.scope import ensure_scope

from pandas.io.formats.printing import pprint_thing
72 changes: 4 additions & 68 deletions pandas/core/computation/expr.py
@@ -3,19 +3,13 @@

import ast
from functools import partial, reduce
from io import StringIO
import itertools as it
import operator
from keyword import iskeyword
import tokenize
from typing import Optional, Type

import numpy as np

import pandas.core.common as com
from pandas.core.computation.common import (
_BACKTICK_QUOTED_STRING,
_remove_spaces_column_name,
)
from pandas.core.computation.ops import (
_LOCAL_TAG,
BinOp,
@@ -34,38 +28,12 @@
_unary_ops_syms,
is_term,
)
from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string
from pandas.core.computation.scope import Scope

import pandas.io.formats.printing as printing


def tokenize_string(source: str):
"""
Tokenize a Python source code string.

Parameters
----------
source : str
A Python source code string
"""
line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)

# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted
# string.
for toknum, tokval, _, _, _ in token_generator:
if tokval == "`":
tokval = " ".join(
it.takewhile(
lambda tokval: tokval != "`",
map(operator.itemgetter(1), token_generator),
)
)
toknum = _BACKTICK_QUOTED_STRING
yield toknum, tokval


def _rewrite_assign(tok):
"""Rewrite the assignment operator for PyTables expressions that use ``=``
as a substitute for ``==``.
@@ -133,31 +101,6 @@ def _replace_locals(tok):
return toknum, tokval


def _clean_spaces_backtick_quoted_names(tok):
"""Clean up a column name if surrounded by backticks.

Backtick quoted string are indicated by a certain tokval value. If a string
is a backtick quoted token it will processed by
:func:`_remove_spaces_column_name` so that the parser can find this
string when the query is executed.
See also :meth:`NDFrame._get_space_character_free_column_resolver`.

Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module

Returns
-------
t : tuple of int, str
Either the input or token or the replacement values
"""
toknum, tokval = tok
if toknum == _BACKTICK_QUOTED_STRING:
return tokenize.NAME, _remove_spaces_column_name(tokval)
return toknum, tokval


def _compose2(f, g):
"""Compose 2 callables"""
return lambda *args, **kwargs: f(g(*args, **kwargs))
@@ -172,10 +115,7 @@ def _compose(*funcs):
def _preparse(
source: str,
f=_compose(
_replace_locals,
_replace_booleans,
_rewrite_assign,
_clean_spaces_backtick_quoted_names,
_replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
),
):
"""Compose a collection of tokenization functions
@@ -426,8 +366,6 @@ def visit(self, node, **kwargs):
try:
node = ast.fix_missing_locations(ast.parse(clean))
except SyntaxError as e:
from keyword import iskeyword

if any(iskeyword(x) for x in clean.split()):
e.msg = "Python keyword not valid identifier in numexpr query"
raise e
@@ -781,9 +719,7 @@ def __init__(
parser,
preparser=partial(
_preparse,
f=_compose(
_replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names
),
f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
),
):
super().__init__(env, engine, parser, preparser)
190 changes: 190 additions & 0 deletions pandas/core/computation/parsing.py
@@ -0,0 +1,190 @@
""":func:`~pandas.eval` source string parsing functions
"""

from io import StringIO
from keyword import iskeyword
import token
import tokenize
from typing import Iterator, Tuple

# A token value Python's tokenizer probably will never use.
BACKTICK_QUOTED_STRING = 100


def create_valid_python_identifier(name: str) -> str:
"""
Create valid Python identifiers from any string.

Check if name contains any special characters. If it contains any
special characters, each special character will be replaced by
a descriptive string and a prefix is added.

Raises
------
SyntaxError
If the returned name is not a valid Python identifier, raise an exception.
This can happen if there is a hashtag in the name, as the tokenizer will
then terminate and not find the backtick.
It can also happen for characters that fall outside the range (U+0001..U+007F).
"""
if name.isidentifier() and not iskeyword(name):
return name

# Create a dict with the special characters and their replacement string.
# EXACT_TOKEN_TYPES contains these special characters
# token.tok_name contains a readable description of the replacement string.
special_characters_replacements = {
char: f"_{token.tok_name[tokval]}_"
# The ignore here is because of a bug in mypy that is resolved in 0.740
for char, tokval in tokenize.EXACT_TOKEN_TYPES.items() # type: ignore
}
special_characters_replacements.update(
{
" ": "_",
"?": "_QUESTIONMARK_",
"!": "_EXCLAMATIONMARK_",
"$": "_DOLLARSIGN_",
"€": "_EUROSIGN_",
# Including quotes works, but there are exceptions.
"'": "_SINGLEQUOTE_",
'"': "_DOUBLEQUOTE_",
# Currently not possible. Terminates parser and won't find backtick.
# "#": "_HASH_",
}
)

name = "".join(special_characters_replacements.get(char, char) for char in name)
name = "BACKTICK_QUOTED_STRING_" + name

if not name.isidentifier():
raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")

return name
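
# Example (illustrative): given the replacement table above, spaces become
# underscores and "!" maps to its descriptive name:
# >>> create_valid_python_identifier("1st place!")
# 'BACKTICK_QUOTED_STRING_1st_place_EXCLAMATIONMARK_'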


def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]:
"""
Clean up a column name if surrounded by backticks.

Backtick quoted strings are indicated by a certain tokval value. If a string
is a backtick quoted token it will be processed by
:func:`create_valid_python_identifier` so that the parser can find this
string when the query is executed.
In this case the token will get the NAME toknum.

Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module

Returns
-------
tok : Tuple[int, str]
Either the input token or the replacement values.
"""
toknum, tokval = tok
if toknum == BACKTICK_QUOTED_STRING:
return tokenize.NAME, create_valid_python_identifier(tokval)
return toknum, tokval
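
# Example (illustrative): a backtick quoted token is rewritten to a NAME token;
# any other token passes through unchanged:
# >>> clean_backtick_quoted_toks((BACKTICK_QUOTED_STRING, "1st place"))
# returns (tokenize.NAME, 'BACKTICK_QUOTED_STRING_1st_place')
# >>> clean_backtick_quoted_toks((tokenize.NUMBER, "1"))
# returns (tokenize.NUMBER, '1')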


def clean_column_name(name: str) -> str:
"""
Function to emulate the cleaning of a backtick quoted name.

The purpose of this function is to see what happens to the name of
an identifier if it goes through the process of being parsed as Python code
inside a backtick quoted string and then being cleaned
(stripped of any special characters).

Parameters
----------
name : str
Name to be cleaned.

Returns
-------
name : str
Returns the name after tokenizing and cleaning.

Notes
-----
In some cases, a name cannot be converted to a valid Python identifier.
In that case :func:`tokenize_string` raises a SyntaxError,
and we just return the name unmodified.

If such a name is used in the query string (which makes the query call impossible),
an error will be raised by :func:`tokenize_backtick_quoted_string` instead,
which is not caught and propagates to the user level.
"""
try:
tokenized = tokenize_string(f"`{name}`")
tokval = next(tokenized)[1]
return create_valid_python_identifier(tokval)
except SyntaxError:
return name
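
# Example (illustrative): round-tripping a column name through the tokenizer:
# >>> clean_column_name("a+b")
# 'BACKTICK_QUOTED_STRING_a_PLUS_b'
# A name that cannot be tokenized (e.g. one containing "#") is returned unchanged.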


def tokenize_backtick_quoted_string(
token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> Tuple[int, str]:
"""
[Contributor review comment: can you add Parameters / Returns here]
Creates a token from a backtick quoted string.

Moves the token_generator forward until right after the next backtick.

Parameters
----------
token_generator : Iterator[tokenize.TokenInfo]
The generator that yields the tokens of the source string (Tuple[int, str]).
The generator is positioned at the first token after the opening backtick (`).

source : str
The Python source code string.

string_start : int
This is the start of the backtick quoted string inside the source string.

Returns
-------
tok : Tuple[int, str]
The token that represents the backtick quoted string.
The integer is equal to BACKTICK_QUOTED_STRING (100).
"""
for _, tokval, start, _, _ in token_generator:
if tokval == "`":
string_end = start[1]
break

return BACKTICK_QUOTED_STRING, source[string_start:string_end]
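
# Example (illustrative): for source = "`a b` + 1" with the generator positioned
# just after the opening backtick (string_start=1), the closing backtick is found
# at column 4, so the function returns (BACKTICK_QUOTED_STRING, 'a b').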


def tokenize_string(source: str) -> Iterator[Tuple[int, str]]:
"""
Tokenize a Python source code string.

Parameters
----------
source : str
The Python source code string.

Returns
-------
tok_generator : Iterator[Tuple[int, str]]
An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
"""
[Contributor review comment: can you add Returns]

line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)

# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted string
for toknum, tokval, start, _, _ in token_generator:
if tokval == "`":
try:
yield tokenize_backtick_quoted_string(
token_generator, source, string_start=start[1] + 1
)
except Exception:
raise SyntaxError(f"Failed to parse backticks in '{source}'.")
else:
yield toknum, tokval
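
# Example (illustrative): tokenizing a query containing a backtick quoted name:
# >>> list(tokenize_string("`a b` + 1"))
# yields (BACKTICK_QUOTED_STRING, 'a b') first, followed by the regular
# toknum/tokval pairs for '+', '1', NEWLINE and ENDMARKER.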