Add function to clean up column names with special characters

hwalinga · hwalinga · commit 63feec6b00b3 · 2019-08-29T12:18:36.000+02:00
Created a tokenize function that does not surround operators with spaces

Fixed for keywords and words starting with digits

Created documentation
diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst
@@ -99,7 +99,7 @@ Other
 ^^^^^
 
 - Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`)
--
+- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used to refer to names that are not valid Python identifiers, such as names that start with a digit, are Python keywords, or contain single character operators.
 
 .. _whatsnew_0.252.contributors:
 
diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py
@@ -1,4 +1,7 @@
 from functools import reduce
+from keyword import iskeyword
+from token import tok_name
+from tokenize import EXACT_TOKEN_TYPES
 
 import numpy as np
 
@@ -25,13 +28,30 @@ def _result_type_many(*arrays_and_dtypes):
         return reduce(np.result_type, arrays_and_dtypes)
 
 
-def _remove_spaces_column_name(name):
-    """Check if name contains any spaces, if it contains any spaces
-    the spaces will be removed and an underscore suffix is added."""
-    if not isinstance(name, str) or " " not in name:
+def _clean_special_characters_column_name(name):
+    """Check if name contains any special characters, if it contains any
+    special characters, spaces and single character operators are replaced by
+    special strings and a ``BACKTICK_QUOTED_STRING_`` prefix is added."""
+    if not isinstance(name, str):
         return name
 
-    return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"
+    if name.isidentifier() and not iskeyword(name):
+        return name
+
+    # Create a dict mapping the special characters to their replacements.
+    # So far we only replace spaces and single character operators.
+    special_characters_replacements = {
+        " ": "_",
+        **{
+            char: "_" + tok_name[tokval] + "_"
+            for char, tokval in EXACT_TOKEN_TYPES.items()
+            if len(char) == 1
+        },
+    }
+
+    name = "".join(special_characters_replacements.get(char, char) for char in name)
+
+    return "BACKTICK_QUOTED_STRING_" + name
 
 
 class NameResolutionError(NameError):
diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py
@@ -6,6 +6,7 @@
 from io import StringIO
 import itertools as it
 import operator
+import token
 import tokenize
 from typing import Type
 
@@ -15,7 +16,7 @@
 from pandas.core import common as com
 from pandas.core.computation.common import (
     _BACKTICK_QUOTED_STRING,
-    _remove_spaces_column_name,
+    _clean_special_characters_column_name,
 )
 from pandas.core.computation.ops import (
     _LOCAL_TAG,
@@ -40,6 +41,24 @@
 import pandas.io.formats.printing as printing
 
 
+def tokenize_backtick_quoted_string(token_generator):
+    """Creates a token from a backtick quoted string.
+    Moves the token_generator forwards till right after the next backtick."""
+    prev_toknum = token.OP  # This will trigger the first token to have no space
+    new_tokval = ""
+    for toknum, tokval, _, _, _ in it.takewhile(
+        lambda tok: tok[1] != "`", token_generator
+    ):
+        # This check will ensure that operators will not be surrounded by spaces
+        if toknum == token.OP or prev_toknum == token.OP:
+            new_tokval += tokval
+        else:
+            new_tokval += " " + tokval
+        prev_toknum = toknum
+
+    return _BACKTICK_QUOTED_STRING, new_tokval
+
+
 def tokenize_string(source):
     """
     Tokenize a Python source code string.
@@ -57,14 +76,9 @@ def tokenize_string(source):
     # string.
     for toknum, tokval, _, _, _ in token_generator:
         if tokval == "`":
-            tokval = " ".join(
-                it.takewhile(
-                    lambda tokval: tokval != "`",
-                    map(operator.itemgetter(1), token_generator),
-                )
-            )
-            toknum = _BACKTICK_QUOTED_STRING
-        yield toknum, tokval
+            yield tokenize_backtick_quoted_string(token_generator)
+        else:
+            yield toknum, tokval
 
 
 def _rewrite_assign(tok):
@@ -134,14 +148,14 @@ def _replace_locals(tok):
     return toknum, tokval
 
 
-def _clean_spaces_backtick_quoted_names(tok):
+def _clean_backtick_quoted_names(tok):
     """Clean up a column name if surrounded by backticks.
 
     Backtick quoted string are indicated by a certain tokval value. If a string
     is a backtick quoted token it will processed by
-    :func:`_remove_spaces_column_name` so that the parser can find this
+    :func:`_clean_special_characters_column_name` so that the parser can find this
     string when the query is executed.
-    See also :meth:`NDFrame._get_space_character_free_column_resolver`.
+    See also :meth:`NDFrame._get_special_character_free_column_resolvers`.
 
     Parameters
     ----------
@@ -155,7 +169,7 @@ def _clean_spaces_backtick_quoted_names(tok):
     """
     toknum, tokval = tok
     if toknum == _BACKTICK_QUOTED_STRING:
-        return tokenize.NAME, _remove_spaces_column_name(tokval)
+        return tokenize.NAME, _clean_special_characters_column_name(tokval)
     return toknum, tokval
 
 
@@ -176,7 +190,7 @@ def _preparse(
         _replace_locals,
         _replace_booleans,
         _rewrite_assign,
-        _clean_spaces_backtick_quoted_names,
+        _clean_backtick_quoted_names,
     ),
 ):
     """Compose a collection of tokenization functions
@@ -789,7 +803,7 @@ def __init__(
         preparser=partial(
             _preparse,
             f=_compose(
-                _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names
+                _replace_locals, _replace_booleans, _clean_backtick_quoted_names
             ),
         ),
     ):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3137,8 +3137,13 @@ def query(self, expr, inplace=False, **kwargs):
 
             .. versionadded:: 0.25.0
 
-            You can refer to column names that contain spaces by surrounding
-            them in backticks.
+            You can refer to column names that contain spaces or operators by
+            surrounding them in backticks. This way you can also escape
+            names that start with a digit, or are a Python keyword. Basically,
+            this works whenever the name is not a valid Python identifier.
+
+            Note that this does not work if there are multiple invalid
+            characters next to each other, like in ``very*=invalid``.
 
             For example, if one of your columns is called ``a a`` and you want
             to sum it with ``b``, your query should be ```a a` + b``.
@@ -3346,7 +3351,7 @@ def eval(self, expr, inplace=False, **kwargs):
         kwargs["level"] = kwargs.pop("level", 0) + 1
         if resolvers is None:
             index_resolvers = self._get_index_resolvers()
-            column_resolvers = self._get_space_character_free_column_resolvers()
+            column_resolvers = self._get_special_character_free_column_resolvers()
             resolvers = column_resolvers, index_resolvers
         if "target" not in kwargs:
             kwargs["target"] = self
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -488,16 +488,16 @@ def _get_index_resolvers(self):
             d.update(self._get_axis_resolvers(axis_name))
         return d
 
-    def _get_space_character_free_column_resolvers(self):
-        """Return the space character free column resolvers of a dataframe.
+    def _get_special_character_free_column_resolvers(self):
+        """Return the special character free column resolvers of a dataframe.
 
-        Column names with spaces are 'cleaned up' so that they can be referred
-        to by backtick quoting.
+        Column names with special characters are 'cleaned up' so that they can
+        be referred to by backtick quoting.
         Used in :meth:`DataFrame.eval`.
         """
-        from pandas.core.computation.common import _remove_spaces_column_name
+        from pandas.core.computation.common import _clean_special_characters_column_name
 
-        return {_remove_spaces_column_name(k): v for k, v in self.items()}
+        return {_clean_special_characters_column_name(k): v for k, v in self.items()}
 
     @property
     def _info_axis(self):
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
@@ -1059,6 +1059,10 @@ def df(self):
                 "C C": [4, 5, 6],
                 "C_C": [8, 9, 10],
                 "D_D D": [11, 1, 101],
+                "E.E": [6, 3, 5],
+                "F-F": [8, 1, 10],
+                "1e1": [2, 4, 8],
+                "def": [10, 11, 2],
             }
         )
 
@@ -1101,3 +1105,23 @@ def backtick_quote_name_with_no_spaces(self, df):
         res = df.eval("A + `C_C`")
         expect = df["A"] + df["C_C"]
         assert_series_equal(res, expect)
+
+    def test_special_characters(self, df):
+        res = df.eval("`E.E` + `F-F` - A")
+        expect = df["E.E"] + df["F-F"] - df["A"]
+        assert_series_equal(res, expect)
+
+    def test_start_with_digit(self, df):
+        res = df.eval("A + `1e1`")
+        expect = df["A"] + df["1e1"]
+        assert_series_equal(res, expect)
+
+    def test_keyword(self, df):
+        res = df.eval("A + `def`")
+        expect = df["A"] + df["def"]
+        assert_series_equal(res, expect)
+
+    def unneeded_quoting(self, df):
+        res = df.query("`A` > 2")
+        expect = df[df["A"] > 2]
+        assert_series_equal(res, expect)