Skip to content

Commit 63feec6

Browse files
committed
Add function to clean up column names with special characters
Created a tokenize function that does not surround operators with spaces. Fixed for keywords and words starting with digits. Created documentation.
1 parent bc65fe6 commit 63feec6

File tree

6 files changed

+93
-30
lines changed

6 files changed

+93
-30
lines changed

doc/source/whatsnew/v0.25.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ Other
9999
^^^^^
100100

101101
- Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`)
102-
-
102+
- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used to refer to names that are not valid Python identifiers, such as names that start with a digit, are Python keywords, or contain single-character operators.
103103

104104
.. _whatsnew_0.252.contributors:
105105

pandas/core/computation/common.py

+25-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
from functools import reduce
2+
from keyword import iskeyword
3+
from token import tok_name
4+
from tokenize import EXACT_TOKEN_TYPES
25

36
import numpy as np
47

@@ -25,13 +28,30 @@ def _result_type_many(*arrays_and_dtypes):
2528
return reduce(np.result_type, arrays_and_dtypes)
2629

2730

28-
def _remove_spaces_column_name(name):
29-
"""Check if name contains any spaces, if it contains any spaces
30-
the spaces will be removed and an underscore suffix is added."""
31-
if not isinstance(name, str) or " " not in name:
31+
def _clean_special_characters_column_name(name):
32+
"""Check if name contains any special characters, if it contains any
33+
special characters the special characters will be replaced by a special
34+
string and an underscore suffix is added."""
35+
if not isinstance(name, str):
3236
return name
3337

34-
return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"
38+
if name.isidentifier() and not iskeyword(name):
39+
return name
40+
41+
# Create a mapping from each special character to its replacement.
42+
# So far we only replace single character operators.
43+
special_characters_replacements = {
44+
" ": "_",
45+
**{
46+
char: "_" + tok_name[tokval] + "_"
47+
for char, tokval in EXACT_TOKEN_TYPES.items()
48+
if len(char) == 1
49+
},
50+
}
51+
52+
name = "".join(special_characters_replacements.get(char, char) for char in name)
53+
54+
return "BACKTICK_QUOTED_STRING_" + name
3555

3656

3757
class NameResolutionError(NameError):

pandas/core/computation/expr.py

+29-15
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from io import StringIO
77
import itertools as it
88
import operator
9+
import token
910
import tokenize
1011
from typing import Type
1112

@@ -15,7 +16,7 @@
1516
from pandas.core import common as com
1617
from pandas.core.computation.common import (
1718
_BACKTICK_QUOTED_STRING,
18-
_remove_spaces_column_name,
19+
_clean_special_characters_column_name,
1920
)
2021
from pandas.core.computation.ops import (
2122
_LOCAL_TAG,
@@ -40,6 +41,24 @@
4041
import pandas.io.formats.printing as printing
4142

4243

44+
def tokenize_backtick_quoted_string(token_generator):
45+
"""Creates a token from a backtick quoted string.
46+
Moves the token_generator forwards till right after the next backtick."""
47+
prev_toknum = token.OP # This will trigger the first token to have no space
48+
new_tokval = ""
49+
for toknum, tokval, _, _, _ in it.takewhile(
50+
lambda tok: tok[1] != "`", token_generator
51+
):
52+
# This check will ensure that operators will not be surrounded by spaces
53+
if toknum == token.OP or prev_toknum == token.OP:
54+
new_tokval += tokval
55+
else:
56+
new_tokval += " " + tokval
57+
prev_toknum = toknum
58+
59+
return _BACKTICK_QUOTED_STRING, new_tokval
60+
61+
4362
def tokenize_string(source):
4463
"""
4564
Tokenize a Python source code string.
@@ -57,14 +76,9 @@ def tokenize_string(source):
5776
# string.
5877
for toknum, tokval, _, _, _ in token_generator:
5978
if tokval == "`":
60-
tokval = " ".join(
61-
it.takewhile(
62-
lambda tokval: tokval != "`",
63-
map(operator.itemgetter(1), token_generator),
64-
)
65-
)
66-
toknum = _BACKTICK_QUOTED_STRING
67-
yield toknum, tokval
79+
yield tokenize_backtick_quoted_string(token_generator)
80+
else:
81+
yield toknum, tokval
6882

6983

7084
def _rewrite_assign(tok):
@@ -134,14 +148,14 @@ def _replace_locals(tok):
134148
return toknum, tokval
135149

136150

137-
def _clean_spaces_backtick_quoted_names(tok):
151+
def _clean_backtick_quoted_names(tok):
138152
"""Clean up a column name if surrounded by backticks.
139153
140154
Backtick quoted strings are indicated by a certain toknum value. If a string
141155
is a backtick quoted token it will be processed by
142-
:func:`_remove_spaces_column_name` so that the parser can find this
156+
:func:`_clean_special_characters_column_name` so that the parser can find this
143157
string when the query is executed.
144-
See also :meth:`NDFrame._get_space_character_free_column_resolver`.
158+
See also :meth:`NDFrame._get_special_character_free_column_resolvers`.
145159
146160
Parameters
147161
----------
@@ -155,7 +169,7 @@ def _clean_spaces_backtick_quoted_names(tok):
155169
"""
156170
toknum, tokval = tok
157171
if toknum == _BACKTICK_QUOTED_STRING:
158-
return tokenize.NAME, _remove_spaces_column_name(tokval)
172+
return tokenize.NAME, _clean_special_characters_column_name(tokval)
159173
return toknum, tokval
160174

161175

@@ -176,7 +190,7 @@ def _preparse(
176190
_replace_locals,
177191
_replace_booleans,
178192
_rewrite_assign,
179-
_clean_spaces_backtick_quoted_names,
193+
_clean_backtick_quoted_names,
180194
),
181195
):
182196
"""Compose a collection of tokenization functions
@@ -789,7 +803,7 @@ def __init__(
789803
preparser=partial(
790804
_preparse,
791805
f=_compose(
792-
_replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names
806+
_replace_locals, _replace_booleans, _clean_backtick_quoted_names
793807
),
794808
),
795809
):

pandas/core/frame.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -3137,8 +3137,13 @@ def query(self, expr, inplace=False, **kwargs):
31373137
31383138
.. versionadded:: 0.25.0
31393139
3140-
You can refer to column names that contain spaces by surrounding
3141-
them in backticks.
3140+
You can refer to column names that contain spaces or operators by
3141+
surrounding them in backticks. This way you can also escape
3142+
names that start with a digit or are Python keywords; in short, any
3143+
name that is not a valid Python identifier.
3144+
3145+
NB. You cannot use this if there are multiple invalid characters
3146+
next to each other, like ``very*=invalid``.
31423147
31433148
For example, if one of your columns is called ``a a`` and you want
31443149
to sum it with ``b``, your query should be ```a a` + b``.
@@ -3346,7 +3351,7 @@ def eval(self, expr, inplace=False, **kwargs):
33463351
kwargs["level"] = kwargs.pop("level", 0) + 1
33473352
if resolvers is None:
33483353
index_resolvers = self._get_index_resolvers()
3349-
column_resolvers = self._get_space_character_free_column_resolvers()
3354+
column_resolvers = self._get_special_character_free_column_resolvers()
33503355
resolvers = column_resolvers, index_resolvers
33513356
if "target" not in kwargs:
33523357
kwargs["target"] = self

pandas/core/generic.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -488,16 +488,16 @@ def _get_index_resolvers(self):
488488
d.update(self._get_axis_resolvers(axis_name))
489489
return d
490490

491-
def _get_space_character_free_column_resolvers(self):
492-
"""Return the space character free column resolvers of a dataframe.
491+
def _get_special_character_free_column_resolvers(self):
492+
"""Return the special character free column resolvers of a dataframe.
493493
494-
Column names with spaces are 'cleaned up' so that they can be referred
495-
to by backtick quoting.
494+
Column names with special characters are 'cleaned up' so that they can
495+
be referred to by backtick quoting.
496496
Used in :meth:`DataFrame.eval`.
497497
"""
498-
from pandas.core.computation.common import _remove_spaces_column_name
498+
from pandas.core.computation.common import _clean_special_characters_column_name
499499

500-
return {_remove_spaces_column_name(k): v for k, v in self.items()}
500+
return {_clean_special_characters_column_name(k): v for k, v in self.items()}
501501

502502
@property
503503
def _info_axis(self):

pandas/tests/frame/test_query_eval.py

+24
Original file line numberDiff line numberDiff line change
@@ -1059,6 +1059,10 @@ def df(self):
10591059
"C C": [4, 5, 6],
10601060
"C_C": [8, 9, 10],
10611061
"D_D D": [11, 1, 101],
1062+
"E.E": [6, 3, 5],
1063+
"F-F": [8, 1, 10],
1064+
"1e1": [2, 4, 8],
1065+
"def": [10, 11, 2],
10621066
}
10631067
)
10641068

@@ -1101,3 +1105,23 @@ def backtick_quote_name_with_no_spaces(self, df):
11011105
res = df.eval("A + `C_C`")
11021106
expect = df["A"] + df["C_C"]
11031107
assert_series_equal(res, expect)
1108+
1109+
def test_special_characters(self, df):
1110+
res = df.eval("`E.E` + `F-F` - A")
1111+
expect = df["E.E"] + df["F-F"] - df["A"]
1112+
assert_series_equal(res, expect)
1113+
1114+
def test_start_with_digit(self, df):
1115+
res = df.eval("A + `1e1`")
1116+
expect = df["A"] + df["1e1"]
1117+
assert_series_equal(res, expect)
1118+
1119+
def test_keyword(self, df):
1120+
res = df.eval("A + `def`")
1121+
expect = df["A"] + df["def"]
1122+
assert_series_equal(res, expect)
1123+
1124+
def unneeded_quoting(self, df):
1125+
res = df.query("`A` > 2")
1126+
expect = df[df["A"] > 2]
1127+
assert_series_equal(res, expect)

0 commit comments

Comments
 (0)