
Commit fffb978

hwalinga authored and jreback committed
ENH: Add ability to use special characters for column names in query function. (#28215)
1 parent 06c5d24 commit fffb978

File tree: 8 files changed, +335 -100 lines changed


doc/source/whatsnew/v1.0.0.rst

+1

@@ -1016,6 +1016,7 @@ Other
 - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
 - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
 - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
+- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used for invalid identifiers, such as names that start with a digit, are Python keywords, or use single-character operators. (:issue:`27017`)
 - Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`)
 - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
 - Fix :class:`AbstractHolidayCalendar` to return correct results for
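
To make the new whatsnew entry concrete, a minimal usage sketch (the column names here are illustrative, not taken from the commit):

import pandas as pd

df = pd.DataFrame({
    "1st period": [1, 2, 3],   # starts with a digit
    "class": ["a", "b", "c"],  # Python keyword
    "x+y": [10, 20, 30],       # contains a single-character operator
})

# After this change, all three names can be backtick-quoted in query/eval:
result = df.query("`1st period` > 1 and `class` == 'b' and `x+y` < 30")
print(result)  # the row where "1st period" == 2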

pandas/core/computation/common.py

-14

@@ -4,9 +4,6 @@
 
 from pandas._config import get_option
 
-# A token value Python's tokenizer probably will never use.
-_BACKTICK_QUOTED_STRING = 100
-
 
 def _ensure_decoded(s):
     """
@@ -29,16 +26,5 @@ def result_type_many(*arrays_and_dtypes):
     return reduce(np.result_type, arrays_and_dtypes)
 
 
-def _remove_spaces_column_name(name):
-    """
-    Check if name contains any spaces, if it contains any spaces
-    the spaces will be removed and an underscore suffix is added.
-    """
-    if not isinstance(name, str) or " " not in name:
-        return name
-
-    return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"
-
-
 class NameResolutionError(NameError):
     pass
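
For contrast, the removed helper only handled spaces; a short sketch (body verbatim from the deleted code) shows the limitation this commit lifts:

def _remove_spaces_column_name(name):
    # Old behavior: only spaces were mangled, plus a trailing marker suffix.
    if not isinstance(name, str) or " " not in name:
        return name
    return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"

print(_remove_spaces_column_name("my col"))  # my_col_BACKTICK_QUOTED_STRING
print(_remove_spaces_column_name("a+b"))     # a+b  (operators were not handled)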

pandas/core/computation/eval.py

+2 -1

@@ -12,7 +12,8 @@
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.computation.engines import _engines
-from pandas.core.computation.expr import Expr, _parsers, tokenize_string
+from pandas.core.computation.expr import Expr, _parsers
+from pandas.core.computation.parsing import tokenize_string
 from pandas.core.computation.scope import ensure_scope
 
 from pandas.io.formats.printing import pprint_thing

pandas/core/computation/expr.py

+4 -68

@@ -3,19 +3,13 @@
 
 import ast
 from functools import partial, reduce
-from io import StringIO
-import itertools as it
-import operator
+from keyword import iskeyword
 import tokenize
 from typing import Optional, Type
 
 import numpy as np
 
 import pandas.core.common as com
-from pandas.core.computation.common import (
-    _BACKTICK_QUOTED_STRING,
-    _remove_spaces_column_name,
-)
 from pandas.core.computation.ops import (
     _LOCAL_TAG,
     BinOp,
@@ -34,38 +28,12 @@
     _unary_ops_syms,
     is_term,
 )
+from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string
 from pandas.core.computation.scope import Scope
 
 import pandas.io.formats.printing as printing
 
 
-def tokenize_string(source: str):
-    """
-    Tokenize a Python source code string.
-
-    Parameters
-    ----------
-    source : str
-        A Python source code string
-    """
-    line_reader = StringIO(source).readline
-    token_generator = tokenize.generate_tokens(line_reader)
-
-    # Loop over all tokens till a backtick (`) is found.
-    # Then, take all tokens till the next backtick to form a backtick quoted
-    # string.
-    for toknum, tokval, _, _, _ in token_generator:
-        if tokval == "`":
-            tokval = " ".join(
-                it.takewhile(
-                    lambda tokval: tokval != "`",
-                    map(operator.itemgetter(1), token_generator),
-                )
-            )
-            toknum = _BACKTICK_QUOTED_STRING
-        yield toknum, tokval
-
-
 def _rewrite_assign(tok):
     """Rewrite the assignment operator for PyTables expressions that use ``=``
     as a substitute for ``==``.
@@ -133,31 +101,6 @@ def _replace_locals(tok):
     return toknum, tokval
 
 
-def _clean_spaces_backtick_quoted_names(tok):
-    """Clean up a column name if surrounded by backticks.
-
-    Backtick quoted string are indicated by a certain tokval value. If a string
-    is a backtick quoted token it will processed by
-    :func:`_remove_spaces_column_name` so that the parser can find this
-    string when the query is executed.
-    See also :meth:`NDFrame._get_space_character_free_column_resolver`.
-
-    Parameters
-    ----------
-    tok : tuple of int, str
-        ints correspond to the all caps constants in the tokenize module
-
-    Returns
-    -------
-    t : tuple of int, str
-        Either the input or token or the replacement values
-    """
-    toknum, tokval = tok
-    if toknum == _BACKTICK_QUOTED_STRING:
-        return tokenize.NAME, _remove_spaces_column_name(tokval)
-    return toknum, tokval
-
-
 def _compose2(f, g):
     """Compose 2 callables"""
     return lambda *args, **kwargs: f(g(*args, **kwargs))
@@ -172,10 +115,7 @@ def _compose(*funcs):
 def _preparse(
     source: str,
     f=_compose(
-        _replace_locals,
-        _replace_booleans,
-        _rewrite_assign,
-        _clean_spaces_backtick_quoted_names,
+        _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
     ),
 ):
     """Compose a collection of tokenization functions
@@ -426,8 +366,6 @@ def visit(self, node, **kwargs):
         try:
             node = ast.fix_missing_locations(ast.parse(clean))
         except SyntaxError as e:
-            from keyword import iskeyword
-
             if any(iskeyword(x) for x in clean.split()):
                 e.msg = "Python keyword not valid identifier in numexpr query"
             raise e
@@ -781,9 +719,7 @@ def __init__(
         parser,
         preparser=partial(
             _preparse,
-            f=_compose(
-                _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names
-            ),
+            f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
        ),
     ):
         super().__init__(env, engine, parser, preparser)
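
For context on the _preparse change above: each (toknum, tokval) pair is threaded through a right-to-left composition of token rewriters. A minimal standalone sketch of that pattern (simplified names, not the pandas internals; replace_booleans here mimics what pandas' _replace_booleans does):

from functools import reduce
from io import StringIO
import tokenize

def compose2(f, g):
    # compose2(f, g)(x) == f(g(x))
    return lambda *args, **kwargs: f(g(*args, **kwargs))

def compose(*funcs):
    # Apply right to left, like pandas' _compose.
    assert len(funcs) > 1, "At least 2 callables must be passed to compose"
    return reduce(compose2, funcs)

def replace_booleans(tok):
    # Rewrite "&"/"|" operators into "and"/"or" names.
    toknum, tokval = tok
    if toknum == tokenize.OP and tokval in ("&", "|"):
        return tokenize.NAME, "and" if tokval == "&" else "or"
    return toknum, tokval

def identity(tok):
    # Stand-in for the other rewriters in the real pipeline.
    return tok

pipeline = compose(identity, replace_booleans)
tokens = tokenize.generate_tokens(StringIO("a & b").readline)
rewritten = [pipeline((toknum, tokval)) for toknum, tokval, *_ in tokens]
print(tokenize.untokenize(rewritten))  # -> "a and b" (modulo spacing)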

pandas/core/computation/parsing.py

+190

@@ -0,0 +1,190 @@
+""":func:`~pandas.eval` source string parsing functions
+"""
+
+from io import StringIO
+from keyword import iskeyword
+import token
+import tokenize
+from typing import Iterator, Tuple
+
+# A token value Python's tokenizer probably will never use.
+BACKTICK_QUOTED_STRING = 100
+
+
+def create_valid_python_identifier(name: str) -> str:
+    """
+    Create valid Python identifiers from any string.
+
+    Check if name contains any special characters. If it contains any
+    special characters, the special characters will be replaced by
+    a special string and a prefix is added.
+
+    Raises
+    ------
+    SyntaxError
+        If the returned name is not a valid Python identifier, raise an exception.
+        This can happen if there is a hashtag in the name, as the tokenizer will
+        then terminate and not find the backtick.
+        But also for characters that fall out of the range of (U+0001..U+007F).
+    """
+    if name.isidentifier() and not iskeyword(name):
+        return name
+
+    # Create a dict with the special characters and their replacement string.
+    # EXACT_TOKEN_TYPES contains these special characters
+    # token.tok_name contains a readable description of the replacement string.
+    special_characters_replacements = {
+        char: f"_{token.tok_name[tokval]}_"
+        # The ignore here is because of a bug in mypy that is resolved in 0.740
+        for char, tokval in tokenize.EXACT_TOKEN_TYPES.items()  # type: ignore
+    }
+    special_characters_replacements.update(
+        {
+            " ": "_",
+            "?": "_QUESTIONMARK_",
+            "!": "_EXCLAMATIONMARK_",
+            "$": "_DOLLARSIGN_",
+            "€": "_EUROSIGN_",
+            # Including quotes works, but there are exceptions.
+            "'": "_SINGLEQUOTE_",
+            '"': "_DOUBLEQUOTE_",
+            # Currently not possible. Terminates parser and won't find backtick.
+            # "#": "_HASH_",
+        }
+    )
+
+    name = "".join(special_characters_replacements.get(char, char) for char in name)
+    name = "BACKTICK_QUOTED_STRING_" + name
+
+    if not name.isidentifier():
+        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")
+
+    return name
+
+
+def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]:
+    """
+    Clean up a column name if surrounded by backticks.
+
+    Backtick quoted strings are indicated by a certain tokval value. If a string
+    is a backtick quoted token it will be processed by
+    :func:`create_valid_python_identifier` so that the parser can find this
+    string when the query is executed.
+    In this case the tok will get the NAME tokval.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    tok : Tuple[int, str]
+        Either the input token or the replacement values
+    """
+    toknum, tokval = tok
+    if toknum == BACKTICK_QUOTED_STRING:
+        return tokenize.NAME, create_valid_python_identifier(tokval)
+    return toknum, tokval
+
+
+def clean_column_name(name: str) -> str:
+    """
+    Function to emulate the cleaning of a backtick quoted name.
+
+    The purpose of this function is to see what happens to the name of an
+    identifier if it goes through the process of being parsed as Python code
+    inside a backtick quoted string and then being cleaned
+    (stripped of any special characters).
+
+    Parameters
+    ----------
+    name : str
+        Name to be cleaned.
+
+    Returns
+    -------
+    name : str
+        Returns the name after tokenizing and cleaning.
+
+    Notes
+    -----
+    For some cases, a name cannot be converted to a valid Python identifier.
+    In that case :func:`tokenize_string` raises a SyntaxError.
+    In that case, we just return the name unmodified.
+
+    If this name was used in the query string (this makes the query call impossible)
+    an error will be raised by :func:`tokenize_backtick_quoted_string` instead,
+    which is not caught and propagates to the user level.
+    """
+    try:
+        tokenized = tokenize_string(f"`{name}`")
+        tokval = next(tokenized)[1]
+        return create_valid_python_identifier(tokval)
+    except SyntaxError:
+        return name
+
+
+def tokenize_backtick_quoted_string(
+    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
+) -> Tuple[int, str]:
+    """
+    Creates a token from a backtick quoted string.
+
+    Moves the token_generator forwards till right after the next backtick.
+
+    Parameters
+    ----------
+    token_generator : Iterator[tokenize.TokenInfo]
+        The generator that yields the tokens of the source string (Tuple[int, str]).
+        The generator is at the first token after the backtick (`)
+
+    source : str
+        The Python source code string.
+
+    string_start : int
+        This is the start of the backtick quoted string inside the source string.
+
+    Returns
+    -------
+    tok : Tuple[int, str]
+        The token that represents the backtick quoted string.
+        The integer is equal to BACKTICK_QUOTED_STRING (100).
+    """
+    for _, tokval, start, _, _ in token_generator:
+        if tokval == "`":
+            string_end = start[1]
+            break
+
+    return BACKTICK_QUOTED_STRING, source[string_start:string_end]
+
+
+def tokenize_string(source: str) -> Iterator[Tuple[int, str]]:
+    """
+    Tokenize a Python source code string.
+
+    Parameters
+    ----------
+    source : str
+        The Python source code string.
+
+    Returns
+    -------
+    tok_generator : Iterator[Tuple[int, str]]
+        An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
+    """
+    line_reader = StringIO(source).readline
+    token_generator = tokenize.generate_tokens(line_reader)
+
+    # Loop over all tokens till a backtick (`) is found.
+    # Then, take all tokens till the next backtick to form a backtick quoted string
+    for toknum, tokval, start, _, _ in token_generator:
+        if tokval == "`":
+            try:
+                yield tokenize_backtick_quoted_string(
+                    token_generator, source, string_start=start[1] + 1
+                )
+            except Exception:
+                raise SyntaxError(f"Failed to parse backticks in '{source}'.")
+        else:
+            yield toknum, tokval
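
A quick demonstration of what the new mangling produces, using the functions added above (outputs traced by hand from the code; the prefix keeps the result a valid identifier even when the name starts with a digit):

from pandas.core.computation.parsing import (
    clean_column_name,
    create_valid_python_identifier,
)

# Spaces map to "_" and a disambiguating prefix is added, which also
# rescues names that start with a digit:
print(create_valid_python_identifier("1st day"))
# BACKTICK_QUOTED_STRING_1st_day

# Exact operator tokens are replaced via token.tok_name, e.g. "+" -> "PLUS":
print(clean_column_name("a+b"))
# BACKTICK_QUOTED_STRING_a_PLUS_b

# Names that are already valid identifiers pass through unchanged:
print(clean_column_name("price"))
# price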
