Commit 3fc1bdd

ENH: Add ability to use special characters for column names in query function.
Cleans up the code that handles spaces in column names in the query function, and extends it so that special characters that are not allowed in Python identifiers can also be used. All code related to this functionality now lives in pandas/core/computation/parsing.py.
1 parent 50ae37d commit 3fc1bdd
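
In practice, the change lets DataFrame.query and DataFrame.eval reference such columns through backtick quoting. A short usage sketch (the frame and column names here are illustrative, not from the commit):

import pandas as pd

df = pd.DataFrame({"1st place": [1, 2], "a+b": [3, 4], "class": [5, 6]})

# Names that start with a digit, contain operator characters, or are
# Python keywords can now be backtick-quoted in query/eval expressions.
df.query("`1st place` > 1")
df.query("`a+b` == 4")
df.query("`class` < 6")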

File tree

8 files changed: +335, -100 lines changed

doc/source/whatsnew/v1.0.0.rst (+1)

@@ -1016,6 +1016,7 @@ Other
 - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
 - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
 - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
+- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used to use invalid identifiers like names that start with a digit, are python keywords, or are using single character operators. (:issue:`27017`)
 - Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`)
 - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
 - Fix :class:`AbstractHolidayCalendar` to return correct results for

pandas/core/computation/common.py (-14)

@@ -4,9 +4,6 @@
 
 from pandas._config import get_option
 
-# A token value Python's tokenizer probably will never use.
-_BACKTICK_QUOTED_STRING = 100
-
 
 def _ensure_decoded(s):
     """
@@ -29,16 +26,5 @@ def result_type_many(*arrays_and_dtypes):
     return reduce(np.result_type, arrays_and_dtypes)
 
 
-def _remove_spaces_column_name(name):
-    """
-    Check if name contains any spaces, if it contains any spaces
-    the spaces will be removed and an underscore suffix is added.
-    """
-    if not isinstance(name, str) or " " not in name:
-        return name
-
-    return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"
-
-
 class NameResolutionError(NameError):
     pass
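
For comparison, the removed helper only handled spaces: it rewrote a backtick-quoted name by replacing spaces with underscores and appending a sentinel suffix, leaving any other invalid character untouched. A minimal standalone sketch of that old behavior (copied from the removed function above):

def _remove_spaces_column_name(name):
    # Old behavior: only spaces were rewritten; other characters passed through.
    if not isinstance(name, str) or " " not in name:
        return name
    return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"

_remove_spaces_column_name("my col")  # 'my_col_BACKTICK_QUOTED_STRING'
_remove_spaces_column_name("a+b")     # 'a+b' -- still not a valid identifier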

pandas/core/computation/eval.py (+2, -1)

@@ -12,7 +12,8 @@
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.computation.engines import _engines
-from pandas.core.computation.expr import Expr, _parsers, tokenize_string
+from pandas.core.computation.expr import Expr, _parsers
+from pandas.core.computation.parsing import tokenize_string
 from pandas.core.computation.scope import ensure_scope
 
 from pandas.io.formats.printing import pprint_thing

pandas/core/computation/expr.py (+4, -68)

@@ -3,19 +3,13 @@
 
 import ast
 from functools import partial, reduce
-from io import StringIO
-import itertools as it
-import operator
+from keyword import iskeyword
 import tokenize
 from typing import Optional, Type
 
 import numpy as np
 
 import pandas.core.common as com
-from pandas.core.computation.common import (
-    _BACKTICK_QUOTED_STRING,
-    _remove_spaces_column_name,
-)
 from pandas.core.computation.ops import (
     _LOCAL_TAG,
     BinOp,
@@ -34,38 +28,12 @@
     _unary_ops_syms,
     is_term,
 )
+from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string
 from pandas.core.computation.scope import Scope
 
 import pandas.io.formats.printing as printing
 
 
-def tokenize_string(source: str):
-    """
-    Tokenize a Python source code string.
-
-    Parameters
-    ----------
-    source : str
-        A Python source code string
-    """
-    line_reader = StringIO(source).readline
-    token_generator = tokenize.generate_tokens(line_reader)
-
-    # Loop over all tokens till a backtick (`) is found.
-    # Then, take all tokens till the next backtick to form a backtick quoted
-    # string.
-    for toknum, tokval, _, _, _ in token_generator:
-        if tokval == "`":
-            tokval = " ".join(
-                it.takewhile(
-                    lambda tokval: tokval != "`",
-                    map(operator.itemgetter(1), token_generator),
-                )
-            )
-            toknum = _BACKTICK_QUOTED_STRING
-        yield toknum, tokval
-
-
 def _rewrite_assign(tok):
     """Rewrite the assignment operator for PyTables expressions that use ``=``
     as a substitute for ``==``.
@@ -133,31 +101,6 @@ def _replace_locals(tok):
     return toknum, tokval
 
 
-def _clean_spaces_backtick_quoted_names(tok):
-    """Clean up a column name if surrounded by backticks.
-
-    Backtick quoted string are indicated by a certain tokval value. If a string
-    is a backtick quoted token it will processed by
-    :func:`_remove_spaces_column_name` so that the parser can find this
-    string when the query is executed.
-    See also :meth:`NDFrame._get_space_character_free_column_resolver`.
-
-    Parameters
-    ----------
-    tok : tuple of int, str
-        ints correspond to the all caps constants in the tokenize module
-
-    Returns
-    -------
-    t : tuple of int, str
-        Either the input or token or the replacement values
-    """
-    toknum, tokval = tok
-    if toknum == _BACKTICK_QUOTED_STRING:
-        return tokenize.NAME, _remove_spaces_column_name(tokval)
-    return toknum, tokval
-
-
 def _compose2(f, g):
     """Compose 2 callables"""
     return lambda *args, **kwargs: f(g(*args, **kwargs))
@@ -172,10 +115,7 @@ def _compose(*funcs):
 def _preparse(
     source: str,
     f=_compose(
-        _replace_locals,
-        _replace_booleans,
-        _rewrite_assign,
-        _clean_spaces_backtick_quoted_names,
+        _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
     ),
 ):
     """Compose a collection of tokenization functions
@@ -426,8 +366,6 @@ def visit(self, node, **kwargs):
         try:
            node = ast.fix_missing_locations(ast.parse(clean))
         except SyntaxError as e:
-            from keyword import iskeyword
-
             if any(iskeyword(x) for x in clean.split()):
                 e.msg = "Python keyword not valid identifier in numexpr query"
             raise e
@@ -781,9 +719,7 @@ def __init__(
         parser,
         preparser=partial(
             _preparse,
-            f=_compose(
-                _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names
-            ),
+            f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
         ),
     ):
         super().__init__(env, engine, parser, preparser)
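
The preparser works by composing per-token rewrites and running them over Python's tokenizer output. A minimal sketch of that composition idea, reusing the `_compose2`/`_compose` helpers from this file (the `upper_names` transform is hypothetical, purely for illustration):

from functools import reduce
from io import StringIO
import tokenize

def _compose2(f, g):
    # Compose 2 callables.
    return lambda *args, **kwargs: f(g(*args, **kwargs))

def _compose(*funcs):
    # Compose 2 or more callables, applied right to left.
    return reduce(_compose2, funcs)

def upper_names(tok):
    # Hypothetical demo transform: upper-case every NAME token.
    toknum, tokval = tok
    return toknum, tokval.upper() if toknum == tokenize.NAME else tokval

# The real _preparse composes _replace_locals, _replace_booleans,
# _rewrite_assign and clean_backtick_quoted_toks in the same way.
f = _compose(upper_names)
toks = tokenize.generate_tokens(StringIO("a + b").readline)
print(tokenize.untokenize(f((t.type, t.string)) for t in toks))
# prints the rewritten source, roughly 'A +B' (untokenize respaces tokens)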

pandas/core/computation/parsing.py (new file, +190)

@@ -0,0 +1,190 @@
+""":func:`~pandas.eval` source string parsing functions
+"""
+
+from io import StringIO
+from keyword import iskeyword
+import token
+import tokenize
+from typing import Iterator, Tuple
+
+# A token value Python's tokenizer probably will never use.
+BACKTICK_QUOTED_STRING = 100
+
+
+def create_valid_python_identifier(name: str) -> str:
+    """
+    Create valid Python identifiers from any string.
+
+    Check if name contains any special characters. If it contains any
+    special characters, the special characters will be replaced by
+    a special string and a prefix is added.
+
+    Raises
+    ------
+    SyntaxError
+        If the returned name is not a valid Python identifier, raise an exception.
+        This can happen if there is a hashtag in the name, as the tokenizer will
+        then terminate and not find the backtick.
+        But also for characters that fall out of the range of (U+0001..U+007F).
+    """
+    if name.isidentifier() and not iskeyword(name):
+        return name
+
+    # Create a dict with the special characters and their replacement string.
+    # EXACT_TOKEN_TYPES contains these special characters
+    # token.tok_name contains a readable description of the replacement string.
+    special_characters_replacements = {
+        char: f"_{token.tok_name[tokval]}_"
+        # The ignore here is because of a bug in mypy that is resolved in 0.740
+        for char, tokval in tokenize.EXACT_TOKEN_TYPES.items()  # type: ignore
+    }
+    special_characters_replacements.update(
+        {
+            " ": "_",
+            "?": "_QUESTIONMARK_",
+            "!": "_EXCLAMATIONMARK_",
+            "$": "_DOLLARSIGN_",
+            "€": "_EUROSIGN_",
+            # Including quotes works, but there are exceptions.
+            "'": "_SINGLEQUOTE_",
+            '"': "_DOUBLEQUOTE_",
+            # Currently not possible. Terminates parser and won't find backtick.
+            # "#": "_HASH_",
+        }
+    )
+
+    name = "".join(special_characters_replacements.get(char, char) for char in name)
+    name = "BACKTICK_QUOTED_STRING_" + name
+
+    if not name.isidentifier():
+        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")
+
+    return name
+
+
+def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]:
+    """
+    Clean up a column name if surrounded by backticks.
+
+    Backtick quoted strings are indicated by a certain tokval value. If a string
+    is a backtick quoted token it will be processed by
+    :func:`_create_valid_python_identifier` so that the parser can find this
+    string when the query is executed.
+    In this case the tok will get the NAME tokval.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    tok : Tuple[int, str]
+        Either the input token or the replacement values
+    """
+    toknum, tokval = tok
+    if toknum == BACKTICK_QUOTED_STRING:
+        return tokenize.NAME, create_valid_python_identifier(tokval)
+    return toknum, tokval
+
+
+def clean_column_name(name: str) -> str:
+    """
+    Function to emulate the cleaning of a backtick quoted name.
+
+    The purpose for this function is to see what happens to the name of an
+    identifier if it goes through the process of being parsed as Python code
+    inside a backtick quoted string and then being cleaned
+    (removed of any special characters).
+
+    Parameters
+    ----------
+    name : str
+        Name to be cleaned.
+
+    Returns
+    -------
+    name : str
+        Returns the name after tokenizing and cleaning.
+
+    Notes
+    -----
+    For some cases, a name cannot be converted to a valid Python identifier.
+    In that case :func:`tokenize_string` raises a SyntaxError.
+    In that case, we just return the name unmodified.
+
+    If this name was used in the query string (this makes the query call impossible)
+    an error will be raised by :func:`tokenize_backtick_quoted_string` instead,
+    which is not caught and propagates to the user level.
+    """
+    try:
+        tokenized = tokenize_string(f"`{name}`")
+        tokval = next(tokenized)[1]
+        return create_valid_python_identifier(tokval)
+    except SyntaxError:
+        return name
+
+
+def tokenize_backtick_quoted_string(
+    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
+) -> Tuple[int, str]:
+    """
+    Creates a token from a backtick quoted string.
+
+    Moves the token_generator forwards till right after the next backtick.
+
+    Parameters
+    ----------
+    token_generator : Iterator[tokenize.TokenInfo]
+        The generator that yields the tokens of the source string (Tuple[int, str]).
+        The generator is at the first token after the backtick (`)
+
+    source : str
+        The Python source code string.
+
+    string_start : int
+        This is the start of the backtick quoted string inside the source string.
+
+    Returns
+    -------
+    tok : Tuple[int, str]
+        The token that represents the backtick quoted string.
+        The integer is equal to BACKTICK_QUOTED_STRING (100).
+    """
+    for _, tokval, start, _, _ in token_generator:
+        if tokval == "`":
+            string_end = start[1]
+            break
+
+    return BACKTICK_QUOTED_STRING, source[string_start:string_end]
+
+
+def tokenize_string(source: str) -> Iterator[Tuple[int, str]]:
+    """
+    Tokenize a Python source code string.
+
+    Parameters
+    ----------
+    source : str
+        The Python source code string.
+
+    Returns
+    -------
+    tok_generator : Iterator[Tuple[int, str]]
+        An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
+    """
+    line_reader = StringIO(source).readline
+    token_generator = tokenize.generate_tokens(line_reader)
+
+    # Loop over all tokens till a backtick (`) is found.
+    # Then, take all tokens till the next backtick to form a backtick quoted string
+    for toknum, tokval, start, _, _ in token_generator:
+        if tokval == "`":
+            try:
+                yield tokenize_backtick_quoted_string(
+                    token_generator, source, string_start=start[1] + 1
+                )
+            except Exception:
+                raise SyntaxError(f"Failed to parse backticks in '{source}'.")
+        else:
+            yield toknum, tokval
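
As a quick illustration of the cleaning this new module performs (a sketch; the expected outputs follow from the replacement table and the BACKTICK_QUOTED_STRING_ prefix above):

from pandas.core.computation.parsing import (
    clean_column_name,
    create_valid_python_identifier,
)

clean_column_name("1st place")           # 'BACKTICK_QUOTED_STRING_1st_place'
create_valid_python_identifier("a+b")    # 'BACKTICK_QUOTED_STRING_a_PLUS_b'
create_valid_python_identifier("class")  # 'BACKTICK_QUOTED_STRING_class' (keyword)
create_valid_python_identifier("plain")  # 'plain' (already a valid identifier)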
