Skip to content

Commit b32e828

Browse files
committed
Merge branch 'main' into ref-dedup
2 parents 8fbce0b + 1044cf4 commit b32e828

File tree

9 files changed

+293
-48
lines changed

9 files changed

+293
-48
lines changed

.github/actions/build_pandas/action.yml

+7
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ runs:
2222
fi
2323
shell: bash -el {0}
2424

25+
- name: Uninstall nomkl
26+
run: |
27+
if conda list nomkl | grep nomkl 1>/dev/null; then
28+
conda remove nomkl -y
29+
fi
30+
shell: bash -el {0}
31+
2532
- name: Build Pandas
2633
run: |
2734
if [[ ${{ inputs.editable }} == "true" ]]; then

.pre-commit-config.yaml

+3-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ repos:
2323
hooks:
2424
- id: ruff
2525
args: [--exit-non-zero-on-fix]
26+
exclude: ^pandas/tests/frame/test_query_eval.py
2627
- id: ruff
2728
# TODO: remove autofix-only rules when they are checked by ruff
2829
name: ruff-selected-autofixes
@@ -31,7 +32,7 @@ repos:
3132
exclude: ^pandas/tests
3233
args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
3334
- id: ruff-format
34-
exclude: ^scripts
35+
exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
3536
- repo: https://github.com/jendrikseipp/vulture
3637
rev: 'v2.11'
3738
hooks:
@@ -85,6 +86,7 @@ repos:
8586
types: [text] # overwrite types: [rst]
8687
types_or: [python, rst]
8788
- id: rst-inline-touching-normal
89+
exclude: ^pandas/tests/frame/test_query_eval.py
8890
types: [text] # overwrite types: [rst]
8991
types_or: [python, rst]
9092
- repo: https://github.com/sphinx-contrib/sphinx-lint

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,7 @@ Other
685685
- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
686686
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
687687
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow the use of the ``tan`` function. (:issue:`55091`)
688+
- Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`)
688689
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
689690
- Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`)
690691
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)

pandas/core/computation/parsing.py

+115-18
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from __future__ import annotations
66

7+
from enum import Enum
78
from io import StringIO
89
from keyword import iskeyword
910
import token
@@ -32,13 +33,21 @@ def create_valid_python_identifier(name: str) -> str:
3233
------
3334
SyntaxError
3435
If the returned name is not a Python valid identifier, raise an exception.
35-
This can happen if there is a hashtag in the name, as the tokenizer will
36-
than terminate and not find the backtick.
37-
But also for characters that fall out of the range of (U+0001..U+007F).
3836
"""
3937
if name.isidentifier() and not iskeyword(name):
4038
return name
4139

40+
# Escape characters that fall outside the ASCII range (U+0001..U+007F).
41+
# GH 49633
42+
gen = (
43+
(c, "".join(chr(b) for b in c.encode("ascii", "backslashreplace")))
44+
for c in name
45+
)
46+
name = "".join(
47+
c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_")
48+
for c, c_escaped in gen
49+
)
50+
4251
# Create a dict with the special characters and their replacement string.
4352
# EXACT_TOKEN_TYPES contains these special characters
4453
# token.tok_name contains a readable description of the replacement string.
@@ -54,11 +63,10 @@ def create_valid_python_identifier(name: str) -> str:
5463
"$": "_DOLLARSIGN_",
5564
"€": "_EUROSIGN_",
5665
"°": "_DEGREESIGN_",
57-
# Including quotes works, but there are exceptions.
5866
"'": "_SINGLEQUOTE_",
5967
'"': "_DOUBLEQUOTE_",
60-
# Currently not possible. Terminates parser and won't find backtick.
61-
# "#": "_HASH_",
68+
"#": "_HASH_",
69+
"`": "_BACKTICK_",
6270
}
6371
)
6472

@@ -127,6 +135,9 @@ def clean_column_name(name: Hashable) -> Hashable:
127135
which is not caught and propagates to the user level.
128136
"""
129137
try:
138+
# Escape backticks
139+
name = name.replace("`", "``") if isinstance(name, str) else name
140+
130141
tokenized = tokenize_string(f"`{name}`")
131142
tokval = next(tokenized)[1]
132143
return create_valid_python_identifier(tokval)
@@ -168,6 +179,91 @@ def tokenize_backtick_quoted_string(
168179
return BACKTICK_QUOTED_STRING, source[string_start:string_end]
169180

170181

182+
class ParseState(Enum):
183+
DEFAULT = 0
184+
IN_BACKTICK = 1
185+
IN_SINGLE_QUOTE = 2
186+
IN_DOUBLE_QUOTE = 3
187+
188+
189+
def _split_by_backtick(s: str) -> list[tuple[bool, str]]:
190+
"""
191+
Splits a str into substrings along backtick characters (`).
192+
193+
Disregards backticks inside quotes.
194+
195+
Parameters
196+
----------
197+
s : str
198+
The Python source code string.
199+
200+
Returns
201+
-------
202+
substrings: list[tuple[bool, str]]
203+
List of tuples, where each tuple has two elements:
204+
The first is a boolean indicating if the substring is backtick-quoted.
205+
The second is the actual substring.
206+
"""
207+
substrings = []
208+
substr: list[str] = [] # Will join into a string before adding to `substrings`
209+
i = 0
210+
parse_state = ParseState.DEFAULT
211+
while i < len(s):
212+
char = s[i]
213+
214+
match char:
215+
case "`":
216+
# start of a backtick-quoted string
217+
if parse_state == ParseState.DEFAULT:
218+
if substr:
219+
substrings.append((False, "".join(substr)))
220+
221+
substr = [char]
222+
i += 1
223+
parse_state = ParseState.IN_BACKTICK
224+
continue
225+
226+
elif parse_state == ParseState.IN_BACKTICK:
227+
# escaped backtick inside a backtick-quoted string
228+
next_char = s[i + 1] if (i != len(s) - 1) else None
229+
if next_char == "`":
230+
substr.append(char)
231+
substr.append(next_char)
232+
i += 2
233+
continue
234+
235+
# end of the backtick-quoted string
236+
else:
237+
substr.append(char)
238+
substrings.append((True, "".join(substr)))
239+
240+
substr = []
241+
i += 1
242+
parse_state = ParseState.DEFAULT
243+
continue
244+
case "'":
245+
# start of a single-quoted string
246+
if parse_state == ParseState.DEFAULT:
247+
parse_state = ParseState.IN_SINGLE_QUOTE
248+
# end of a single-quoted string
249+
elif (parse_state == ParseState.IN_SINGLE_QUOTE) and (s[i - 1] != "\\"):
250+
parse_state = ParseState.DEFAULT
251+
case '"':
252+
# start of a double-quoted string
253+
if parse_state == ParseState.DEFAULT:
254+
parse_state = ParseState.IN_DOUBLE_QUOTE
255+
# end of a double-quoted string
256+
elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"):
257+
parse_state = ParseState.DEFAULT
258+
substr.append(char)
259+
i += 1
260+
261+
if substr:
262+
substrings.append((False, "".join(substr)))
263+
264+
return substrings
265+
266+
171267
def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
172268
"""
173269
Tokenize a Python source code string.
@@ -182,18 +278,19 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
182278
tok_generator : Iterator[Tuple[int, str]]
183279
An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
184280
"""
281+
# GH 59285
282+
# Escape characters, including backticks
283+
source = "".join(
284+
(
285+
create_valid_python_identifier(substring[1:-1])
286+
if is_backtick_quoted
287+
else substring
288+
)
289+
for is_backtick_quoted, substring in _split_by_backtick(source)
290+
)
291+
185292
line_reader = StringIO(source).readline
186293
token_generator = tokenize.generate_tokens(line_reader)
187294

188-
# Loop over all tokens till a backtick (`) is found.
189-
# Then, take all tokens till the next backtick to form a backtick quoted string
190-
for toknum, tokval, start, _, _ in token_generator:
191-
if tokval == "`":
192-
try:
193-
yield tokenize_backtick_quoted_string(
194-
token_generator, source, string_start=start[1] + 1
195-
)
196-
except Exception as err:
197-
raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
198-
else:
199-
yield toknum, tokval
295+
for toknum, tokval, _, _, _ in token_generator:
296+
yield toknum, tokval

pandas/core/frame.py

+3-11
Original file line numberDiff line numberDiff line change
@@ -4556,17 +4556,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
45564556
quoted string are replaced by strings that are allowed as a Python identifier.
45574557
These characters include all operators in Python, the space character, the
45584558
question mark, the exclamation mark, the dollar sign, and the euro sign.
4559-
For other characters that fall outside the ASCII range (U+0001..U+007F)
4560-
and those that are not further specified in PEP 3131,
4561-
the query parser will raise an error.
4562-
This excludes whitespace different than the space character,
4563-
but also the hashtag (as it is used for comments) and the backtick
4564-
itself (backtick can also not be escaped).
4565-
4566-
In a special case, quotes that make a pair around a backtick can
4567-
confuse the parser.
4568-
For example, ```it's` > `that's``` will raise an error,
4569-
as it forms a quoted string (``'s > `that'``) with a backtick inside.
4559+
4560+
A backtick inside a backtick-quoted name can be escaped by doubling it (two consecutive backticks).
45704561
45714562
See also the `Python documentation about lexical analysis
45724563
<https://docs.python.org/3/reference/lexical_analysis.html>`__
@@ -4620,6 +4611,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
46204611
raise ValueError(msg)
46214612
kwargs["level"] = kwargs.pop("level", 0) + 1
46224613
kwargs["target"] = None
4614+
46234615
res = self.eval(expr, **kwargs)
46244616

46254617
try:

0 commit comments

Comments
 (0)