From ad0e9b8b5968f2c06b12569067395e3fbed5ed0d Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 21 Jul 2024 16:42:25 +0200 Subject: [PATCH 01/32] escape unescape sharp, single quote, double quote --- pandas/core/frame.py | 41 ++++++++++++++----- pandas/tests/computation/test_eval.py | 58 +++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b8039746d9952..364bcd034b42c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -34,6 +34,7 @@ cast, overload, ) +import urllib.parse import warnings import numpy as np @@ -4559,14 +4560,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No For other characters that fall outside the ASCII range (U+0001..U+007F) and those that are not further specified in PEP 3131, the query parser will raise an error. - This excludes whitespace different than the space character, - but also the hashtag (as it is used for comments) and the backtick - itself (backtick can also not be escaped). - - In a special case, quotes that make a pair around a backtick can - confuse the parser. - For example, ```it's` > `that's``` will raise an error, - as it forms a quoted string (``'s > `that'``) with a backtick inside. + This excludes whitespace different than the space character + and the backtick itself (backtick cannot be escaped). See also the `Python documentation about lexical analysis `__ @@ -4620,7 +4615,35 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No raise ValueError(msg) kwargs["level"] = kwargs.pop("level", 0) + 1 kwargs["target"] = None - res = self.eval(expr, **kwargs) + + # GH 59285 + if any(("#" in col) or ("'" in col) or ('"' in col) for col in self.columns): + # Create a copy of `self` with column names escaped + escaped_self = self.copy() + escaped_self.columns = [ + urllib.parse.quote(col) for col in escaped_self.columns + ] + + # In expr, escape column names between backticks + column_name_to_escaped_name = { + col: urllib.parse.quote(col) for col in self.columns + } + escaped_expr = "`".join( + (column_name_to_escaped_name.get(token, token) if (i % 2) else token) + for i, token in enumerate(expr.split("`")) + ) + + # eval + escaped_res = escaped_self.eval(escaped_expr, **kwargs) + + # If `res` is a Series or DataFrame, unescape names + res = escaped_res.copy() + if isinstance(res, Series) and res.name: + res.name = urllib.parse.unquote(res.name) + elif isinstance(res, DataFrame): + res.columns = [urllib.parse.unquote(col) for col in res.columns] + else: + res = self.eval(expr, **kwargs) try: result = self.loc[res] diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 31d568d7c1e0c..e6edbd418837f 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1978,6 +1978,64 @@ def test_eval_no_support_column_name(request, column): tm.assert_frame_equal(result, expected) +def test_query_on_column_name_with_hashtag_character(): + # GH 59285 + df = DataFrame((1, 2, 3), columns=["a#"]) + result = df.query("`a#` < 2") + expected = df[df["a#"] < 2] + tm.assert_frame_equal(result, expected) + + +def test_query_on_expr_with_comment(): + # GH 59285 + df = DataFrame((1, 2, 3), columns=["a#"]) + result = df.query("`a#` < 2 # This is a comment") + expected = df[df["a#"] < 2] + tm.assert_frame_equal(result, expected) + + +def test_query_on_column_names_with_single_quote_character(): + df = DataFrame( + [ + {"it's": 1, "that's": 2}, + {"it's": 3, "that's": 4}, + {"it's": -1, "that's": -2}, + {"it's": -3, "that's": -4}, + ] + ) + result = df.query("`it's` < `that's`") + expected = df[df["it's"] < df["that's"]] + tm.assert_frame_equal(result, expected) + + +def test_query_on_column_names_with_double_quote_character(): + df = DataFrame( + [ + {'it"s': 1, 'that"s': 2}, + {'it"s': 3, 'that"s': 4}, + {'it"s': -1, 'that"s': -2}, + {'it"s': -3, 'that"s': -4}, + ] + ) + result = df.query('`it"s` < `that"s`') + expected = df[df['it"s'] < df['that"s']] + tm.assert_frame_equal(result, expected) + + +def test_query_on_column_names_with_single_quote_and_double_quote_character(): + df = DataFrame( + [ + {"it's": 1, 'that\'s "nice"': 2}, + {"it's": 3, 'that\'s "nice"': 4}, + {"it's": -1, 'that\'s "nice"': -2}, + {"it's": -3, 'that\'s "nice"': -4}, + ] + ) + result = df.query("`it's` < `that's \"nice\"`") + expected = df[df["it's"] < df['that\'s "nice"']] + tm.assert_frame_equal(result, expected) + + def test_set_inplace(): # https://github.com/pandas-dev/pandas/issues/47449 # Ensure we don't only update the DataFrame inplace, but also the actual From 41fb2c80e873e50e9c9767feaf4e1e5c818d5943 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 21 Jul 2024 16:53:20 +0200 Subject: [PATCH 02/32] parametrize and add tests --- pandas/tests/computation/test_eval.py | 52 +++++++++------------------ 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index e6edbd418837f..3e7fc62c4b48c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1994,45 +1994,27 @@ def test_query_on_expr_with_comment(): tm.assert_frame_equal(result, expected) -def test_query_on_column_names_with_single_quote_character(): - df = DataFrame( - [ - {"it's": 1, "that's": 2}, - {"it's": 3, "that's": 4}, - {"it's": -1, "that's": -2}, - {"it's": -3, "that's": -4}, - ] - ) - result = df.query("`it's` < `that's`") - expected = df[df["it's"] < df["that's"]] - tm.assert_frame_equal(result, expected) - - -def test_query_on_column_names_with_double_quote_character(): - df = DataFrame( - [ - {'it"s': 1, 'that"s': 2}, - {'it"s': 3, 'that"s': 4}, - {'it"s': -1, 'that"s': -2}, - {'it"s': -3, 'that"s': -4}, - ] - ) - result = df.query('`it"s` < `that"s`') - expected = df[df['it"s'] < df['that"s']] - tm.assert_frame_equal(result, expected) - - -def test_query_on_column_names_with_single_quote_and_double_quote_character(): +@pytest.mark.parametrize( + "col1,col2,expr", + [ + ("it's", "that's", "`it's` < `that's`"), + ('it"s', 'that"s', '`it"s` < `that"s`'), + ("it's", 'that\'s "nice"', "`it's` < `that's \"nice\"`"), + ("it's", "that's #cool", "`it's` < `that's #cool` # This is a comment"), + ], +) +def test_query_on_column_names_with_special_characters(col1, col2, expr): + # GH 59285 df = DataFrame( [ - {"it's": 1, 'that\'s "nice"': 2}, - {"it's": 3, 'that\'s "nice"': 4}, - {"it's": -1, 'that\'s "nice"': -2}, - {"it's": -3, 'that\'s "nice"': -4}, + {col1: 1, col2: 2}, + {col1: 3, col2: 4}, + {col1: -1, col2: -2}, + {col1: -3, col2: -4}, ] ) - result = df.query("`it's` < `that's \"nice\"`") - expected = df[df["it's"] < df['that\'s "nice"']] + result = df.query(expr) + expected = df[df[col1] < df[col2]] tm.assert_frame_equal(result, expected) From c756fc34ab2d3b2aacc613df7cba76ba7c73be5e Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 21 Jul 2024 17:48:21 +0200 Subject: [PATCH 03/32] reinstate text in docs, shorten some lines --- pandas/core/frame.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 364bcd034b42c..20f1fc9f750d4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4560,8 +4560,9 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No For other characters that fall outside the ASCII range (U+0001..U+007F) and those that are not further specified in PEP 3131, the query parser will raise an error. - This excludes whitespace different than the space character - and the backtick itself (backtick cannot be escaped). + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). See also the `Python documentation about lexical analysis `__ @@ -4620,16 +4621,15 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No if any(("#" in col) or ("'" in col) or ('"' in col) for col in self.columns): # Create a copy of `self` with column names escaped escaped_self = self.copy() - escaped_self.columns = [ - urllib.parse.quote(col) for col in escaped_self.columns - ] + escaped_self.columns = map(urllib.parse.quote, escaped_self.columns) # In expr, escape column names between backticks - column_name_to_escaped_name = { + column_name_to_escaped = { col: urllib.parse.quote(col) for col in self.columns } + # Odd-number indexes are column names escaped_expr = "`".join( - (column_name_to_escaped_name.get(token, token) if (i % 2) else token) + (column_name_to_escaped.get(token, token) if (i % 2) else token) for i, token in enumerate(expr.split("`")) ) @@ -4641,7 +4641,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No if isinstance(res, Series) and res.name: res.name = urllib.parse.unquote(res.name) elif isinstance(res, DataFrame): - res.columns = [urllib.parse.unquote(col) for col in res.columns] + res.columns = map(urllib.parse.unquote, res.columns) else: res = self.eval(expr, **kwargs) From d4707b65eb91dd81aea200c28efe567d31a9b504 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 21 Jul 2024 18:02:18 +0200 Subject: [PATCH 04/32] update whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3de65fe6f682c..19a0439cb20ce 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -647,6 +647,7 @@ Other - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`) +- Bug in :meth:`DataFrame.query` which raised a ``KeyError`` when the expression contained column names with characters like ``#``. (:issue:`59285`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 20f1fc9f750d4..89ba9830c86a6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4627,7 +4627,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No column_name_to_escaped = { col: urllib.parse.quote(col) for col in self.columns } - # Odd-number indexes are column names + # A `token` with an odd-number index is a column name escaped_expr = "`".join( (column_name_to_escaped.get(token, token) if (i % 2) else token) for i, token in enumerate(expr.split("`")) From ba50d912924c68fa0f25091ba38fed565f465a46 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 21 Jul 2024 18:03:58 +0200 Subject: [PATCH 05/32] minor: double space to single space --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 19a0439cb20ce..b5f7ebc26c2bb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -647,7 +647,7 @@ Other - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`) -- Bug in :meth:`DataFrame.query` which raised a ``KeyError`` when the expression contained column names with characters like ``#``. (:issue:`59285`) +- Bug in :meth:`DataFrame.query` which raised a ``KeyError`` when the expression contained column names with characters like ``#``. (:issue:`59285`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) From aaffbba09ebf137a4f280ac9b101217a8ed6c1ec Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sat, 27 Jul 2024 20:25:54 +0200 Subject: [PATCH 06/32] move to parsing.py, split better, add tests --- pandas/core/computation/parsing.py | 80 ++++++++++++++++++++++++++- pandas/core/frame.py | 29 +--------- pandas/tests/computation/test_eval.py | 16 ++++++ 3 files changed, 95 insertions(+), 30 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 8fbf8936d31ef..d5e2c5a9e8864 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -4,7 +4,10 @@ from __future__ import annotations -from io import StringIO +from io import ( + BytesIO, + StringIO, +) from keyword import iskeyword import token import tokenize @@ -58,7 +61,7 @@ def create_valid_python_identifier(name: str) -> str: "'": "_SINGLEQUOTE_", '"': "_DOUBLEQUOTE_", # Currently not possible. Terminates parser and won't find backtick. - # "#": "_HASH_", + "#": "_HASH_", } ) @@ -168,6 +171,69 @@ def tokenize_backtick_quoted_string( return BACKTICK_QUOTED_STRING, source[string_start:string_end] +def split_by_backtick(s: str) -> list[tuple[bool, str]]: + substrings = [] + substring = "" + i = 0 + while i < len(s): + backtick_index = s.find("`", i) + + # No backticks + if backtick_index == -1: + substrings.append((False, substring + s[i:])) + break + + single_quote_index = s.find("'", i) + double_quote_index = s.find('"', i) + if (single_quote_index == -1) and (double_quote_index == -1): + quote_index = -1 + elif single_quote_index == -1: + quote_index = double_quote_index + elif double_quote_index == -1: + quote_index = single_quote_index + else: + quote_index = min(single_quote_index, double_quote_index) + + # No quotes + if quote_index == -1: + next_backtick_index = s.find("`", backtick_index + 1) + # Backtick opened before quote + elif backtick_index < quote_index: + next_backtick_index = s.find("`", backtick_index + 1) + # Quote opened before backtick + else: + next_quote_index = -1 + line_reader = BytesIO(s[i:].encode("utf-8")).readline + token_generator = tokenize.tokenize(line_reader) + for toknum, _, (_, _), (_, end), _ in token_generator: + if toknum == tokenize.STRING: + next_quote_index = i + end - 1 + break + + # Quote is unmatched + if next_quote_index == -1: + next_backtick_index = s.find("`", backtick_index + 1) + # Quote is matched + else: + substring += s[i:next_quote_index] + i = next_quote_index + continue + + # Backtick is unmatched + if next_backtick_index == -1: + substrings.append((False, substring + s[i:])) + break + # Backtick is matched + else: + if i != backtick_index: + substrings.append((False, substring + s[i:backtick_index])) + substrings.append((True, s[backtick_index : next_backtick_index + 1])) + substring = "" + i = next_backtick_index + 1 + + return substrings + + def tokenize_string(source: str) -> Iterator[tuple[int, str]]: """ Tokenize a Python source code string. @@ -182,6 +248,16 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]: tok_generator : Iterator[Tuple[int, str]] An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). """ + # GH 59285 + source = "".join( + ( + f"`{create_valid_python_identifier(substring[1:-1])}`" + if is_backticked + else substring + ) + for is_backticked, substring in split_by_backtick(source) + ) + line_reader = StringIO(source).readline token_generator = tokenize.generate_tokens(line_reader) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 89ba9830c86a6..f2d87ee36490a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -34,7 +34,6 @@ cast, overload, ) -import urllib.parse import warnings import numpy as np @@ -4617,33 +4616,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No kwargs["level"] = kwargs.pop("level", 0) + 1 kwargs["target"] = None - # GH 59285 - if any(("#" in col) or ("'" in col) or ('"' in col) for col in self.columns): - # Create a copy of `self` with column names escaped - escaped_self = self.copy() - escaped_self.columns = map(urllib.parse.quote, escaped_self.columns) - - # In expr, escape column names between backticks - column_name_to_escaped = { - col: urllib.parse.quote(col) for col in self.columns - } - # A `token` with an odd-number index is a column name - escaped_expr = "`".join( - (column_name_to_escaped.get(token, token) if (i % 2) else token) - for i, token in enumerate(expr.split("`")) - ) - - # eval - escaped_res = escaped_self.eval(escaped_expr, **kwargs) - - # If `res` is a Series or DataFrame, unescape names - res = escaped_res.copy() - if isinstance(res, Series) and res.name: - res.name = urllib.parse.unquote(res.name) - elif isinstance(res, DataFrame): - res.columns = map(urllib.parse.unquote, res.columns) - else: - res = self.eval(expr, **kwargs) + res = self.eval(expr, **kwargs) try: result = self.loc[res] diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 3e7fc62c4b48c..02833ba60ed8d 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1994,6 +1994,22 @@ def test_query_on_expr_with_comment(): tm.assert_frame_equal(result, expected) +def test_query_on_expr_with_backticks(): + # GH 59285 + df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) + result = df.query("'`' < `#backticks`") + expected = df["`" < df["#backticks"]] + tm.assert_frame_equal(result, expected) + + +def test_query_on_expr_with_backticked_string_same_as_column_name(): + # GH 59285 + df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) + result = df.query("'`#backticks`' < `#backticks`") + expected = df["`#backticks`" < df["#backticks"]] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "col1,col2,expr", [ From 0c75550b60d8744c2221adc1d28efa366f4c7dc5 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sat, 27 Jul 2024 20:48:16 +0200 Subject: [PATCH 07/32] clean up --- pandas/core/computation/parsing.py | 38 +++++++++++++++++++----------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index d5e2c5a9e8864..0cb681a88765a 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -171,7 +171,24 @@ def tokenize_backtick_quoted_string( return BACKTICK_QUOTED_STRING, source[string_start:string_end] -def split_by_backtick(s: str) -> list[tuple[bool, str]]: +def _split_by_backtick(s: str) -> list[tuple[bool, str]]: + """ + Splits a str into substrings along backtick characters (`). + + Disregards backticks inside quotes. + + Parameters + ---------- + s : str + The Python source code string. + + Returns + ------- + substrings: list[tuple[bool, str]] + List of tuples, where each tuple has two elements: + The first is a boolean indicating if the substring is backtick-quoted. + The second is the actual substring. + """ substrings = [] substring = "" i = 0 @@ -249,13 +266,14 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]: An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). """ # GH 59285 + # Escape characters, including backticks source = "".join( ( - f"`{create_valid_python_identifier(substring[1:-1])}`" - if is_backticked + create_valid_python_identifier(substring[1:-1]) + if is_backtick_quoted else substring ) - for is_backticked, substring in split_by_backtick(source) + for is_backtick_quoted, substring in _split_by_backtick(source) ) line_reader = StringIO(source).readline @@ -263,13 +281,5 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]: # Loop over all tokens till a backtick (`) is found. # Then, take all tokens till the next backtick to form a backtick quoted string - for toknum, tokval, start, _, _ in token_generator: - if tokval == "`": - try: - yield tokenize_backtick_quoted_string( - token_generator, source, string_start=start[1] + 1 - ) - except Exception as err: - raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err - else: - yield toknum, tokval + for toknum, tokval, _, _, _ in token_generator: + yield toknum, tokval From 90c5dbcfc44a5125843db9f1b7f1f295c2fc3379 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sat, 27 Jul 2024 20:50:36 +0200 Subject: [PATCH 08/32] remove old comment --- pandas/core/computation/parsing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 0cb681a88765a..de0a4f8774c15 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -279,7 +279,5 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]: line_reader = StringIO(source).readline token_generator = tokenize.generate_tokens(line_reader) - # Loop over all tokens till a backtick (`) is found. - # Then, take all tokens till the next backtick to form a backtick quoted string for toknum, tokval, _, _, _ in token_generator: yield toknum, tokval From c0ee651d444d9a5542121ce4a8343b342fb1a018 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sat, 27 Jul 2024 21:04:11 +0200 Subject: [PATCH 09/32] test names --- pandas/tests/computation/test_eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 02833ba60ed8d..2ff2f8c7162b6 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1994,7 +1994,7 @@ def test_query_on_expr_with_comment(): tm.assert_frame_equal(result, expected) -def test_query_on_expr_with_backticks(): +def test_query_on_expr_with_string_with_backticks(): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) result = df.query("'`' < `#backticks`") @@ -2002,7 +2002,7 @@ def test_query_on_expr_with_backticks(): tm.assert_frame_equal(result, expected) -def test_query_on_expr_with_backticked_string_same_as_column_name(): +def test_query_on_expr_with_string_with_backticked_substring_same_as_column_name(): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) result = df.query("'`#backticks`' < `#backticks`") From b7dc1a8173ba80209532950db7847e46a49abcce Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sat, 27 Jul 2024 21:13:53 +0200 Subject: [PATCH 10/32] minor test change --- pandas/tests/computation/test_eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 2ff2f8c7162b6..792fa97d97640 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1997,8 +1997,8 @@ def test_query_on_expr_with_comment(): def test_query_on_expr_with_string_with_backticks(): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) - result = df.query("'`' < `#backticks`") - expected = df["`" < df["#backticks"]] + result = df.query("'```' < `#backticks`") + expected = df["```" < df["#backticks"]] tm.assert_frame_equal(result, expected) From 164e3c50c9d655af0c5155cc34c8e407e067b8cc Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sat, 27 Jul 2024 23:54:46 +0200 Subject: [PATCH 11/32] improve splitting --- pandas/core/computation/parsing.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index de0a4f8774c15..f7ab8168614f4 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -227,16 +227,17 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: next_quote_index = i + end - 1 break - # Quote is unmatched + # Quote is unmatched (Possibly a mistake) if next_quote_index == -1: - next_backtick_index = s.find("`", backtick_index + 1) + substrings.append((False, substring + s[i:])) + break # Quote is matched else: - substring += s[i:next_quote_index] - i = next_quote_index + substring += s[i : next_quote_index + 1] + i = next_quote_index + 1 continue - # Backtick is unmatched + # Backtick is unmatched (Possibly a mistake) if next_backtick_index == -1: substrings.append((False, substring + s[i:])) break From 4040370a21a055490293387e0d41401861d34e0e Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 28 Jul 2024 00:18:14 +0200 Subject: [PATCH 12/32] fix splitting --- pandas/core/computation/parsing.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index f7ab8168614f4..9646068c6bca1 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -231,7 +231,11 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: if next_quote_index == -1: substrings.append((False, substring + s[i:])) break - # Quote is matched + # Quote is matched, and the next quote is at the end of the string + elif next_quote_index + 1 == len(s): + substrings.append((False, substring + s[i:])) + break + # Quote is matched, and the next quote is in the middle of the string else: substring += s[i : next_quote_index + 1] i = next_quote_index + 1 From 148d1ed2aeebbaa0090e4beb72037532682c845f Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 28 Jul 2024 00:39:59 +0200 Subject: [PATCH 13/32] improve splitting --- pandas/core/computation/parsing.py | 31 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 9646068c6bca1..a8df3a738a389 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -211,12 +211,23 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: else: quote_index = min(single_quote_index, double_quote_index) - # No quotes - if quote_index == -1: - next_backtick_index = s.find("`", backtick_index + 1) + # No quotes, or # Backtick opened before quote - elif backtick_index < quote_index: + if (quote_index == -1) or (backtick_index < quote_index): next_backtick_index = s.find("`", backtick_index + 1) + + # Backtick is unmatched (Possibly a mistake) + if next_backtick_index == -1: + substrings.append((False, substring + s[i:])) + break + # Backtick is matched + else: + if i != backtick_index: + substrings.append((False, substring + s[i:backtick_index])) + substrings.append((True, s[backtick_index : next_backtick_index + 1])) + substring = "" + i = next_backtick_index + 1 + # Quote opened before backtick else: next_quote_index = -1 @@ -241,18 +252,6 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: i = next_quote_index + 1 continue - # Backtick is unmatched (Possibly a mistake) - if next_backtick_index == -1: - substrings.append((False, substring + s[i:])) - break - # Backtick is matched - else: - if i != backtick_index: - substrings.append((False, substring + s[i:backtick_index])) - substrings.append((True, s[backtick_index : next_backtick_index + 1])) - substring = "" - i = next_backtick_index + 1 - return substrings From 990d0d392f23e7c6b6986c0c8bb047de2c2e467a Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 28 Jul 2024 01:44:24 +0200 Subject: [PATCH 14/32] add tests --- pandas/core/computation/parsing.py | 12 ++---- pandas/tests/computation/test_eval.py | 61 +++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index a8df3a738a389..12991abc7f399 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -216,13 +216,13 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: if (quote_index == -1) or (backtick_index < quote_index): next_backtick_index = s.find("`", backtick_index + 1) - # Backtick is unmatched (Possibly a mistake) + # Backtick is unmatched (Bad syntax) if next_backtick_index == -1: substrings.append((False, substring + s[i:])) break # Backtick is matched else: - if i != backtick_index: + if substring or (i != backtick_index): substrings.append((False, substring + s[i:backtick_index])) substrings.append((True, s[backtick_index : next_backtick_index + 1])) substring = "" @@ -238,19 +238,15 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: next_quote_index = i + end - 1 break - # Quote is unmatched (Possibly a mistake) - if next_quote_index == -1: - substrings.append((False, substring + s[i:])) - break + # Quote is unmatched (Bad syntax), or # Quote is matched, and the next quote is at the end of the string - elif next_quote_index + 1 == len(s): + if (next_quote_index == -1) or (next_quote_index + 1 == len(s)): substrings.append((False, substring + s[i:])) break # Quote is matched, and the next quote is in the middle of the string else: substring += s[i : next_quote_index + 1] i = next_quote_index + 1 - continue return substrings diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 792fa97d97640..d2547a02fba03 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -2034,6 +2034,67 @@ def test_query_on_column_names_with_special_characters(col1, col2, expr): tm.assert_frame_equal(result, expected) +def test_query_on_expr_with_no_backticks(): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"]) + result = df.query("'value' < column_name") + expected = df["value" < df["column_name"]] + tm.assert_frame_equal(result, expected) + + +def test_query_on_expr_with_no_quotes_and_backtick_is_unmatched(): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["column-name"]) + with pytest.raises(SyntaxError, match="invalid syntax"): + df.query("5 < `column-name") + + +def test_query_on_expr_with_no_quotes_and_backtick_is_matched(): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["column-name"]) + result = df.query("5 < `column-name`") + expected = df[5 < df["column-name"]] + tm.assert_frame_equal(result, expected) + + +def test_query_on_expr_with_backtick_opened_before_quote_and_backtick_is_unmatched(): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["It's"]) + with pytest.raises(SyntaxError, match="unterminated string literal"): + df.query("5 < `It's") + + +def test_query_on_expr_with_backtick_opened_before_quote_and_backtick_is_matched(): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["It's"]) + result = df.query("5 < `It's`") + expected = df[5 < df["It's"]] + tm.assert_frame_equal(result, expected) + + +def test_query_on_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=['It`s that\\\'s "quote" #hash']) + with pytest.raises(SyntaxError, match="unterminated string literal"): + df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") + + +def test_query_on_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + result = df.query("`column-name` < 'It`s that\\'s \"quote\" #hash'") + expected = df[df["column-name"] < 'It`s that\'s "quote" #hash'] + tm.assert_frame_equal(result, expected) + + +def test_query_on_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + result = df.query("'It`s that\\'s \"quote\" #hash' < `column-name`") + expected = df['It`s that\'s "quote" #hash' < df["column-name"]] + tm.assert_frame_equal(result, expected) + + def test_set_inplace(): # https://github.com/pandas-dev/pandas/issues/47449 # Ensure we don't only update the DataFrame inplace, but also the actual From e674eb83236d6e832ebd76497e60d1aaa4ad3fce Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 28 Jul 2024 02:29:53 +0200 Subject: [PATCH 15/32] edit docstring and comments --- pandas/core/computation/parsing.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 12991abc7f399..5030d1728d799 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -35,9 +35,8 @@ def create_valid_python_identifier(name: str) -> str: ------ SyntaxError If the returned name is not a Python valid identifier, raise an exception. - This can happen if there is a hashtag in the name, as the tokenizer will - than terminate and not find the backtick. - But also for characters that fall out of the range of (U+0001..U+007F). + This can happen if the name includes characters that fall out of the range of + (U+0001..U+007F). """ if name.isidentifier() and not iskeyword(name): return name @@ -60,7 +59,6 @@ def create_valid_python_identifier(name: str) -> str: # Including quotes works, but there are exceptions. "'": "_SINGLEQUOTE_", '"': "_DOUBLEQUOTE_", - # Currently not possible. Terminates parser and won't find backtick. "#": "_HASH_", } ) @@ -239,11 +237,11 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: break # Quote is unmatched (Bad syntax), or - # Quote is matched, and the next quote is at the end of the string + # Quote is matched, and the next quote is at the end of s if (next_quote_index == -1) or (next_quote_index + 1 == len(s)): substrings.append((False, substring + s[i:])) break - # Quote is matched, and the next quote is in the middle of the string + # Quote is matched, and the next quote is in the middle of s else: substring += s[i : next_quote_index + 1] i = next_quote_index + 1 From 6a0ac728ce5ef7aa57de2a52e1a29a8420453749 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 28 Jul 2024 04:08:32 +0200 Subject: [PATCH 16/32] minor test change --- pandas/tests/computation/test_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index d2547a02fba03..b969c7f270810 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -2074,7 +2074,7 @@ def test_query_on_expr_with_backtick_opened_before_quote_and_backtick_is_matched def test_query_on_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(): # GH 59285 - df = DataFrame(("aaa", "vvv", "zzz"), columns=['It`s that\\\'s "quote" #hash']) + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) with pytest.raises(SyntaxError, match="unterminated string literal"): df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") From f2126b3e7f141b9fd4748544717422abda347042 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sun, 28 Jul 2024 21:53:47 +0200 Subject: [PATCH 17/32] escape backticks --- pandas/core/computation/parsing.py | 2 +- pandas/tests/computation/test_eval.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 5030d1728d799..67758fc99ae5e 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -56,7 +56,6 @@ def create_valid_python_identifier(name: str) -> str: "$": "_DOLLARSIGN_", "€": "_EUROSIGN_", "°": "_DEGREESIGN_", - # Including quotes works, but there are exceptions. "'": "_SINGLEQUOTE_", '"': "_DOUBLEQUOTE_", "#": "_HASH_", @@ -128,6 +127,7 @@ def clean_column_name(name: Hashable) -> Hashable: which is not caught and propagates to the user level. """ try: + name = name.replace("`", "``") # Escape backticks tokenized = tokenize_string(f"`{name}`") tokval = next(tokenized)[1] return create_valid_python_identifier(tokval) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index b969c7f270810..7ddb62b1b17f0 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1994,6 +1994,14 @@ def test_query_on_expr_with_comment(): tm.assert_frame_equal(result, expected) +def test_query_on_expr_with_column_name_with_backtick_and_hash(): + # GH 59285 + df = DataFrame((1, 2, 3), columns=["a`#b"]) + result = df.query("`a``#b` < 2") + expected = df[df["a`#b"] < 2] + tm.assert_frame_equal(result, expected) + + def test_query_on_expr_with_string_with_backticks(): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) From 168f56c36db6662954487def0a8ffe04ce10f8fd Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Mon, 29 Jul 2024 13:15:43 +0200 Subject: [PATCH 18/32] escape backticks properly --- pandas/core/computation/parsing.py | 8 ++++++++ pandas/tests/computation/test_eval.py | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 67758fc99ae5e..05db7d51b5b20 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -59,6 +59,7 @@ def create_valid_python_identifier(name: str) -> str: "'": "_SINGLEQUOTE_", '"': "_DOUBLEQUOTE_", "#": "_HASH_", + "`": "_BACKTICK_", } ) @@ -213,6 +214,13 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: # Backtick opened before quote if (quote_index == -1) or (backtick_index < quote_index): next_backtick_index = s.find("`", backtick_index + 1) + while ( + (next_backtick_index != -1) + and (next_backtick_index != len(s) - 1) + and (s[next_backtick_index + 1] == "`") + ): + # Since the next character is also a backtick, it's an escaped backtick + next_backtick_index = s.find("`", next_backtick_index + 2) # Backtick is unmatched (Bad syntax) if next_backtick_index == -1: diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 7ddb62b1b17f0..1c6a58d044410 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -2002,6 +2002,15 @@ def test_query_on_expr_with_column_name_with_backtick_and_hash(): tm.assert_frame_equal(result, expected) +def test_query_on_expr_with_column_name_with_backtick(): + # GH 59285 + df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)}) + result = df.query("`a``b` < 2") + # Note: Formatting checks may wrongly consider the above``inline code``. + expected = df[df["a`b"] < 2] + tm.assert_frame_equal(result, expected) + + def test_query_on_expr_with_string_with_backticks(): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) From 810c82b3c2dabf6f0ef6abfb04c28f5c46fa6ce3 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Tue, 30 Jul 2024 13:11:08 +0200 Subject: [PATCH 19/32] comment --- pandas/tests/computation/test_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 1c6a58d044410..1d5f5c3e76bcd 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -2006,7 +2006,7 @@ def test_query_on_expr_with_column_name_with_backtick(): # GH 59285 df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)}) result = df.query("`a``b` < 2") - # Note: Formatting checks may wrongly consider the above``inline code``. + # Note: Formatting checks may wrongly consider the above ``inline code``. expected = df[df["a`b"] < 2] tm.assert_frame_equal(result, expected) From 86947b2188b5cd9a1716713db810065816e8c022 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Tue, 30 Jul 2024 14:25:24 +0200 Subject: [PATCH 20/32] fix tests --- pandas/tests/computation/test_eval.py | 134 ------------------------- pandas/tests/frame/test_query_eval.py | 135 ++++++++++++++++++++++++-- 2 files changed, 127 insertions(+), 142 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 1d5f5c3e76bcd..31d568d7c1e0c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1978,140 +1978,6 @@ def test_eval_no_support_column_name(request, column): tm.assert_frame_equal(result, expected) -def test_query_on_column_name_with_hashtag_character(): - # GH 59285 - df = DataFrame((1, 2, 3), columns=["a#"]) - result = df.query("`a#` < 2") - expected = df[df["a#"] < 2] - tm.assert_frame_equal(result, expected) - - -def test_query_on_expr_with_comment(): - # GH 59285 - df = DataFrame((1, 2, 3), columns=["a#"]) - result = df.query("`a#` < 2 # This is a comment") - expected = df[df["a#"] < 2] - tm.assert_frame_equal(result, expected) - - -def test_query_on_expr_with_column_name_with_backtick_and_hash(): - # GH 59285 - df = DataFrame((1, 2, 3), columns=["a`#b"]) - result = df.query("`a``#b` < 2") - expected = df[df["a`#b"] < 2] - tm.assert_frame_equal(result, expected) - - -def test_query_on_expr_with_column_name_with_backtick(): - # GH 59285 - df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)}) - result = df.query("`a``b` < 2") - # Note: Formatting checks may wrongly consider the above ``inline code``. - expected = df[df["a`b"] < 2] - tm.assert_frame_equal(result, expected) - - -def test_query_on_expr_with_string_with_backticks(): - # GH 59285 - df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) - result = df.query("'```' < `#backticks`") - expected = df["```" < df["#backticks"]] - tm.assert_frame_equal(result, expected) - - -def test_query_on_expr_with_string_with_backticked_substring_same_as_column_name(): - # GH 59285 - df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) - result = df.query("'`#backticks`' < `#backticks`") - expected = df["`#backticks`" < df["#backticks"]] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "col1,col2,expr", - [ - ("it's", "that's", "`it's` < `that's`"), - ('it"s', 'that"s', '`it"s` < `that"s`'), - ("it's", 'that\'s "nice"', "`it's` < `that's \"nice\"`"), - ("it's", "that's #cool", "`it's` < `that's #cool` # This is a comment"), - ], -) -def test_query_on_column_names_with_special_characters(col1, col2, expr): - # GH 59285 - df = DataFrame( - [ - {col1: 1, col2: 2}, - {col1: 3, col2: 4}, - {col1: -1, col2: -2}, - {col1: -3, col2: -4}, - ] - ) - result = df.query(expr) - expected = df[df[col1] < df[col2]] - tm.assert_frame_equal(result, expected) - - -def test_query_on_expr_with_no_backticks(): - # GH 59285 - df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"]) - result = df.query("'value' < column_name") - expected = df["value" < df["column_name"]] - tm.assert_frame_equal(result, expected) - - -def test_query_on_expr_with_no_quotes_and_backtick_is_unmatched(): - # GH 59285 - df = DataFrame((1, 5, 10), columns=["column-name"]) - with pytest.raises(SyntaxError, match="invalid syntax"): - df.query("5 < `column-name") - - -def test_query_on_expr_with_no_quotes_and_backtick_is_matched(): - # GH 59285 - df = DataFrame((1, 5, 10), columns=["column-name"]) - result = df.query("5 < `column-name`") - expected = df[5 < df["column-name"]] - tm.assert_frame_equal(result, expected) - - -def test_query_on_expr_with_backtick_opened_before_quote_and_backtick_is_unmatched(): - # GH 59285 - df = DataFrame((1, 5, 10), columns=["It's"]) - with pytest.raises(SyntaxError, match="unterminated string literal"): - df.query("5 < `It's") - - -def test_query_on_expr_with_backtick_opened_before_quote_and_backtick_is_matched(): - # GH 59285 - df = DataFrame((1, 5, 10), columns=["It's"]) - result = df.query("5 < `It's`") - expected = df[5 < df["It's"]] - tm.assert_frame_equal(result, expected) - - -def test_query_on_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(): - # GH 59285 - df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) - with pytest.raises(SyntaxError, match="unterminated string literal"): - df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") - - -def test_query_on_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(): - # GH 59285 - df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) - result = df.query("`column-name` < 'It`s that\\'s \"quote\" #hash'") - expected = df[df["column-name"] < 'It`s that\'s "quote" #hash'] - tm.assert_frame_equal(result, expected) - - -def test_query_on_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(): - # GH 59285 - df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) - result = df.query("'It`s that\\'s \"quote\" #hash' < `column-name`") - expected = df['It`s that\'s "quote" #hash' < df["column-name"]] - tm.assert_frame_equal(result, expected) - - def test_set_inplace(): # https://github.com/pandas-dev/pandas/issues/47449 # Ensure we don't only update the DataFrame inplace, but also the actual diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index aa2fb19fe8528..550241201836d 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1341,20 +1341,139 @@ def test_missing_attribute(self, df): with pytest.raises(AttributeError, match=message): df.eval("@pd.thing") - def test_failing_quote(self, df): - msg = r"(Could not convert ).*( to a valid Python identifier.)" - with pytest.raises(SyntaxError, match=msg): - df.query("`it's` > `that's`") + def test_quote(self, df): + res = df.query("`it's` > `that's`") + expect = df[df["it's"] > df["that's"]] + tm.assert_frame_equal(res, expect) def test_failing_character_outside_range(self, df): msg = r"(Could not convert ).*( to a valid Python identifier.)" with pytest.raises(SyntaxError, match=msg): df.query("`☺` > 4") - def test_failing_hashtag(self, df): - msg = "Failed to parse backticks" - with pytest.raises(SyntaxError, match=msg): - df.query("`foo#bar` > 4") + def test_hashtag(self, df): + res = df.query("`foo#bar` > 4") + expect = df[df["foo#bar"] > 4] + tm.assert_frame_equal(res, expect) + + def test_expr_with_column_name_with_hashtag_character(self): + # GH 59285 + df = DataFrame((1, 2, 3), columns=["a#"]) + result = df.query("`a#` < 2") + expected = df[df["a#"] < 2] + tm.assert_frame_equal(result, expected) + + def test_expr_with_comment(self): + # GH 59285 + df = DataFrame((1, 2, 3), columns=["a#"]) + result = df.query("`a#` < 2 # This is a comment") + expected = df[df["a#"] < 2] + tm.assert_frame_equal(result, expected) + + def test_expr_with_column_name_with_backtick_and_hash(self): + # GH 59285 + df = DataFrame((1, 2, 3), columns=["a`#b"]) + result = df.query("`a``#b` < 2") + expected = df[df["a`#b"] < 2] + tm.assert_frame_equal(result, expected) + + def test_expr_with_column_name_with_backtick(self): + # GH 59285 + df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)}) + result = df.query("`a``b` < 2") + # Note: Formatting checks may wrongly consider the above ``inline code``. + expected = df[df["a`b"] < 2] + tm.assert_frame_equal(result, expected) + + def test_expr_with_string_with_backticks(self): + # GH 59285 + df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) + result = df.query("'```' < `#backticks`") + expected = df["```" < df["#backticks"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_string_with_backticked_substring_same_as_column_name(self): + # GH 59285 + df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) + result = df.query("'`#backticks`' < `#backticks`") + expected = df["`#backticks`" < df["#backticks"]] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "col1,col2,expr", + [ + ("it's", "that's", "`it's` < `that's`"), + ('it"s', 'that"s', '`it"s` < `that"s`'), + ("it's", 'that\'s "nice"', "`it's` < `that's \"nice\"`"), + ("it's", "that's #cool", "`it's` < `that's #cool` # This is a comment"), + ], + ) + def test_expr_with_column_names_with_special_characters(self, col1, col2, expr): + # GH 59285 + df = DataFrame( + [ + {col1: 1, col2: 2}, + {col1: 3, col2: 4}, + {col1: -1, col2: -2}, + {col1: -3, col2: -4}, + ] + ) + result = df.query(expr) + expected = df[df[col1] < df[col2]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_no_backticks(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"]) + result = df.query("'value' < column_name") + expected = df["value" < df["column_name"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_no_quotes_and_backtick_is_unmatched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["column-name"]) + with pytest.raises(SyntaxError, match="invalid syntax"): + df.query("5 < `column-name") + + def test_expr_with_no_quotes_and_backtick_is_matched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["column-name"]) + result = df.query("5 < `column-name`") + expected = df[5 < df["column-name"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_backtick_opened_before_quote_and_backtick_is_unmatched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["It's"]) + with pytest.raises(SyntaxError, match="unterminated string literal"): + df.query("5 < `It's") + + def test_expr_with_backtick_opened_before_quote_and_backtick_is_matched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["It's"]) + result = df.query("5 < `It's`") + expected = df[5 < df["It's"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + with pytest.raises(SyntaxError, match="unterminated string literal"): + df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") + + def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + result = df.query("`column-name` < 'It`s that\\'s \"quote\" #hash'") + expected = df[df["column-name"] < 'It`s that\'s "quote" #hash'] + tm.assert_frame_equal(result, expected) + + def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + result = df.query("'It`s that\\'s \"quote\" #hash' < `column-name`") + expected = df['It`s that\'s "quote" #hash' < df["column-name"]] + tm.assert_frame_equal(result, expected) def test_call_non_named_expression(self, df): """ From e99db1ce03953fd4078dd3b368b30c1a9a25f1ca Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Tue, 6 Aug 2024 18:10:29 +0200 Subject: [PATCH 21/32] GH 49633: special characters --- pandas/core/computation/parsing.py | 10 ++++++++++ pandas/core/frame.py | 8 ++------ pandas/tests/frame/test_query_eval.py | 22 ++++++++++++++++++---- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 05db7d51b5b20..d793111f93aeb 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -41,6 +41,16 @@ def create_valid_python_identifier(name: str) -> str: if name.isidentifier() and not iskeyword(name): return name + # Escape characters that fall outside the ASCII range (U+0001..U+007F). + # GH 49633 + c_escaped_gen = ( + "".join(chr(b) for b in c.encode("ascii", "backslashreplace")) for c in name + ) + name = "".join( + c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_") + for c, c_escaped in zip(name, c_escaped_gen) + ) + # Create a dict with the special characters and their replacement string. # EXACT_TOKEN_TYPES contains these special characters # token.tok_name contains a readable description of the replacement string. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f2d87ee36490a..ef7ffcb652149 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4556,12 +4556,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No quoted string are replaced by strings that are allowed as a Python identifier. These characters include all operators in Python, the space character, the question mark, the exclamation mark, the dollar sign, and the euro sign. - For other characters that fall outside the ASCII range (U+0001..U+007F) - and those that are not further specified in PEP 3131, - the query parser will raise an error. - This excludes whitespace different than the space character, - but also the hashtag (as it is used for comments) and the backtick - itself (backtick can also not be escaped). + + A backtick can be escaped by double backticks. See also the `Python documentation about lexical analysis `__ diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 550241201836d..b8769cc7d4be8 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1246,6 +1246,8 @@ def df(self): "it's": [6, 3, 1], "that's": [9, 1, 8], "☺": [8, 7, 6], + "xy (z)": [1, 2, 3], + "xy (z\\uff09": [4, 5, 6], "foo#bar": [2, 4, 5], 1: [5, 7, 9], } @@ -1346,10 +1348,22 @@ def test_quote(self, df): expect = df[df["it's"] > df["that's"]] tm.assert_frame_equal(res, expect) - def test_failing_character_outside_range(self, df): - msg = r"(Could not convert ).*( to a valid Python identifier.)" - with pytest.raises(SyntaxError, match=msg): - df.query("`☺` > 4") + def test_character_outside_range_smiley(self, df): + res = df.query("`☺` > 4") + expect = df[df["☺"] > 4] + tm.assert_frame_equal(res, expect) + + def test_character_outside_range_2_byte_parens(self, df): + # GH 49633 + res = df.query("`xy (z)` == 2") + expect = df[df["xy (z)"] == 2] + tm.assert_frame_equal(res, expect) + + def test_character_outside_range_and_actual_backslash(self, df): + # GH 49633 + res = df.query("`xy (z\\uff09` == 2") + expect = df[df["xy \uff08z\\uff09"] == 2] + tm.assert_frame_equal(res, expect) def test_hashtag(self, df): res = df.query("`foo#bar` > 4") From a005f135d16c9c4666886db37ff101139d4db85c Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Tue, 6 Aug 2024 18:14:34 +0200 Subject: [PATCH 22/32] add noqa --- pandas/tests/frame/test_query_eval.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index b8769cc7d4be8..b1b75ea0c16cb 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1246,8 +1246,8 @@ def df(self): "it's": [6, 3, 1], "that's": [9, 1, 8], "☺": [8, 7, 6], - "xy (z)": [1, 2, 3], - "xy (z\\uff09": [4, 5, 6], + "xy (z)": [1, 2, 3], # noqa: RUF001 + "xy (z\\uff09": [4, 5, 6], # noqa: RUF001 "foo#bar": [2, 4, 5], 1: [5, 7, 9], } @@ -1355,13 +1355,13 @@ def test_character_outside_range_smiley(self, df): def test_character_outside_range_2_byte_parens(self, df): # GH 49633 - res = df.query("`xy (z)` == 2") - expect = df[df["xy (z)"] == 2] + res = df.query("`xy (z)` == 2") # noqa: RUF001 + expect = df[df["xy (z)"] == 2] # noqa: RUF001 tm.assert_frame_equal(res, expect) def test_character_outside_range_and_actual_backslash(self, df): # GH 49633 - res = df.query("`xy (z\\uff09` == 2") + res = df.query("`xy (z\\uff09` == 2") # noqa: RUF001 expect = df[df["xy \uff08z\\uff09"] == 2] tm.assert_frame_equal(res, expect) @@ -1394,7 +1394,7 @@ def test_expr_with_column_name_with_backtick_and_hash(self): def test_expr_with_column_name_with_backtick(self): # GH 59285 df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)}) - result = df.query("`a``b` < 2") + result = df.query("`a``b` < 2") # noqa # Note: Formatting checks may wrongly consider the above ``inline code``. expected = df[df["a`b"] < 2] tm.assert_frame_equal(result, expected) From a77a215a0f700d5a0c7fd30eb6e6ba9c94371f66 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Tue, 6 Aug 2024 19:40:21 +0200 Subject: [PATCH 23/32] update docstring, --- pandas/core/computation/parsing.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index d793111f93aeb..5fcaec1bb8a93 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -35,20 +35,19 @@ def create_valid_python_identifier(name: str) -> str: ------ SyntaxError If the returned name is not a Python valid identifier, raise an exception. - This can happen if the name includes characters that fall out of the range of - (U+0001..U+007F). """ if name.isidentifier() and not iskeyword(name): return name # Escape characters that fall outside the ASCII range (U+0001..U+007F). # GH 49633 - c_escaped_gen = ( - "".join(chr(b) for b in c.encode("ascii", "backslashreplace")) for c in name + gen = ( + (c, "".join(chr(b) for b in c.encode("ascii", "backslashreplace"))) + for c in name ) name = "".join( c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_") - for c, c_escaped in zip(name, c_escaped_gen) + for c, c_escaped in gen ) # Create a dict with the special characters and their replacement string. From daf2c37d9de49ead30de14351aebf9d8eedde2e9 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Tue, 6 Aug 2024 22:47:49 +0200 Subject: [PATCH 24/32] unmatched backtick or quote can raise SyntaxError OR TokenError --- pandas/tests/frame/test_query_eval.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index b1b75ea0c16cb..0a83c8113e814 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1,4 +1,5 @@ import operator +from tokenize import TokenError import numpy as np import pytest @@ -1446,7 +1447,7 @@ def test_expr_with_no_backticks(self): def test_expr_with_no_quotes_and_backtick_is_unmatched(self): # GH 59285 df = DataFrame((1, 5, 10), columns=["column-name"]) - with pytest.raises(SyntaxError, match="invalid syntax"): + with pytest.raises((SyntaxError, TokenError), match="invalid syntax"): df.query("5 < `column-name") def test_expr_with_no_quotes_and_backtick_is_matched(self): @@ -1459,7 +1460,9 @@ def test_expr_with_no_quotes_and_backtick_is_matched(self): def test_expr_with_backtick_opened_before_quote_and_backtick_is_unmatched(self): # GH 59285 df = DataFrame((1, 5, 10), columns=["It's"]) - with pytest.raises(SyntaxError, match="unterminated string literal"): + with pytest.raises( + (SyntaxError, TokenError), match="unterminated string literal" + ): df.query("5 < `It's") def test_expr_with_backtick_opened_before_quote_and_backtick_is_matched(self): @@ -1472,7 +1475,9 @@ def test_expr_with_backtick_opened_before_quote_and_backtick_is_matched(self): def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) - with pytest.raises(SyntaxError, match="unterminated string literal"): + with pytest.raises( + (SyntaxError, TokenError), match="unterminated string literal" + ): df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self): From 984431bfaa2864cabab52aa501fe84e7b842167d Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Wed, 7 Aug 2024 06:37:23 +0200 Subject: [PATCH 25/32] change splitting --- pandas/core/computation/parsing.py | 136 +++++++++++++++-------------- 1 file changed, 70 insertions(+), 66 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 5fcaec1bb8a93..7b2c9816c607a 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -4,10 +4,8 @@ from __future__ import annotations -from io import ( - BytesIO, - StringIO, -) +from enum import Enum +from io import StringIO from keyword import iskeyword import token import tokenize @@ -179,6 +177,13 @@ def tokenize_backtick_quoted_string( return BACKTICK_QUOTED_STRING, source[string_start:string_end] +class ParseState(Enum): + DEFAULT = 0 + IN_BACKTICK = 1 + IN_SINGLE_QUOTE = 2 + IN_DOUBLE_QUOTE = 3 + + def _split_by_backtick(s: str) -> list[tuple[bool, str]]: """ Splits a str into substrings along backtick characters (`). @@ -198,70 +203,69 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: The second is the actual substring. """ substrings = [] - substring = "" + substr = "" i = 0 + parse_state = ParseState.DEFAULT while i < len(s): - backtick_index = s.find("`", i) - - # No backticks - if backtick_index == -1: - substrings.append((False, substring + s[i:])) - break - - single_quote_index = s.find("'", i) - double_quote_index = s.find('"', i) - if (single_quote_index == -1) and (double_quote_index == -1): - quote_index = -1 - elif single_quote_index == -1: - quote_index = double_quote_index - elif double_quote_index == -1: - quote_index = single_quote_index - else: - quote_index = min(single_quote_index, double_quote_index) - - # No quotes, or - # Backtick opened before quote - if (quote_index == -1) or (backtick_index < quote_index): - next_backtick_index = s.find("`", backtick_index + 1) - while ( - (next_backtick_index != -1) - and (next_backtick_index != len(s) - 1) - and (s[next_backtick_index + 1] == "`") - ): - # Since the next character is also a backtick, it's an escaped backtick - next_backtick_index = s.find("`", next_backtick_index + 2) - - # Backtick is unmatched (Bad syntax) - if next_backtick_index == -1: - substrings.append((False, substring + s[i:])) - break - # Backtick is matched - else: - if substring or (i != backtick_index): - substrings.append((False, substring + s[i:backtick_index])) - substrings.append((True, s[backtick_index : next_backtick_index + 1])) - substring = "" - i = next_backtick_index + 1 - - # Quote opened before backtick - else: - next_quote_index = -1 - line_reader = BytesIO(s[i:].encode("utf-8")).readline - token_generator = tokenize.tokenize(line_reader) - for toknum, _, (_, _), (_, end), _ in token_generator: - if toknum == tokenize.STRING: - next_quote_index = i + end - 1 - break - - # Quote is unmatched (Bad syntax), or - # Quote is matched, and the next quote is at the end of s - if (next_quote_index == -1) or (next_quote_index + 1 == len(s)): - substrings.append((False, substring + s[i:])) - break - # Quote is matched, and the next quote is in the middle of s - else: - substring += s[i : next_quote_index + 1] - i = next_quote_index + 1 + char = s[i] + + match char: + case "`": + # start of a backtick-quoted string + if parse_state == ParseState.DEFAULT: + if substr: + substrings.append((False, substr)) + substr = char + i += 1 + parse_state = ParseState.IN_BACKTICK + continue + elif parse_state == ParseState.IN_BACKTICK: + # escaped backtick inside a backtick-quoted string + next_char = s[i + 1] if (i != len(s) - 1) else None + if next_char == "`": + substr += char + next_char + i += 2 + continue + # end of the backtick-quoted string + else: + substr += char + substrings.append((True, substr)) + + substr = "" + i += 1 + parse_state = ParseState.DEFAULT + continue + case "'": + # start of a single-quoted string + if parse_state == ParseState.DEFAULT: + substr += char + i += 1 + parse_state = ParseState.IN_SINGLE_QUOTE + continue + # end of a single-quoted string + elif (parse_state == ParseState.IN_SINGLE_QUOTE) and (s[i - 1] != "\\"): + substr += char + i += 1 + parse_state = ParseState.DEFAULT + continue + case '"': + # start of a double-quoted string + if parse_state == ParseState.DEFAULT: + substr += char + i += 1 + parse_state = ParseState.IN_DOUBLE_QUOTE + continue + # end of a double-quoted string + elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"): + substr += char + i += 1 + parse_state = ParseState.DEFAULT + continue + substr += char + i += 1 + + if substr: + substrings.append((False, substr)) return substrings From b0833c0a51915b0189751838fd48c7efd8ad9840 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Wed, 7 Aug 2024 06:40:43 +0200 Subject: [PATCH 26/32] remove repeated --- pandas/core/computation/parsing.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 7b2c9816c607a..4353e046db9ec 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -238,29 +238,17 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: case "'": # start of a single-quoted string if parse_state == ParseState.DEFAULT: - substr += char - i += 1 parse_state = ParseState.IN_SINGLE_QUOTE - continue # end of a single-quoted string elif (parse_state == ParseState.IN_SINGLE_QUOTE) and (s[i - 1] != "\\"): - substr += char - i += 1 parse_state = ParseState.DEFAULT - continue case '"': # start of a double-quoted string if parse_state == ParseState.DEFAULT: - substr += char - i += 1 parse_state = ParseState.IN_DOUBLE_QUOTE - continue # end of a double-quoted string elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"): - substr += char - i += 1 parse_state = ParseState.DEFAULT - continue substr += char i += 1 From 5e0631d028c43697d5c367bc391ca54e38164298 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Fri, 9 Aug 2024 03:13:36 +0200 Subject: [PATCH 27/32] collect chars in a list --- pandas/core/computation/parsing.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 4353e046db9ec..9b58bb7bf9e7a 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -203,7 +203,7 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: The second is the actual substring. """ substrings = [] - substr = "" + substr = [] # collect in a list, join into a string before adding to substrings i = 0 parse_state = ParseState.DEFAULT while i < len(s): @@ -214,24 +214,28 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: # start of a backtick-quoted string if parse_state == ParseState.DEFAULT: if substr: - substrings.append((False, substr)) - substr = char + substrings.append((False, "".join(substr))) + + substr = [char] i += 1 parse_state = ParseState.IN_BACKTICK continue + elif parse_state == ParseState.IN_BACKTICK: # escaped backtick inside a backtick-quoted string next_char = s[i + 1] if (i != len(s) - 1) else None if next_char == "`": - substr += char + next_char + substr.append(char) + substr.append(next_char) i += 2 continue + # end of the backtick-quoted string else: - substr += char - substrings.append((True, substr)) + substr.append(char) + substrings.append((True, "".join(substr))) - substr = "" + substr = [] i += 1 parse_state = ParseState.DEFAULT continue @@ -249,11 +253,11 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: # end of a double-quoted string elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"): parse_state = ParseState.DEFAULT - substr += char + substr.append(char) i += 1 if substr: - substrings.append((False, substr)) + substrings.append((False, "".join(substr))) return substrings From d3669c7974ddb5c24e694d8fc4bfeb8e08aad357 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Fri, 9 Aug 2024 03:31:07 +0200 Subject: [PATCH 28/32] add issue 49633 to whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b5f7ebc26c2bb..452e4a9bfece8 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -647,7 +647,7 @@ Other - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`) -- Bug in :meth:`DataFrame.query` which raised a ``KeyError`` when the expression contained column names with characters like ``#``. (:issue:`59285`) +- Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) From 87ded7c3d3130b6c432044b32ebaaf51f3515605 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sat, 10 Aug 2024 15:17:56 +0200 Subject: [PATCH 29/32] atone for my typing sins :) --- pandas/core/computation/parsing.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 9b58bb7bf9e7a..b35dda87bbfb4 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -135,7 +135,9 @@ def clean_column_name(name: Hashable) -> Hashable: which is not caught and propagates to the user level. """ try: - name = name.replace("`", "``") # Escape backticks + # Escape backticks + name = name.replace("`", "``") if isinstance(name, str) else name + tokenized = tokenize_string(f"`{name}`") tokval = next(tokenized)[1] return create_valid_python_identifier(tokval) @@ -203,7 +205,7 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: The second is the actual substring. """ substrings = [] - substr = [] # collect in a list, join into a string before adding to substrings + substr: list[str] = [] # join into a string before adding to `substrings` i = 0 parse_state = ParseState.DEFAULT while i < len(s): From ad18c87c6b5ee9243cf4f2a674dc1850dba42233 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Sat, 10 Aug 2024 15:23:22 +0200 Subject: [PATCH 30/32] exclude test_query_eval.py for rst-inline-touching-normal in .pre-commit-config.yaml --- .pre-commit-config.yaml | 1 + pandas/core/computation/parsing.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b81b9ba070a44..882be47c47ee7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -85,6 +85,7 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - id: rst-inline-touching-normal + exclude: ^pandas/tests/frame/test_query_eval.py types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index b35dda87bbfb4..35a6d1c6ad269 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -205,7 +205,7 @@ def _split_by_backtick(s: str) -> list[tuple[bool, str]]: The second is the actual substring. """ substrings = [] - substr: list[str] = [] # join into a string before adding to `substrings` + substr: list[str] = [] # Will join into a string before adding to `substrings` i = 0 parse_state = ParseState.DEFAULT while i < len(s): From 173f3996079c8226c6779e2560fd66f0e5463f24 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Mon, 12 Aug 2024 15:36:45 +0200 Subject: [PATCH 31/32] tests: add decorators for Future Infer Strings job --- pandas/tests/frame/test_query_eval.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 0a83c8113e814..fa71153d01157 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1400,6 +1400,7 @@ def test_expr_with_column_name_with_backtick(self): expected = df[df["a`b"] < 2] tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_string_with_backticks(self): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) @@ -1407,6 +1408,7 @@ def test_expr_with_string_with_backticks(self): expected = df["```" < df["#backticks"]] tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_string_with_backticked_substring_same_as_column_name(self): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) @@ -1437,6 +1439,7 @@ def test_expr_with_column_names_with_special_characters(self, col1, col2, expr): expected = df[df[col1] < df[col2]] tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_no_backticks(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"]) @@ -1480,6 +1483,7 @@ def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self): ): df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) @@ -1487,6 +1491,7 @@ def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self expected = df[df["column-name"] < 'It`s that\'s "quote" #hash'] tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) From 9ee2231edc25893293645a64a4e25e47715ab201 Mon Sep 17 00:00:00 2001 From: aram-cinnamon Date: Tue, 13 Aug 2024 02:41:52 +0200 Subject: [PATCH 32/32] pre-commit exclude --- .pre-commit-config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 882be47c47ee7..f6717dd503c9b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,6 +23,7 @@ repos: hooks: - id: ruff args: [--exit-non-zero-on-fix] + exclude: ^pandas/tests/frame/test_query_eval.py - id: ruff # TODO: remove autofixe-only rules when they are checked by ruff name: ruff-selected-autofixes @@ -31,7 +32,7 @@ repos: exclude: ^pandas/tests args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] - id: ruff-format - exclude: ^scripts + exclude: ^scripts|^pandas/tests/frame/test_query_eval.py - repo: https://github.com/jendrikseipp/vulture rev: 'v2.11' hooks: