From 3fc1bdd57544fc7241ae73dd2832eb958747efc9 Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Sat, 4 Jan 2020 17:41:46 +0100 Subject: [PATCH] ENH: Add ability to use special characters for column names in query function. Clean up in the code that is used for using spaces in the query function and extending the ability to also use special characters that are not allowed in python identifiers. All files related to this functionality are now in the pandas/core/computation/parsing.py file. --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/computation/common.py | 14 -- pandas/core/computation/eval.py | 3 +- pandas/core/computation/expr.py | 72 +--------- pandas/core/computation/parsing.py | 190 ++++++++++++++++++++++++++ pandas/core/frame.py | 48 ++++++- pandas/core/generic.py | 27 ++-- pandas/tests/frame/test_query_eval.py | 80 ++++++++++- 8 files changed, 335 insertions(+), 100 deletions(-) create mode 100644 pandas/core/computation/parsing.py diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b9cc1dad53674..014bd22aa2dab 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1016,6 +1016,7 @@ Other - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`) - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`) +- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used to use invalid identifiers like names that start with a digit, are python keywords, or are using single character operators. 
(:issue:`27017`) - Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`) - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`) - Fix :class:`AbstractHolidayCalendar` to return correct results for diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 994f470942cd1..19a8898a2987c 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -4,9 +4,6 @@ from pandas._config import get_option -# A token value Python's tokenizer probably will never use. -_BACKTICK_QUOTED_STRING = 100 - def _ensure_decoded(s): """ @@ -29,16 +26,5 @@ def result_type_many(*arrays_and_dtypes): return reduce(np.result_type, arrays_and_dtypes) -def _remove_spaces_column_name(name): - """ - Check if name contains any spaces, if it contains any spaces - the spaces will be removed and an underscore suffix is added. - """ - if not isinstance(name, str) or " " not in name: - return name - - return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING" - - class NameResolutionError(NameError): pass diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 7599a82ddffed..5c320042721dc 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -12,7 +12,8 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.computation.engines import _engines -from pandas.core.computation.expr import Expr, _parsers, tokenize_string +from pandas.core.computation.expr import Expr, _parsers +from pandas.core.computation.parsing import tokenize_string from pandas.core.computation.scope import ensure_scope from pandas.io.formats.printing import pprint_thing diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 9b422b28c3c27..1350587b5ca90 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py 
@@ -3,19 +3,13 @@ import ast from functools import partial, reduce -from io import StringIO -import itertools as it -import operator +from keyword import iskeyword import tokenize from typing import Optional, Type import numpy as np import pandas.core.common as com -from pandas.core.computation.common import ( - _BACKTICK_QUOTED_STRING, - _remove_spaces_column_name, -) from pandas.core.computation.ops import ( _LOCAL_TAG, BinOp, @@ -34,38 +28,12 @@ _unary_ops_syms, is_term, ) +from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string from pandas.core.computation.scope import Scope import pandas.io.formats.printing as printing -def tokenize_string(source: str): - """ - Tokenize a Python source code string. - - Parameters - ---------- - source : str - A Python source code string - """ - line_reader = StringIO(source).readline - token_generator = tokenize.generate_tokens(line_reader) - - # Loop over all tokens till a backtick (`) is found. - # Then, take all tokens till the next backtick to form a backtick quoted - # string. - for toknum, tokval, _, _, _ in token_generator: - if tokval == "`": - tokval = " ".join( - it.takewhile( - lambda tokval: tokval != "`", - map(operator.itemgetter(1), token_generator), - ) - ) - toknum = _BACKTICK_QUOTED_STRING - yield toknum, tokval - - def _rewrite_assign(tok): """Rewrite the assignment operator for PyTables expressions that use ``=`` as a substitute for ``==``. @@ -133,31 +101,6 @@ def _replace_locals(tok): return toknum, tokval -def _clean_spaces_backtick_quoted_names(tok): - """Clean up a column name if surrounded by backticks. - - Backtick quoted string are indicated by a certain tokval value. If a string - is a backtick quoted token it will processed by - :func:`_remove_spaces_column_name` so that the parser can find this - string when the query is executed. - See also :meth:`NDFrame._get_space_character_free_column_resolver`. 
- - Parameters - ---------- - tok : tuple of int, str - ints correspond to the all caps constants in the tokenize module - - Returns - ------- - t : tuple of int, str - Either the input or token or the replacement values - """ - toknum, tokval = tok - if toknum == _BACKTICK_QUOTED_STRING: - return tokenize.NAME, _remove_spaces_column_name(tokval) - return toknum, tokval - - def _compose2(f, g): """Compose 2 callables""" return lambda *args, **kwargs: f(g(*args, **kwargs)) @@ -172,10 +115,7 @@ def _compose(*funcs): def _preparse( source: str, f=_compose( - _replace_locals, - _replace_booleans, - _rewrite_assign, - _clean_spaces_backtick_quoted_names, + _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks ), ): """Compose a collection of tokenization functions @@ -426,8 +366,6 @@ def visit(self, node, **kwargs): try: node = ast.fix_missing_locations(ast.parse(clean)) except SyntaxError as e: - from keyword import iskeyword - if any(iskeyword(x) for x in clean.split()): e.msg = "Python keyword not valid identifier in numexpr query" raise e @@ -781,9 +719,7 @@ def __init__( parser, preparser=partial( _preparse, - f=_compose( - _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names - ), + f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks), ), ): super().__init__(env, engine, parser, preparser) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py new file mode 100644 index 0000000000000..ce213c8532834 --- /dev/null +++ b/pandas/core/computation/parsing.py @@ -0,0 +1,190 @@ +""":func:`~pandas.eval` source string parsing functions +""" + +from io import StringIO +from keyword import iskeyword +import token +import tokenize +from typing import Iterator, Tuple + +# A token value Python's tokenizer probably will never use. +BACKTICK_QUOTED_STRING = 100 + + +def create_valid_python_identifier(name: str) -> str: + """ + Create valid Python identifiers from any string. 
+ + Check if name contains any special characters. If it contains any + special characters, the special characters will be replaced by + a special string and a prefix is added. + + Raises + ------ + SyntaxError + If the returned name is not a Python valid identifier, raise an exception. + This can happen if there is a hashtag in the name, as the tokenizer will + then terminate and not find the backtick. + But also for characters that fall out of the range of (U+0001..U+007F). + """ + if name.isidentifier() and not iskeyword(name): + return name + + # Create a dict with the special characters and their replacement string. + # EXACT_TOKEN_TYPES contains these special characters + # token.tok_name contains a readable description of the replacement string. + special_characters_replacements = { + char: f"_{token.tok_name[tokval]}_" + # The ignore here is because of a bug in mypy that is resolved in 0.740 + for char, tokval in tokenize.EXACT_TOKEN_TYPES.items() # type: ignore + } + special_characters_replacements.update( + { + " ": "_", + "?": "_QUESTIONMARK_", + "!": "_EXCLAMATIONMARK_", + "$": "_DOLLARSIGN_", + "€": "_EUROSIGN_", + # Including quotes works, but there are exceptions. + "'": "_SINGLEQUOTE_", + '"': "_DOUBLEQUOTE_", + # Currently not possible. Terminates parser and won't find backtick. + # "#": "_HASH_", + } + ) + + name = "".join(special_characters_replacements.get(char, char) for char in name) + name = "BACKTICK_QUOTED_STRING_" + name + + if not name.isidentifier(): + raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.") + + return name + + +def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]: + """ + Clean up a column name if surrounded by backticks. + + Backtick quoted strings are indicated by a certain tokval value. If a string + is a backtick quoted token it will be processed by + :func:`_create_valid_python_identifier` so that the parser can find this + string when the query is executed.
+ In this case the tok will get the NAME tokval. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tok : Tuple[int, str] + Either the input token or the replacement values + """ + toknum, tokval = tok + if toknum == BACKTICK_QUOTED_STRING: + return tokenize.NAME, create_valid_python_identifier(tokval) + return toknum, tokval + + +def clean_column_name(name: str) -> str: + """ + Function to emulate the cleaning of a backtick quoted name. + + The purpose for this function is to see what happens to the name of + identifier if it goes to the process of being parsed as Python code + inside a backtick quoted string and then being cleaned + (removed of any special characters). + + Parameters + ---------- + name : str + Name to be cleaned. + + Returns + ------- + name : str + Returns the name after tokenizing and cleaning. + + Notes + ----- + For some cases, a name cannot be converted to a valid Python identifier. + In that case :func:`tokenize_string` raises a SyntaxError. + In that case, we just return the name unmodified. + + If this name was used in the query string (this makes the query call impossible) + an error will be raised by :func:`tokenize_backtick_quoted_string` instead, + which is not caught and propagates to the user level. + """ + try: + tokenized = tokenize_string(f"`{name}`") + tokval = next(tokenized)[1] + return create_valid_python_identifier(tokval) + except SyntaxError: + return name + + +def tokenize_backtick_quoted_string( + token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int +) -> Tuple[int, str]: + """ + Creates a token from a backtick quoted string. + + Moves the token_generator forwards till right after the next backtick. + + Parameters + ---------- + token_generator : Iterator[tokenize.TokenInfo] + The generator that yields the tokens of the source string (Tuple[int, str]).
+ The generator is at the first token after the backtick (`) + + source : str + The Python source code string. + + string_start : int + This is the start of backtick quoted string inside the source string. + + Returns + ------- + tok: Tuple[int, str] + The token that represents the backtick quoted string. + The integer is equal to BACKTICK_QUOTED_STRING (100). + """ + for _, tokval, start, _, _ in token_generator: + if tokval == "`": + string_end = start[1] + break + + return BACKTICK_QUOTED_STRING, source[string_start:string_end] + + +def tokenize_string(source: str) -> Iterator[Tuple[int, str]]: + """ + Tokenize a Python source code string. + + Parameters + ---------- + source : str + The Python source code string. + + Returns + ------- + tok_generator : Iterator[Tuple[int, str]] + An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]). + """ + line_reader = StringIO(source).readline + token_generator = tokenize.generate_tokens(line_reader) + + # Loop over all tokens till a backtick (`) is found. + # Then, take all tokens till the next backtick to form a backtick quoted string + for toknum, tokval, start, _, _ in token_generator: + if tokval == "`": + try: + yield tokenize_backtick_quoted_string( + token_generator, source, string_start=start[1] + 1 + ) + except Exception: + raise SyntaxError(f"Failed to parse backticks in '{source}'.") + else: + yield toknum, tokval diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 65b315167bd58..97b218878f4cc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3066,18 +3066,27 @@ def query(self, expr, inplace=False, **kwargs): Parameters ---------- expr : str - The query string to evaluate. You can refer to variables + The query string to evaluate. + + You can refer to variables in the environment by prefixing them with an '@' character like ``@a + b``. - .. versionadded:: 0.25.0 - - You can refer to column names that contain spaces by surrounding - them in backticks.
+ You can refer to column names that contain spaces or operators by + surrounding them in backticks. This way you can also escape + names that start with a digit, or those that are a Python keyword. + Basically when it is not valid Python identifier. See notes down + for more details. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. + .. versionadded:: 0.25.0 + Backtick quoting introduced. + + .. versionadded:: 1.0.0 + Expanding functionality of backtick quoting for more than only spaces. + inplace : bool Whether the query should modify the data in place or return a modified copy. @@ -3132,6 +3141,32 @@ def query(self, expr, inplace=False, **kwargs): For further details and examples see the ``query`` documentation in :ref:`indexing `. + *Backtick quoted variables* + + Backtick quoted variables are parsed as literal Python code and + are converted internally to a Python valid identifier. + This can lead to the following problems. + + During parsing a number of disallowed characters inside the backtick + quoted string are replaced by strings that are allowed as a Python identifier. + These characters include all operators in Python, the space character, the + question mark, the exclamation mark, the dollar sign, and the euro sign. + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. 
+ + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) + in combination with the source code in :mod:`pandas.core.computation.parsing`. + Examples -------- >>> df = pd.DataFrame({'A': range(1, 6), @@ -3281,11 +3316,12 @@ def eval(self, expr, inplace=False, **kwargs): kwargs["level"] = kwargs.pop("level", 0) + 1 if resolvers is None: index_resolvers = self._get_index_resolvers() - column_resolvers = self._get_space_character_free_column_resolvers() + column_resolvers = self._get_cleaned_column_resolvers() resolvers = column_resolvers, index_resolvers if "target" not in kwargs: kwargs["target"] = self kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers) + return _eval(expr, inplace=inplace, **kwargs) def select_dtypes(self, include=None, exclude=None): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f5b0ce1ae77fb..21a22322daece 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -424,7 +424,7 @@ def _get_block_manager_axis(cls, axis): return m - axis return axis - def _get_axis_resolvers(self, axis): + def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: # index or columns axis_index = getattr(self, axis) d = dict() @@ -454,22 +454,29 @@ def _get_axis_resolvers(self, axis): d[axis] = dindex return d - def _get_index_resolvers(self): - d = {} + def _get_index_resolvers(self) -> Dict[str, ABCSeries]: + from pandas.core.computation.parsing import clean_column_name + + d: Dict[str, ABCSeries] = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) - return d - def _get_space_character_free_column_resolvers(self): - """Return the space character free column resolvers of a dataframe. + return {clean_column_name(k): v for k, v in d.items() if k is not int} - Column names with spaces are 'cleaned up' so that they can be referred - to by backtick quoting. 
+ def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]: + """ + Return the special character free column resolvers of a dataframe. + + Column names with special characters are 'cleaned up' so that they can + be referred to by backtick quoting. Used in :meth:`DataFrame.eval`. """ - from pandas.core.computation.common import _remove_spaces_column_name + from pandas.core.computation.parsing import clean_column_name + + if isinstance(self, ABCSeries): + return {clean_column_name(self.name): self} - return {_remove_spaces_column_name(k): v for k, v in self.items()} + return {clean_column_name(k): v for k, v in self.items() if k is not int} @property def _info_axis(self): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 9cd26160ec877..578487ea3f54c 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1048,13 +1048,34 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op): class TestDataFrameQueryBacktickQuoting: @pytest.fixture(scope="class") def df(self): + """ + Yields a dataframe with strings that may or may not need escaping + by backticks. The last two columns cannot be escaped by backticks + and should raise a ValueError. 
+ """ yield DataFrame( { "A": [1, 2, 3], "B B": [3, 2, 1], "C C": [4, 5, 6], + "C C": [7, 4, 3], "C_C": [8, 9, 10], "D_D D": [11, 1, 101], + "E.E": [6, 3, 5], + "F-F": [8, 1, 10], + "1e1": [2, 4, 8], + "def": [10, 11, 2], + "A (x)": [4, 1, 3], + "B(x)": [1, 1, 5], + "B (x)": [2, 7, 4], + " &^ :!€$?(} > <++*'' ": [2, 5, 6], + "": [10, 11, 1], + " A": [4, 7, 9], + " ": [1, 2, 1], + "it's": [6, 3, 1], + "that's": [9, 1, 8], + "☺": [8, 7, 6], + "foo#bar": [2, 4, 5], } ) @@ -1093,7 +1114,64 @@ def test_mixed_underscores_and_spaces(self, df): expect = df["A"] + df["D_D D"] tm.assert_series_equal(res, expect) - def backtick_quote_name_with_no_spaces(self, df): + def test_backtick_quote_name_with_no_spaces(self, df): res = df.eval("A + `C_C`") expect = df["A"] + df["C_C"] tm.assert_series_equal(res, expect) + + def test_special_characters(self, df): + res = df.eval("`E.E` + `F-F` - A") + expect = df["E.E"] + df["F-F"] - df["A"] + tm.assert_series_equal(res, expect) + + def test_start_with_digit(self, df): + res = df.eval("A + `1e1`") + expect = df["A"] + df["1e1"] + tm.assert_series_equal(res, expect) + + def test_keyword(self, df): + res = df.eval("A + `def`") + expect = df["A"] + df["def"] + tm.assert_series_equal(res, expect) + + def test_unneeded_quoting(self, df): + res = df.query("`A` > 2") + expect = df[df["A"] > 2] + tm.assert_frame_equal(res, expect) + + def test_parenthesis(self, df): + res = df.query("`A (x)` > 2") + expect = df[df["A (x)"] > 2] + tm.assert_frame_equal(res, expect) + + def test_empty_string(self, df): + res = df.query("`` > 5") + expect = df[df[""] > 5] + tm.assert_frame_equal(res, expect) + + def test_multiple_spaces(self, df): + res = df.query("`C C` > 5") + expect = df[df["C C"] > 5] + tm.assert_frame_equal(res, expect) + + def test_start_with_spaces(self, df): + res = df.eval("` A` + ` `") + expect = df[" A"] + df[" "] + tm.assert_series_equal(res, expect) + + def test_lots_of_operators_string(self, df): + res = df.query("` &^ :!€$?(} > 
<++*'' ` > 4") + expect = df[df[" &^ :!€$?(} > <++*'' "] > 4] + tm.assert_frame_equal(res, expect) + + def test_failing_quote(self, df): + with pytest.raises(SyntaxError): + df.query("`it's` > `that's`") + + def test_failing_character_outside_range(self, df): + with pytest.raises(SyntaxError): + df.query("`☺` > 4") + + def test_failing_hashtag(self, df): + with pytest.raises(SyntaxError): + df.query("`foo#bar` > 4")