
Add function to clean up column names with special characters #28215

Merged
merged 1 commit on Jan 4, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
@@ -1016,6 +1016,7 @@ Other
- Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
- :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
- Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used with invalid identifiers, such as names that start with a digit, are Python keywords, or contain single-character operators (:issue:`27017`)
- Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`)
- Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
- Fix :class:`AbstractHolidayCalendar` to return correct results for
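A short, hedged illustration of the new backtick behavior described in the entry above (the frame and column names are made up for this example):

import pandas as pd

df = pd.DataFrame({"1st place": [1, 2], "class": [3, 4], "a+b": [5, 6]})
# Names that start with a digit, are Python keywords, or contain
# single-character operators can now all be backtick-quoted:
df.query("`1st place` == 1 and `class` == 3 and `a+b` == 5")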
14 changes: 0 additions & 14 deletions pandas/core/computation/common.py
@@ -4,9 +4,6 @@

from pandas._config import get_option

# A token value Python's tokenizer probably will never use.
_BACKTICK_QUOTED_STRING = 100


def _ensure_decoded(s):
"""
@@ -29,16 +26,5 @@ def result_type_many(*arrays_and_dtypes):
return reduce(np.result_type, arrays_and_dtypes)


def _remove_spaces_column_name(name):
"""
Check if name contains any spaces, if it contains any spaces
the spaces will be removed and an underscore suffix is added.
"""
if not isinstance(name, str) or " " not in name:
return name

return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"


class NameResolutionError(NameError):
pass
3 changes: 2 additions & 1 deletion pandas/core/computation/eval.py
@@ -12,7 +12,8 @@
from pandas.util._validators import validate_bool_kwarg

from pandas.core.computation.engines import _engines
from pandas.core.computation.expr import Expr, _parsers, tokenize_string
from pandas.core.computation.expr import Expr, _parsers
from pandas.core.computation.parsing import tokenize_string
from pandas.core.computation.scope import ensure_scope

from pandas.io.formats.printing import pprint_thing
72 changes: 4 additions & 68 deletions pandas/core/computation/expr.py
@@ -3,19 +3,13 @@

import ast
from functools import partial, reduce
from io import StringIO
import itertools as it
import operator
from keyword import iskeyword
import tokenize
from typing import Optional, Type

import numpy as np

import pandas.core.common as com
from pandas.core.computation.common import (
_BACKTICK_QUOTED_STRING,
_remove_spaces_column_name,
)
from pandas.core.computation.ops import (
_LOCAL_TAG,
BinOp,
@@ -34,38 +28,12 @@
_unary_ops_syms,
is_term,
)
from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string
from pandas.core.computation.scope import Scope

import pandas.io.formats.printing as printing


def tokenize_string(source: str):
"""
Tokenize a Python source code string.

Parameters
----------
source : str
A Python source code string
"""
line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)

# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted
# string.
for toknum, tokval, _, _, _ in token_generator:
if tokval == "`":
tokval = " ".join(
it.takewhile(
lambda tokval: tokval != "`",
map(operator.itemgetter(1), token_generator),
)
)
toknum = _BACKTICK_QUOTED_STRING
yield toknum, tokval


def _rewrite_assign(tok):
"""Rewrite the assignment operator for PyTables expressions that use ``=``
as a substitute for ``==``.
@@ -133,31 +101,6 @@ def _replace_locals(tok):
return toknum, tokval


def _clean_spaces_backtick_quoted_names(tok):
"""Clean up a column name if surrounded by backticks.

Backtick quoted string are indicated by a certain tokval value. If a string
is a backtick quoted token it will processed by
:func:`_remove_spaces_column_name` so that the parser can find this
string when the query is executed.
See also :meth:`NDFrame._get_space_character_free_column_resolver`.

Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module

Returns
-------
t : tuple of int, str
Either the input or token or the replacement values
"""
toknum, tokval = tok
if toknum == _BACKTICK_QUOTED_STRING:
return tokenize.NAME, _remove_spaces_column_name(tokval)
return toknum, tokval


def _compose2(f, g):
"""Compose 2 callables"""
return lambda *args, **kwargs: f(g(*args, **kwargs))
@@ -172,10 +115,7 @@ def _compose(*funcs):
def _preparse(
source: str,
f=_compose(
_replace_locals,
_replace_booleans,
_rewrite_assign,
_clean_spaces_backtick_quoted_names,
_replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
),
):
"""Compose a collection of tokenization functions
@@ -426,8 +366,6 @@ def visit(self, node, **kwargs):
try:
node = ast.fix_missing_locations(ast.parse(clean))
except SyntaxError as e:
from keyword import iskeyword

if any(iskeyword(x) for x in clean.split()):
e.msg = "Python keyword not valid identifier in numexpr query"
raise e
@@ -781,9 +719,7 @@ def __init__(
parser,
preparser=partial(
_preparse,
f=_compose(
_replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names
),
f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
),
):
super().__init__(env, engine, parser, preparser)
190 changes: 190 additions & 0 deletions pandas/core/computation/parsing.py
@@ -0,0 +1,190 @@
""":func:`~pandas.eval` source string parsing functions
"""

from io import StringIO
from keyword import iskeyword
import token
import tokenize
from typing import Iterator, Tuple

# A token value Python's tokenizer probably will never use.
BACKTICK_QUOTED_STRING = 100


def create_valid_python_identifier(name: str) -> str:
"""
Create valid Python identifiers from any string.

Check if name contains any special characters. If it contains any
special characters, each special character will be replaced by
a descriptive string and a prefix is added.

Raises
------
SyntaxError
If the returned name is not a valid Python identifier, raise an exception.
This can happen if there is a hashtag in the name, as the tokenizer will
then terminate and not find the backtick.
It can also happen for characters that fall outside the range (U+0001..U+007F).
"""
if name.isidentifier() and not iskeyword(name):
return name

# Create a dict with the special characters and their replacement string.
# EXACT_TOKEN_TYPES contains these special characters
# token.tok_name contains a readable description of the replacement string.
special_characters_replacements = {
char: f"_{token.tok_name[tokval]}_"
# The ignore here is because of a bug in mypy that is resolved in 0.740
for char, tokval in tokenize.EXACT_TOKEN_TYPES.items() # type: ignore
}
special_characters_replacements.update(
{
" ": "_",
"?": "_QUESTIONMARK_",
"!": "_EXCLAMATIONMARK_",
"$": "_DOLLARSIGN_",
"€": "_EUROSIGN_",
# Including quotes works, but there are exceptions.
"'": "_SINGLEQUOTE_",
'"': "_DOUBLEQUOTE_",
# Currently not possible. Terminates parser and won't find backtick.
# "#": "_HASH_",
}
)

name = "".join(special_characters_replacements.get(char, char) for char in name)
name = "BACKTICK_QUOTED_STRING_" + name

if not name.isidentifier():
raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")

return name
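
# Example (illustrative): given the replacement table above, spaces become
# underscores and "!" maps to its descriptive name:
# >>> create_valid_python_identifier("1st place!")
# 'BACKTICK_QUOTED_STRING_1st_place_EXCLAMATIONMARK_'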


def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]:
"""
Clean up a column name if surrounded by backticks.

Backtick quoted strings are indicated by a certain tokval value. If a string
is a backtick quoted token it will be processed by
:func:`create_valid_python_identifier` so that the parser can find this
string when the query is executed.
In this case the token will get the NAME toknum.

Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module

Returns
-------
tok : Tuple[int, str]
Either the input token or the replacement values.
"""
toknum, tokval = tok
if toknum == BACKTICK_QUOTED_STRING:
return tokenize.NAME, create_valid_python_identifier(tokval)
return toknum, tokval
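
# Example (illustrative): a backtick quoted token is rewritten to a NAME token;
# any other token passes through unchanged:
# >>> clean_backtick_quoted_toks((BACKTICK_QUOTED_STRING, "1st place"))
# returns (tokenize.NAME, 'BACKTICK_QUOTED_STRING_1st_place')
# >>> clean_backtick_quoted_toks((tokenize.NUMBER, "1"))
# returns (tokenize.NUMBER, '1')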


def clean_column_name(name: str) -> str:
"""
Function to emulate the cleaning of a backtick quoted name.

The purpose of this function is to see what happens to the name of
an identifier if it goes through the process of being parsed as Python code
inside a backtick quoted string and then being cleaned
(stripped of any special characters).

Parameters
----------
name : str
Name to be cleaned.

Returns
-------
name : str
Returns the name after tokenizing and cleaning.

Notes
-----
In some cases, a name cannot be converted to a valid Python identifier.
In that case :func:`tokenize_string` raises a SyntaxError,
and we just return the name unmodified.

If such a name is used in the query string (which makes the query call impossible),
an error will be raised by :func:`tokenize_backtick_quoted_string` instead,
which is not caught and propagates to the user level.
"""
try:
tokenized = tokenize_string(f"`{name}`")
tokval = next(tokenized)[1]
return create_valid_python_identifier(tokval)
except SyntaxError:
return name
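
# Example (illustrative): round-tripping a column name through the tokenizer:
# >>> clean_column_name("a+b")
# 'BACKTICK_QUOTED_STRING_a_PLUS_b'
# A name that cannot be tokenized (e.g. one containing "#") is returned unchanged.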


def tokenize_backtick_quoted_string(
token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> Tuple[int, str]:
"""
[Contributor review comment: can you add Parameters / Returns here]
Creates a token from a backtick quoted string.

Moves the token_generator forward until right after the next backtick.

Parameters
----------
token_generator : Iterator[tokenize.TokenInfo]
The generator that yields the tokens of the source string (Tuple[int, str]).
The generator is positioned at the first token after the opening backtick (`).

source : str
The Python source code string.

string_start : int
This is the start of the backtick quoted string inside the source string.

Returns
-------
tok : Tuple[int, str]
The token that represents the backtick quoted string.
The integer is equal to BACKTICK_QUOTED_STRING (100).
"""
for _, tokval, start, _, _ in token_generator:
if tokval == "`":
string_end = start[1]
break

return BACKTICK_QUOTED_STRING, source[string_start:string_end]
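
# Example (illustrative): for source = "`a b` + 1" with the generator positioned
# just after the opening backtick (string_start=1), the closing backtick is found
# at column 4, so the function returns (BACKTICK_QUOTED_STRING, 'a b').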


def tokenize_string(source: str) -> Iterator[Tuple[int, str]]:
"""
Tokenize a Python source code string.

Parameters
----------
source : str
The Python source code string.

Returns
-------
tok_generator : Iterator[Tuple[int, str]]
An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
"""
[Contributor review comment: can you add Returns]

line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)

# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted string
for toknum, tokval, start, _, _ in token_generator:
if tokval == "`":
try:
yield tokenize_backtick_quoted_string(
token_generator, source, string_start=start[1] + 1
)
except Exception:
raise SyntaxError(f"Failed to parse backticks in '{source}'.")
else:
yield toknum, tokval
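
# Example (illustrative): tokenizing a query containing a backtick quoted name:
# >>> list(tokenize_string("`a b` + 1"))
# yields (BACKTICK_QUOTED_STRING, 'a b') first, followed by the regular
# toknum/tokval pairs for '+', '1', NEWLINE and ENDMARKER.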