-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
/
Copy pathparsing.py
199 lines (162 loc) · 6.25 KB
/
parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
"""
:func:`~pandas.eval` source string parsing functions
"""
from __future__ import annotations
from io import StringIO
from keyword import iskeyword
import token
import tokenize
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
)
# Token type number used in this module to tag a backtick quoted string.
# It is produced by ``tokenize_backtick_quoted_string`` and recognised by
# ``clean_backtick_quoted_toks``.  The value is one that Python's own
# tokenizer will probably never use.
BACKTICK_QUOTED_STRING = 100
def create_valid_python_identifier(name: str) -> str:
    """
    Turn an arbitrary string into a valid Python identifier.

    A name that already is a valid identifier (and is not a keyword) is
    returned untouched. Otherwise every character with a known replacement
    is swapped for a descriptive string and the result is prefixed with
    ``BACKTICK_QUOTED_STRING_``.

    Parameters
    ----------
    name : str
        The string to convert.

    Returns
    -------
    str
        ``name`` itself, or the prefixed and cleaned version of it.

    Raises
    ------
    SyntaxError
        If the cleaned name is still not a valid Python identifier. This
        happens when the name contains a character that has no replacement
        and is not allowed in an identifier, for instance a hashtag.
    """
    already_valid = name.isidentifier() and not iskeyword(name)
    if already_valid:
        return name

    # Operators and delimiters known to the tokenizer, each mapped to the
    # readable token name from ``token.tok_name`` wrapped in underscores.
    exact_token_names = {
        symbol: f"_{token.tok_name[code]}_"
        for symbol, code in tokenize.EXACT_TOKEN_TYPES.items()
    }
    # Hand-picked characters; these take precedence over the entries above.
    extra_names = {
        " ": "_",
        "?": "_QUESTIONMARK_",
        "!": "_EXCLAMATIONMARK_",
        "$": "_DOLLARSIGN_",
        "€": "_EUROSIGN_",
        "°": "_DEGREESIGN_",
        # Including quotes works, but there are exceptions.
        "'": "_SINGLEQUOTE_",
        '"': "_DOUBLEQUOTE_",
        # No entry for "#": it terminates the parser, which then never
        # finds the closing backtick.
    }
    replacements = {**exact_token_names, **extra_names}

    cleaned = "".join(replacements.get(char, char) for char in name)
    candidate = f"BACKTICK_QUOTED_STRING_{cleaned}"

    if candidate.isidentifier():
        return candidate
    raise SyntaxError(
        f"Could not convert '{candidate}' to a valid Python identifier."
    )
def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Convert a backtick quoted token into a regular NAME token.

    A token whose number equals ``BACKTICK_QUOTED_STRING`` has its value
    passed through :func:`create_valid_python_identifier` and is re-tagged
    as ``tokenize.NAME``, so that the parser can find this string when the
    query is executed. Any other token is passed through as is.

    Parameters
    ----------
    tok : tuple of int, str
        Token number and token value. The numbers correspond to the all
        caps constants in the tokenize module.

    Returns
    -------
    tok : Tuple[int, str]
        Either the input token or its replacement values.
    """
    kind, text = tok
    if kind != BACKTICK_QUOTED_STRING:
        return kind, text
    return tokenize.NAME, create_valid_python_identifier(text)
def clean_column_name(name: Hashable) -> Hashable:
    """
    Emulate the cleaning that a backtick quoted name goes through.

    The name is wrapped in backticks, tokenized with
    :func:`tokenize_string`, and the value of the first resulting token is
    converted with :func:`create_valid_python_identifier`. This shows what
    the name looks like after being parsed as a backtick quoted string and
    stripped of special characters.

    Parameters
    ----------
    name : hashable
        Name to be cleaned.

    Returns
    -------
    name : hashable
        The name after tokenizing and cleaning, or the unmodified input if
        a SyntaxError was raised along the way.

    Notes
    -----
    Some names cannot be converted to a valid Python identifier. The
    resulting SyntaxError is swallowed here and the name is returned
    unmodified.
    """
    try:
        # Only the first token is needed: it holds the backtick quoted text.
        first_token = next(tokenize_string(f"`{name}`"))
        cleaned = create_valid_python_identifier(first_token[1])
    except SyntaxError:
        return name
    return cleaned
def tokenize_backtick_quoted_string(
    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> tuple[int, str]:
    """
    Create a token from a backtick quoted string.

    Moves the token_generator forwards till right after the next backtick.

    Parameters
    ----------
    token_generator : Iterator[tokenize.TokenInfo]
        The generator that yields the tokens of the source string. Each item
        is unpacked as a 5-tuple whose second field is the token value and
        whose third field is the start position.
        The generator is at the first token after the backtick (`)

    source : str
        The Python source code string.

    string_start : int
        This is the start of backtick quoted string inside the source string.

    Returns
    -------
    tok: Tuple[int, str]
        The token that represents the backtick quoted string.
        The integer is equal to BACKTICK_QUOTED_STRING (100).

    Raises
    ------
    SyntaxError
        If the generator is exhausted before a closing backtick is found.
    """
    for _, tokval, start, _, _ in token_generator:
        if tokval == "`":
            # Only the column of the closing backtick is used, so the quoted
            # text is sliced straight out of ``source``.
            # NOTE(review): this presumably assumes a single-line source.
            return BACKTICK_QUOTED_STRING, source[string_start : start[1]]

    # No closing backtick: fail explicitly instead of relying on an
    # unbound local variable to signal the problem.
    raise SyntaxError(f"Missing closing backtick in '{source}'.")
def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
    """
    Tokenize a Python source code string.

    Ordinary tokens are yielded unchanged. When a backtick (`) is found,
    everything up to the next backtick is combined into a single backtick
    quoted string token by :func:`tokenize_backtick_quoted_string`.

    Parameters
    ----------
    source : str
        The Python source code string.

    Returns
    -------
    tok_generator : Iterator[Tuple[int, str]]
        An iterator yielding all tokens with only toknum and tokval
        (Tuple[int, str]).

    Raises
    ------
    SyntaxError
        If handling a backtick quoted string fails for any reason.
    """
    tokens = tokenize.generate_tokens(StringIO(source).readline)

    for kind, text, begin, _, _ in tokens:
        if text != "`":
            yield kind, text
            continue

        # Opening backtick: the quoted text begins one column after it.
        # The helper consumes ``tokens`` up to the closing backtick.
        try:
            yield tokenize_backtick_quoted_string(
                tokens, source, string_start=begin[1] + 1
            )
        except Exception as err:
            raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err