This repository was archived by the owner on Mar 26, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 65
/
Copy pathcommon.py
124 lines (107 loc) · 3.76 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from __future__ import absolute_import, unicode_literals
import re
import sys
try:
from urllib.parse import quote
except ImportError:
from urllib import quote
# Select an HTML-entity decoder suited to the running interpreter and expose
# it under the single module-level name ``HTMLunescape``.
if sys.version_info >= (3, 0):
    if sys.version_info >= (3, 4):
        # html.unescape() is the public API from Python 3.4 onwards.
        # HTMLParser.unescape() was deprecated in 3.4 and removed in 3.9,
        # so relying on it makes this module fail at import time on
        # current interpreters.
        import html
        HTMLunescape = html.unescape
    else:
        # Python 3.0-3.3: fall back to the package's own implementation.
        from .entitytrans import _unescape
        HTMLunescape = _unescape
else:
    # Python 2: use the package's own implementation.
    from commonmark import entitytrans
    HTMLunescape = entitytrans._unescape
# Regular-expression source fragments and the compiled patterns built from
# them. The fragments are plain strings so they can be concatenated into
# larger patterns below.

# An entity reference: "&#x" + 1-8 hex digits, "&#" + 1-8 decimal digits, or
# "&" + a letter followed by 1-31 alphanumerics; always terminated by ";".
# Only lowercase is listed here; the compiled patterns that embed this
# fragment pass re.IGNORECASE.
ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});'
# Tag name: a letter followed by letters, digits or hyphens.
TAGNAME = '[A-Za-z][A-Za-z0-9-]*'
# Attribute name: starts with a letter, "_" or ":"; may continue with
# letters, digits, ":", ".", "_" or "-".
ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*'
# Unquoted attribute value: one or more characters excluding quotes, "=",
# "<", ">", backtick and the code points \x00-\x20.
UNQUOTEDVALUE = "[^\"'=<>`\\x00-\\x20]+"
# Quoted attribute values: anything up to the matching quote character.
SINGLEQUOTEDVALUE = "'[^']*'"
DOUBLEQUOTEDVALUE = '"[^"]*"'
# Any of the three attribute value forms above.
ATTRIBUTEVALUE = "(?:" + UNQUOTEDVALUE + "|" + SINGLEQUOTEDVALUE + \
"|" + DOUBLEQUOTEDVALUE + ")"
# "= value" with optional whitespace around the equals sign.
ATTRIBUTEVALUESPEC = "(?:" + "\\s*=" + "\\s*" + ATTRIBUTEVALUE + ")"
# One attribute: mandatory leading whitespace, a name, and an optional value.
ATTRIBUTE = "(?:" + "\\s+" + ATTRIBUTENAME + ATTRIBUTEVALUESPEC + "?)"
# Opening tag: name, any number of attributes, optional whitespace and an
# optional "/" before the closing ">".
OPENTAG = "<" + TAGNAME + ATTRIBUTE + "*" + "\\s*/?>"
# Closing tag: "</", name, optional whitespace, ">".
CLOSETAG = "</" + TAGNAME + "\\s*[>]"
# Comment: either the empty comment "<!---->" or "<!--" ... "-->" where the
# body is built from units of an optional "-" followed by a non-"-"
# character (the first such character also may not be ">").
HTMLCOMMENT = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->'
# Processing instruction: "<?" ... "?>", matched non-greedily.
PROCESSINGINSTRUCTION = "[<][?].*?[?][>]"
# Declaration: "<!" + letters, whitespace, then anything up to ">".
DECLARATION = "<![A-Z]+" + "\\s+[^>]*>"
# CDATA section: "<![CDATA[" ... "]]>", matched non-greedily across newlines.
CDATA = '<!\\[CDATA\\[[\\s\\S]*?\\]\\]>'
# Any one of the six HTML constructs defined above.
HTMLTAG = "(?:" + OPENTAG + "|" + CLOSETAG + "|" + HTMLCOMMENT + "|" + \
PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")"
# Matches an HTML construct anchored at the start of the string,
# case-insensitively.
reHtmlTag = re.compile('^' + HTMLTAG, re.IGNORECASE)
# Matches a single backslash or ampersand; used by unescape_string as a quick
# test for whether a string can contain anything to unescape.
reBackslashOrAmp = re.compile(r'[\\&]')
# Character class of the punctuation characters that may follow a backslash
# in the escaped-character pattern below.
ESCAPABLE = '[!"#$%&\'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]'
# Matches either a backslash followed by an ESCAPABLE character or an entity
# reference.
reEntityOrEscapedChar = re.compile(
'\\\\' + ESCAPABLE + '|' + ENTITY, re.IGNORECASE)
# Character class of the four characters handled by escape_xml.
XMLSPECIAL = '[&<>"]'
reXmlSpecial = re.compile(XMLSPECIAL)
# ENTITY is listed first so that a complete entity reference is matched as a
# unit before its leading "&" can match XMLSPECIAL.
reXmlSpecialOrEntity = re.compile(ENTITY + '|' + XMLSPECIAL, re.IGNORECASE)
def unescape_char(s):
    """Return the literal text for one backslash escape or entity.

    If ``s`` begins with a backslash, the character right after the
    backslash is returned. Anything else is handed to ``HTMLunescape``
    and its result is returned.
    """
    return s[1] if s[0] == '\\' else HTMLunescape(s)
def unescape_string(s):
    """Replace entities and backslash escapes with literal characters.

    Returns ``s`` itself when it contains neither a backslash nor an
    ampersand; otherwise returns a new string in which every match of
    ``reEntityOrEscapedChar`` has been passed through ``unescape_char``.
    """
    # Quick exit: without a backslash or ampersand nothing can match.
    if not reBackslashOrAmp.search(s):
        return s
    return reEntityOrEscapedChar.sub(
        lambda match: unescape_char(match.group()), s)
def _restore_safe_chars(quoted):
    """Turn the percent-escapes of the URI-safe characters back into literals.

    Handles exactly the escapes for ``@ : + ? = & ( ) % # * ,``. The
    substitution is done in a single pass so that a ``%`` produced from
    ``%25`` is never re-read as the start of another escape (for example
    ``%252A`` becomes ``%2A``, not ``*``).
    """
    return re.sub(
        r'%(40|3A|2B|3F|3D|26|28|29|25|23|2A|2C)',
        lambda m: chr(int(m.group(1), 16)),
        quoted)


def normalize_uri(uri):
    """Percent-encode ``uri`` while leaving URI punctuation untouched.

    The characters ``/ @ : + ? = & ( ) % # * ,`` are kept literal; since
    ``%`` is among them, escapes already present in ``uri`` are preserved
    rather than double-encoded.

    Args:
        uri: The URI text to normalize.

    Returns:
        The percent-encoded URI string.
    """
    # str() keeps the safe set a native string even though this module
    # enables unicode_literals.
    safe = str('/@:+?=&()%#*,')
    try:
        return quote(uri, safe=safe)
    except KeyError:
        # Python 2 throws a KeyError sometimes
        try:
            return quote(uri.encode('utf-8'), safe=safe)
        except UnicodeDecodeError:
            # Python 2 also throws a UnicodeDecodeError, complaining about
            # the width of the "safe" string. Removing this parameter
            # solves the issue, but yields overly aggressive quoting, which
            # is corrected afterwards.
            return _restore_safe_chars(quote(uri.encode('utf-8')))
# Replacement text for each character matched by XMLSPECIAL. Every value
# must be the character's XML entity; mapping a character to itself would
# make escape_xml return its input unescaped.
UNSAFE_MAP = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
}
def replace_unsafe_char(s):
    """Look ``s`` up in ``UNSAFE_MAP``; return ``s`` itself if it is absent."""
    try:
        return UNSAFE_MAP[s]
    except KeyError:
        return s
def escape_xml(s, preserve_entities):
    """Pass the XML-special characters of ``s`` through replace_unsafe_char.

    Args:
        s: Text to process, or ``None``.
        preserve_entities: When true, substitution uses
            ``reXmlSpecialOrEntity`` so that a complete entity reference is
            matched as one unit; when false, ``reXmlSpecial`` is used and
            every special character is matched individually.

    Returns:
        ``''`` when ``s`` is ``None``; ``s`` itself when it contains no
        character matched by ``reXmlSpecial``; otherwise the substituted
        string.
    """
    if s is None:
        return ''
    # Nothing to do unless at least one special character is present.
    if not reXmlSpecial.search(s):
        return s
    pattern = reXmlSpecialOrEntity if preserve_entities else reXmlSpecial
    return pattern.sub(lambda match: replace_unsafe_char(match.group()), s)