|
10 | 10 | import locale
|
11 | 11 | import os
|
12 | 12 | import sys
|
| 13 | +import codecs |
| 14 | + |
13 | 15 |
|
14 | 16 | from gitdb.utils.compat import (
|
15 | 17 | xrange,
|
@@ -67,7 +69,7 @@ def safe_decode(s):
|
67 | 69 | if isinstance(s, unicode):
|
68 | 70 | return s
|
69 | 71 | elif isinstance(s, bytes):
|
70 |
| - return s.decode(defenc, 'replace') |
| 72 | + return s.decode(defenc, 'surrogateescape') |
71 | 73 | elif s is not None:
|
72 | 74 | raise TypeError('Expected bytes or text, but got %r' % (s,))
|
73 | 75 |
|
@@ -121,3 +123,191 @@ def __str__(self):
|
121 | 123 | else: # Python 2
|
122 | 124 | def __str__(self):
|
123 | 125 | return self.__unicode__().encode(defenc)
|
| 126 | + |
| 127 | + |
| 128 | +""" |
| 129 | +This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error |
| 130 | +handler of Python 3. |
| 131 | +Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc |
| 132 | +""" |
| 133 | + |
| 134 | +# This code is released under the Python license and the BSD 2-clause license |
| 135 | + |
| 136 | + |
# Name of the codec error handler used by the helpers in this module; it is
# the name passed to str.encode()/bytes.decode() and to codecs.register_error().
FS_ERRORS = 'surrogateescape'

# -- Python 2/3 compatibility -------------------------------------
# Alternative handler name that is currently disabled:
# FS_ERRORS = 'my_surrogateescape'
| 141 | + |
def u(text):
    """
    Return ``text`` as a text (unicode) string.

    On Python 3 the argument is returned unchanged. Otherwise it is decoded
    with the ``unicode_escape`` codec, so escapes such as ``\\udcff`` written
    in a native string literal become real code points.
    """
    if not PY3:
        return text.decode('unicode_escape')
    return text
| 147 | + |
def b(data):
    """
    Return ``data`` as a byte string.

    On Python 3 the argument is encoded with ``latin1``, which maps each code
    point below 256 to the byte of the same value. Otherwise the argument is
    returned unchanged.
    """
    if not PY3:
        return data
    return data.encode('latin1')
| 153 | + |
# Pick the "code point -> text character" and "integer -> single byte"
# constructors that match the running interpreter.
if not PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        """Return a length-1 bytes object holding the integer ``code``."""
        return bytes((code,))
| 160 | + |
def surrogateescape_handler(exc):
    """
    Codec error callback implementing PEP 383 ("surrogateescape") in pure
    Python.

    On decoding, undecodable bytes are replaced by U+DCxx characters; on
    encoding, those characters are translated back.

    ``exc`` is the Unicode error raised by the codec. The offending slice
    ``exc.object[exc.start:exc.end]`` is converted and returned together with
    the position at which the codec should resume, as ``(replacement,
    exc.end)``.

    ``exc`` itself is re-raised when it is neither a UnicodeDecodeError nor a
    UnicodeEncodeError, or when the conversion helper raises
    NotASurrogateError.
    """
    offending = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The offending slice is a byte string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # Note from the original author: for something like
        # u'\udcc3'.encode('ascii', <this handler>), both Python 2.x and 3.x
        # still raise after this function returns, apparently because the
        # strict encoder is then applied to the unicode string returned here.
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(offending)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
| 186 | + |
| 187 | + |
class NotASurrogateError(Exception):
    """Raised by the replace_surrogate_* helpers for input they cannot map."""
| 190 | + |
| 191 | + |
def replace_surrogate_encode(mystring):
    """
    Translate escaped surrogate characters back to their original values.

    Each character of ``mystring`` in the range U+DC00-U+DCFF is mapped to the
    character whose code point is ``code - 0xDC00``.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    Raises NotASurrogateError if any character lies outside U+DC00-U+DCFF;
    surrogateescape_handler catches that and re-raises the original codec
    error.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)
        # Anything outside the low-surrogate block cannot be mapped back.
        # Signal that with NotASurrogateError: the previous `raise exc`
        # referenced a name that does not exist in this scope (NameError), and
        # U+D800-U+DBFF used to reach _unichr() with a negative argument.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
| 219 | + |
| 220 | + |
def replace_surrogate_decode(mybytes):
    """
    Convert a run of bytes into a (unicode) string.

    Values 0x80-0xFF become the characters U+DC80-U+DCFF, and values up to
    0x7F become the character with the same code point. Any value above 0xFF
    raises NotASurrogateError.
    """
    pieces = []
    for item in mybytes:
        # Iterating yields ints for Python 3 style bytes and length-1 native
        # strings on Python 2; normalise both to an integer.
        value = item if isinstance(item, int) else ord(item)
        if value > 0xFF:
            raise NotASurrogateError
        if value >= 0x80:
            value += 0xDC00
        pieces.append(_unichr(value))
    return str().join(pieces)
| 244 | + |
| 245 | + |
def encodefilename(fn):
    """
    Encode the text filename ``fn`` to bytes using FS_ENCODING.

    For the 'ascii' and 'utf-8' encodings the work is done by hand instead of
    going through the registered error handler:

    * ASCII encoder of Python 2 expects the error handler to return a Unicode
      string encodable to ASCII, whereas the surrogateescape handler has to
      produce bytes in the 0x80-0xFF range.
    * UTF-8 encoder of Python 2 encodes surrogates itself, so U+DC80-U+DCFF
      never reaches the error handler.

    In both cases U+DC80-U+DCFF is turned back into the byte 0x80-0xFF.
    Every other encoding is delegated to ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    Raises UnicodeEncodeError for a non-ASCII character under 'ascii', or for
    any other surrogate under 'utf-8'.
    """
    if FS_ENCODING not in ('ascii', 'utf-8'):
        return fn.encode(FS_ENCODING, FS_ERRORS)

    ascii_only = FS_ENCODING == 'ascii'
    chunks = []
    for pos, char in enumerate(fn):
        point = ord(char)
        if 0xDC80 <= point <= 0xDCFF:
            # Escaped byte: restore the original value.
            chunks.append(bytes_chr(point - 0xDC00))
        elif ascii_only:
            if point >= 128:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos + 1,
                                         'ordinal not in range(128)')
            chunks.append(bytes_chr(point))
        elif 0xD800 <= point <= 0xDFFF:
            raise UnicodeEncodeError(
                FS_ENCODING,
                fn, pos, pos + 1, 'surrogates not allowed')
        else:
            chunks.append(char.encode('utf-8'))
    return bytes().join(chunks)
| 284 | + |
def decodefilename(fn):
    """Decode the byte filename ``fn`` using FS_ENCODING and FS_ERRORS."""
    decoded = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded
| 287 | + |
# Encoding consulted by encodefilename() and decodefilename().
# NOTE(review): `fn` and `encoded` look like sample values left over from the
# upstream script; no use of them is visible in this part of the file.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative settings, currently disabled:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
| 296 | + |
| 297 | + |
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only).

    Nothing is done on Python 3. Otherwise surrogateescape_handler is
    registered under the name FS_ERRORS, unless a handler with that name can
    already be looked up.
    """
    if not PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
| 308 | + |
| 309 | + |
# Probe at import time whether the "surrogateescape" error handler is usable
# by decoding a byte string that contains 0x9f. An interpreter that does not
# know the handler reports it with LookupError, in which case the pure-Python
# implementation is registered. Only LookupError is caught so that
# KeyboardInterrupt, SystemExit and unrelated failures are no longer
# swallowed as they were by the previous bare `except:`.
try:
    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
except LookupError:
    register_surrogateescape()
0 commit comments