Skip to content

Commit ec731f4

Browse files
committed
Merge with #532, fix unicode filenames with escapesurogates
2 parents b2efa1b + 9e4a454 commit ec731f4

File tree

7 files changed

+209
-18
lines changed

7 files changed

+209
-18
lines changed

Diff for: VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.0.9dev0
1+
2.0.10dev0

Diff for: git/compat.py

+191-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import locale
1111
import os
1212
import sys
13+
import codecs
14+
1315

1416
from gitdb.utils.compat import (
1517
xrange,
@@ -67,7 +69,7 @@ def safe_decode(s):
6769
if isinstance(s, unicode):
6870
return s
6971
elif isinstance(s, bytes):
70-
return s.decode(defenc, 'replace')
72+
return s.decode(defenc, 'surrogateescape')
7173
elif s is not None:
7274
raise TypeError('Expected bytes or text, but got %r' % (s,))
7375

@@ -121,3 +123,191 @@ def __str__(self):
121123
else: # Python 2
122124
def __str__(self):
123125
return self.__unicode__().encode(defenc)
126+
127+
128+
"""
129+
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
130+
handler of Python 3.
131+
Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
132+
"""
133+
134+
# This code is released under the Python license and the BSD 2-clause license
135+
136+
137+
# Name of the codec error handler passed to encode()/decode() for filenames and
# registered by register_surrogateescape() when it is not already available.
FS_ERRORS = 'surrogateescape'
138+
139+
# # -- Python 2/3 compatibility -------------------------------------
140+
# FS_ERRORS = 'my_surrogateescape'
141+
142+
def u(text):
    """
    Return ``text`` as a unicode string on Python 2 and Python 3 alike.

    On Python 3 the argument is handed back untouched. On Python 2 it is
    decoded with the ``unicode_escape`` codec, so escape sequences written in
    a native string literal are turned into the characters they denote.
    """
    return text if PY3 else text.decode('unicode_escape')
147+
148+
def b(data):
    """
    Return ``data`` as a byte string on Python 2 and Python 3 alike.

    On Python 2 a native string already is a byte string and is returned
    as-is. On Python 3 the text is encoded with ``latin1``, which maps each
    code point below 256 to the byte of the same value.
    """
    if not PY3:
        return data
    return data.encode('latin1')
153+
154+
# Pick the interpreter-specific primitives:
#   _unichr(code)   -> one-character unicode string for an ordinal
#   bytes_chr(code) -> one-byte byte string for an ordinal
if not PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        return bytes((code,))
160+
161+
def surrogateescape_handler(exc):
    """
    Codec error handler implementing PEP 383 ("surrogateescape") in pure Python.

    While decoding, undecodable bytes are replaced by a Unicode character
    U+DCxx; while encoding, those characters are translated back.

    :param exc: the ``UnicodeDecodeError`` or ``UnicodeEncodeError`` raised by
        the codec; any other exception is re-raised unchanged
    :return: tuple ``(replacement, position)`` where ``position`` is
        ``exc.end``, i.e. where the codec should resume
    :raise: ``exc`` itself if the offending slice cannot be translated
    """
    fragment = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The fragment is a byte string in this case.
        translate = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # Note: for something like u'\udcc3'.encode('ascii', <this handler>),
        # both Python 2.x and 3.x still raise afterwards. It seems the strict
        # encoder is invoked on the unicode string returned from here.
        translate = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = translate(fragment)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
186+
187+
188+
class NotASurrogateError(Exception):
    """Raised by the replace_surrogate_* helpers when the input cannot be translated."""
190+
191+
192+
def replace_surrogate_encode(mystring):
    """
    Translate escaped surrogate characters back to the values they stand for.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    :param mystring: the slice of the unicode string that failed to encode
    :return: string in which every U+DC00..U+DCFF character is replaced by the
        character whose ordinal is the original ordinal minus 0xDC00
    :raise NotASurrogateError: if any character lies outside U+DC00..U+DCFF;
        surrogateescape_handler() catches this and re-raises the original
        encoding error
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Only the U+DCxx range maps back to a value in 0x00-0xFF. Everything
        # else is rejected, including the high surrogates U+D800..U+DBFF, for
        # which ``code - 0xDC00`` would be negative. The original exception is
        # not in scope here, so signal the failure with NotASurrogateError.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
219+
220+
221+
def replace_surrogate_decode(mybytes):
    """
    Translate undecodable bytes into a (unicode) string.

    Values 0x80..0xFF become the characters U+DC80..U+DCFF; values up to 0x7F
    become the character with that ordinal.

    :param mybytes: iterable of bytes; the items may be ints or
        one-character native strings (as on Python 2)
    :return: the translated (unicode) string
    :raise NotASurrogateError: if an item has a value above 0xFF
    """
    pieces = []
    for item in mybytes:
        # Items are ints when iterating real bytes, one-char strings on Py2.
        code = item if isinstance(item, int) else ord(item)
        if code > 0xFF:
            raise NotASurrogateError
        if code >= 0x80:
            pieces.append(_unichr(0xDC00 + code))
        else:
            pieces.append(_unichr(code))
    return str().join(pieces)
244+
245+
246+
def encodefilename(fn):
    """
    Encode the unicode filename ``fn`` to bytes according to ``FS_ENCODING``.

    Characters U+DC80..U+DCFF are turned back into the single bytes
    0x80..0xFF. The 'ascii' and 'utf-8' encodings are handled by hand; every
    other encoding is delegated to ``fn.encode`` with the ``FS_ERRORS``
    error handler.

    :param fn: unicode filename
    :return: the encoded byte string
    :raise UnicodeEncodeError: for a character that cannot be represented
    """
    if FS_ENCODING == 'ascii':
        # The ASCII encoder of Python 2 expects the error handler to return a
        # unicode string encodable to ASCII, whereas the surrogateescape
        # handler has to yield bytes in the 0x80-0xFF range.
        pieces = []
        for pos, char in enumerate(fn):
            ordinal = ord(char)
            if 0xDC80 <= ordinal <= 0xDCFF:
                pieces.append(bytes_chr(ordinal - 0xDC00))
            elif ordinal < 128:
                pieces.append(bytes_chr(ordinal))
            else:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos + 1,
                                         'ordinal not in range(128)')
        return bytes().join(pieces)

    if FS_ENCODING == 'utf-8':
        # The UTF-8 encoder of Python 2 encodes surrogates itself, so
        # U+DC80-U+DCFF never reaches the error handler.
        pieces = []
        for pos, char in enumerate(fn):
            ordinal = ord(char)
            if not 0xD800 <= ordinal <= 0xDFFF:
                pieces.append(char.encode('utf-8'))
            elif 0xDC80 <= ordinal <= 0xDCFF:
                pieces.append(bytes_chr(ordinal - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, pos, pos + 1, 'surrogates not allowed')
        return bytes().join(pieces)

    return fn.encode(FS_ENCODING, FS_ERRORS)
284+
285+
def decodefilename(fn):
    """Decode the byte string ``fn`` using ``FS_ENCODING`` and the ``FS_ERRORS`` error handler."""
    decoded = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded
287+
288+
# Encoding used by encodefilename() / decodefilename().
# NOTE(review): ``fn`` and ``encoded`` look like sample data left over from the
# upstream module's self-test -- confirm before relying on or removing them.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative settings:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
296+
297+
298+
def register_surrogateescape():
    """
    Register surrogateescape_handler under the name ``FS_ERRORS``.

    Does nothing on Python 3, and nothing if a handler of that name is
    already known to the ``codecs`` module.
    """
    if not PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler of that name yet: install the pure-Python one.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
308+
309+
310+
# Probe for the "surrogateescape" error handler by decoding a byte string that
# contains 0x9f, a byte that is neither ASCII nor valid on its own in UTF-8.
# An interpreter that does not know the handler raises LookupError when it
# tries to resolve the handler name; only then install the fallback.
try:
    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
except LookupError:
    register_surrogateescape()

Diff for: git/objects/fun.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from stat import S_ISDIR
33
from git.compat import (
44
byte_ord,
5+
safe_decode,
56
defenc,
67
xrange,
78
text_type,
@@ -76,11 +77,7 @@ def tree_entries_from_data(data):
7677
# default encoding for strings in git is utf8
7778
# Only use the respective unicode object if the byte stream was encoded
7879
name = data[ns:i]
79-
try:
80-
name = name.decode(defenc)
81-
except UnicodeDecodeError:
82-
pass
83-
# END handle encoding
80+
name = safe_decode(name)
8481

8582
# byte is NULL, get next 20
8683
i += 1

Diff for: git/test/performance/test_commit.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test_iteration(self):
5252
# END for each object
5353
# END for each commit
5454
elapsed_time = time() - st
55-
print("Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )"
55+
print("Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )"
5656
% (nc, no, elapsed_time, no / elapsed_time), file=sys.stderr)
5757

5858
def test_commit_traversal(self):

Diff for: git/test/test_fun.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
from io import BytesIO
2-
from stat import (
3-
S_IFDIR,
4-
S_IFREG,
5-
S_IFLNK
6-
)
2+
from stat import S_IFDIR, S_IFREG, S_IFLNK
3+
from unittest.case import skipIf
74

5+
from git.compat import PY3
86
from git.index import IndexFile
97
from git.index.fun import (
108
aggressive_tree_merge
@@ -253,6 +251,12 @@ def test_tree_traversal_single(self):
253251
assert entries
254252
# END for each commit
255253

256-
def test_tree_entries_from_data_with_failing_name_decode(self):
254+
@skipIf(PY3, 'odd types returned ... maybe figure it out one day')
255+
def test_tree_entries_from_data_with_failing_name_decode_py2(self):
256+
r = tree_entries_from_data(b'100644 \x9f\0aaa')
257+
assert r == [('aaa', 33188, u'\udc9f')], r
258+
259+
@skipIf(not PY3, 'odd types returned ... maybe figure it out one day')
260+
def test_tree_entries_from_data_with_failing_name_decode_py3(self):
257261
r = tree_entries_from_data(b'100644 \x9f\0aaa')
258-
assert r == [(b'aaa', 33188, b'\x9f')], r
262+
assert r == [(b'aaa', 33188, '\udc9f')], r

Diff for: setup.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def _stamp_version(filename):
6464
else:
6565
print("WARNING: Couldn't find version line in file %s" % filename, file=sys.stderr)
6666

67-
install_requires = ['gitdb >= 0.6.4']
67+
install_requires = ['gitdb2 >= 2.0.0']
6868
extras_require = {
6969
':python_version == "2.6"': ['ordereddict'],
7070
}
@@ -100,7 +100,7 @@ def _stamp_version(filename):
100100
package_data={'git.test': ['fixtures/*']},
101101
package_dir={'git': 'git'},
102102
license="BSD License",
103-
requires=['gitdb (>=0.6.4)'],
103+
requires=['gitdb2 (>=2.0.0)'],
104104
install_requires=install_requires,
105105
test_requirements=test_requires + install_requires,
106106
zip_safe=False,

0 commit comments

Comments
 (0)