Merge branch 'unicode'

Byron · Byron · commit 13647590f96f · 2010-10-15T12:40:54.000+02:00
diff --git a/doc/source/changes.rst b/doc/source/changes.rst
@@ -2,6 +2,10 @@
 Changelog
 =========
 
+0.3.0 Beta 3
+============
+* Added unicode support for author names: ``Commit.author.name`` is now a unicode object instead of a byte string.
+
 0.3.0 Beta 2
 ============
 * Added python 2.4 support
diff --git a/lib/git/ext/gitdb b/lib/git/ext/gitdb
@@ -1 +1 @@
-Subproject commit 425ecf04aa5038c3d46b01ca20de17c51ef6c4e5
+Subproject commit 78665b13ff4125f4ce3e5311d040c027bdc92a9a
diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py
@@ -368,9 +368,14 @@ def _serialize(self, stream):
 			write("parent %s\n" % p)
 			
 		a = self.author
+		aname = a.name
+		if isinstance(aname, unicode):
+			aname = aname.encode(self.encoding)
+		# END handle unicode in name
+		
 		c = self.committer
 		fmt = "%s %s <%s> %s %s\n"
-		write(fmt % ("author", a.name, a.email, 
+		write(fmt % ("author", aname, a.email, 
 						self.authored_date, 
 						altz_to_utctz_str(self.author_tz_offset)))
 			
@@ -425,12 +430,19 @@ def _deserialize(self, stream):
 			readline()
 		# END handle encoding
 		
+		# decode the author's name
+		try:
+			self.author.name = self.author.name.decode(self.encoding) 
+		except UnicodeDecodeError:
+			print >> sys.stderr, "Failed to decode author name: %s" % self.author.name
+		# END handle author's encoding
+		
 		# a stream from our data simply gives us the plain message
 		# The end of our message stream is marked with a newline that we strip
 		self.message = stream.read()
 		try:
 			self.message = self.message.decode(self.encoding)
-		except Exception:
+		except UnicodeDecodeError:
 			print >> sys.stderr, "Failed to decode message: %s" % self.message
 		# END exception handling 
 		return self
diff --git a/test/git/performance/test_odb.py b/test/git/performance/test_odb.py
@@ -49,17 +49,18 @@ def test_random_access(self):
 			st = time()
 			nb = 0
 			too_many = 15000
+			data_bytes = 0
 			for blob_list in blobs_per_commit:
 				for blob in blob_list:
-					blob.data_stream.read()
+					data_bytes += len(blob.data_stream.read())
 				# END for each blobsha
 				nb += len(blob_list)
 				if nb > too_many:
 					break
 			# END for each bloblist
 			elapsed = time() - st
 			
-			print >> sys.stderr, "%s: Retrieved %i blob and their data in %g s ( %f blobs / s )" % (type(repo.odb), nb, elapsed, nb / elapsed)
+			print >> sys.stderr, "%s: Retrieved %i blob (%i KiB) and their data in %g s ( %f blobs / s, %f KiB / s )" % (type(repo.odb), nb, data_bytes/1000, elapsed, nb / elapsed, (data_bytes / 1000) / elapsed)
 			results[2].append(elapsed)
 		# END for each repo type
 		
diff --git a/test/git/test_commit.py b/test/git/test_commit.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # test_commit.py
 # Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors
 #
@@ -108,6 +109,14 @@ def check_entries(d):
 		assert commit.committer_tz_offset == 14400, commit.committer_tz_offset
 		assert commit.message == "initial project\n"
 		
+	def test_unicode_actor(self):
+		# ensure Actor._from_string parses a non-ASCII name and keeps it as unicode
+		name = "Üäöß ÄußÉ".decode("utf-8")
+		assert len(name) == 9
+		special = Actor._from_string(u"%s <something@this.com>" % name)
+		assert special.name == name
+		assert isinstance(special.name, unicode)
+		
 	def test_traversal(self):
 		start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff")
 		first = self.rorepo.commit("33ebe7acec14b25c5f84f35a664803fcab2f7781")
@@ -233,3 +242,32 @@ def test_serialization(self, rwrepo):
 		# create all commits of our repo
 		assert_commit_serialization(rwrepo, '0.1.6')
 		
+	def test_serialization_unicode_support(self):
+		assert Commit.default_encoding.lower() == 'utf-8'
+		
+		# give a commit a non-ASCII message and author name, then verify
+		# that both survive a _serialize / _deserialize round trip
+		cmt = self.rorepo.commit('0.1.6')
+		assert isinstance(cmt.message, unicode)		# the message is already decoded to unicode
+		assert isinstance(cmt.author.name, unicode)	# and so is the author's name
+		
+		cmt.message = "üäêèß".decode("utf-8")
+		assert len(cmt.message) == 5
+		
+		cmt.author.name = "äüß".decode("utf-8")
+		assert len(cmt.author.name) == 3
+		
+		cstream = StringIO()
+		cmt._serialize(cstream)
+		cstream.seek(0)
+		assert len(cstream.getvalue())
+		
+		ncmt = Commit(self.rorepo, cmt.binsha)
+		ncmt._deserialize(cstream)
+		
+		assert cmt.author.name == ncmt.author.name
+		assert cmt.message == ncmt.message
+		# smoke test: __repr__ must not raise for a non-ASCII name (the result
+		# may still not be printable in a shell, as repr appears to want ASCII only)
+		cmt.author.__repr__()
+