Skip to content

Commit f0c05ea

Browse files
committed
util: pick the type of memory manager based on the python version, to have optimal results in all cases (at least the ones I can test)
pack: now works properly with a sliding memory manager. test_packedodb_pure: fixed a very memory-hungry implementation by using an iterator. This will of course reduce the measured performance a bit, but 750MB of memory is just a little bit too much for an ordinary test. Maybe it would be alright to just reduce the number of items ... but performance isn't a strength of python after all.
1 parent 21499d9 commit f0c05ea

File tree

3 files changed

+28
-18
lines changed

3 files changed

+28
-18
lines changed

git/pack.py

+19-13
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373

7474
#{ Utilities
7575

76-
def pack_object_at(data, offset, as_stream):
76+
def pack_object_at(cursor, offset, as_stream):
7777
"""
7878
:return: Tuple(abs_data_offset, PackInfo|PackStream)
7979
an object of the correct type according to the type_id of the object.
@@ -83,7 +83,7 @@ def pack_object_at(data, offset, as_stream):
8383
:param offset: offset into the data at which the object information is located
8484
:param as_stream: if True, a stream object will be returned that can read
8585
the data, otherwise you receive an info object only"""
86-
data = buffer(data, offset)
86+
data = cursor.use_region(offset).buffer()
8787
type_id, uncomp_size, data_rela_offset = pack_object_header_info(data)
8888
total_rela_offset = None # set later, actual offset until data stream begins
8989
delta_info = None
@@ -269,6 +269,10 @@ def _set_cache_(self, attr):
269269
# that we can actually write to the location - it could be a read-only
270270
# alternate for instance
271271
self._cursor = mman.make_cursor(self._indexpath).use_region()
272+
# We will assume that the index will always fully fit into memory !
273+
if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size():
274+
raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % (self._indexpath, self._cursor.file_size(), mman.window_size()))
275+
#END assert window size
272276
else:
273277
# now its time to initialize everything - if we are here, someone wants
274278
# to access the fanout table or related properties
@@ -527,13 +531,13 @@ def _set_cache_(self, attr):
527531

528532
def _iter_objects(self, start_offset, as_stream=True):
529533
"""Handle the actual iteration of objects within this pack"""
530-
data = self._cursor.map()
531-
content_size = len(data) - self.footer_size
534+
c = self._cursor
535+
content_size = c.file_size() - self.footer_size
532536
cur_offset = start_offset or self.first_object_offset
533537

534538
null = NullStream()
535539
while cur_offset < content_size:
536-
data_offset, ostream = pack_object_at(data, cur_offset, True)
540+
data_offset, ostream = pack_object_at(c, cur_offset, True)
537541
# scrub the stream to the end - this decompresses the object, but yields
538542
# the amount of compressed bytes we need to get to the next offset
539543

@@ -562,12 +566,14 @@ def version(self):
562566
def data(self):
563567
"""
564568
:return: read-only data of this pack. It provides random access and usually
565-
is a memory map"""
566-
return self._cursor.map()
569+
is a memory map.
570+
:note: This method is unsafe as it returns a window into a file which might be larger than the actual window size"""
571+
# can use map as we are starting at offset 0. Otherwise we would have to use buffer()
572+
return self._cursor.use_region().map()
567573

568574
def checksum(self):
569575
""":return: 20 byte sha1 hash on all object sha's contained in this file"""
570-
return self._cursor.map()[-20:]
576+
return self._cursor.use_region(self._cursor.file_size()-20).buffer()[:]
571577

572578
def path(self):
573579
""":return: path to the packfile"""
@@ -586,9 +592,9 @@ def collect_streams(self, offset):
586592
If the object at offset is no delta, the size of the list is 1.
587593
:param offset: specifies the first byte of the object within this pack"""
588594
out = list()
589-
data = self._cursor.map()
595+
c = self._cursor
590596
while True:
591-
ostream = pack_object_at(data, offset, True)[1]
597+
ostream = pack_object_at(c, offset, True)[1]
592598
out.append(ostream)
593599
if ostream.type_id == OFS_DELTA:
594600
offset = ostream.pack_offset - ostream.delta_info
@@ -610,14 +616,14 @@ def info(self, offset):
610616
611617
:param offset: byte offset
612618
:return: OPackInfo instance, the actual type differs depending on the type_id attribute"""
613-
return pack_object_at(self._cursor.map(), offset or self.first_object_offset, False)[1]
619+
return pack_object_at(self._cursor, offset or self.first_object_offset, False)[1]
614620

615621
def stream(self, offset):
616622
"""Retrieve an object at the given file-relative offset as stream along with its information
617623
618624
:param offset: byte offset
619625
:return: OPackStream instance, the actual type differs depending on the type_id attribute"""
620-
return pack_object_at(self._cursor.map(), offset or self.first_object_offset, True)[1]
626+
return pack_object_at(self._cursor, offset or self.first_object_offset, True)[1]
621627

622628
def stream_iter(self, start_offset=0):
623629
"""
@@ -700,7 +706,7 @@ def _object(self, sha, as_stream, index=-1):
700706
sha = self._index.sha(index)
701707
# END assure sha is present ( in output )
702708
offset = self._index.offset(index)
703-
type_id, uncomp_size, data_rela_offset = pack_object_header_info(buffer(self._pack._cursor.map(), offset))
709+
type_id, uncomp_size, data_rela_offset = pack_object_header_info(self._pack._cursor.use_region(offset).buffer())
704710
if as_stream:
705711
if type_id not in delta_types:
706712
packstream = self._pack.stream(offset)

git/test/performance/db/test_packedodb_pure.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -49,18 +49,17 @@ def test_pack_writing(self):
4949
count = 0
5050
total_size = 0
5151
st = time()
52-
objs = list()
5352
for sha in rorepo.sha_iter():
5453
count += 1
55-
objs.append(rorepo.stream(sha))
54+
rorepo.stream(sha)
5655
if count == ni:
5756
break
5857
#END gather objects for pack-writing
5958
elapsed = time() - st
60-
print >> sys.stderr, "PDB Streaming: Got %i streams from %s by sha in in %f s ( %f streams/s )" % (ni, rorepo.__class__.__name__, elapsed, ni / elapsed)
59+
print >> sys.stderr, "PDB Streaming: Got %i streams from %s by sha in in %f s ( %f streams/s )" % (count, rorepo.__class__.__name__, elapsed, count / elapsed)
6160

6261
st = time()
63-
PackEntity.write_pack(objs, ostream.write)
62+
PackEntity.write_pack((rorepo.stream(sha) for sha in rorepo.sha_iter()), ostream.write, object_count=ni)
6463
elapsed = time() - st
6564
total_kb = ostream.bytes_written() / 1000
6665
print >> sys.stderr, "PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)" % (total_kb, elapsed, total_kb/elapsed)

git/util.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import tempfile
1818
from smmap import (
1919
StaticWindowMapManager,
20+
SlidingWindowMapManager,
2021
SlidingWindowMapBuffer
2122
)
2223

@@ -72,7 +73,11 @@ def unpack_from(fmt, data, offset=0):
7273

7374
# initialize our global memory manager instance
7475
# Use it to free cached (and unused) resources.
75-
mman = StaticWindowMapManager()
76+
if sys.version_info[1] < 6:
77+
mman = StaticWindowMapManager()
78+
else:
79+
mman = SlidingWindowMapManager()
80+
#END handle mman
7681

7782
#} END globals
7883

0 commit comments

Comments
 (0)