index.add: now uses gitdb.store functionality instead of git-hash-object. The Python version is about as fast, but could support multithreading using async

Byron · Byron · commit ac62760c52ab · 2010-06-21T12:51:46.000+02:00
diff --git a/lib/git/index/base.py b/lib/git/index/base.py
@@ -12,6 +12,7 @@
 import stat
 import subprocess
 import glob
+from cStringIO import StringIO
 
 from typ import *
 from util import (
@@ -48,6 +49,10 @@
 						)
 
 
+from gitdb.base import (
+							IStream
+						)
+
 __all__ = ( 'IndexFile', 'CheckoutError' )
 
 
@@ -255,9 +260,6 @@ def write(self, file_path = None, ignore_tree_extension_data=False):
 
 		Returns
 			self
-
-		Note
-			Index writing based on the dulwich implementation
 		"""
 		lfd = LockedFD(file_path or self._file_path)
 		stream = lfd.open(write=True, stream=True)
@@ -634,12 +636,10 @@ def _preprocess_add_items(self, items):
 		# END for each item
 		return (paths, entries)
 
-
 	@clear_cache
 	@default_index
 	def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=None):
-		"""
-		Add files from the working tree, specific blobs or BaseIndexEntries
+		"""Add files from the working tree, specific blobs or BaseIndexEntries
 		to the index. The underlying index file will be written immediately, hence
 		you should provide as many items as possible to minimize the amounts of writes
 
@@ -695,7 +695,7 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=Non
 
 		:param fprogress:
 			Function with signature f(path, done=False, item=item) called for each
-			path to be added, once once it is about to be added where done==False
+			path to be added: once when it is about to be added, where done==False,
 			and once after it was added where done=True.
 			item is set to the actual item we handle, either a Path or a BaseIndexEntry
 			Please note that the processed path is not guaranteed to be present
@@ -713,8 +713,8 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=Non
 		:return:
 			List(BaseIndexEntries) representing the entries just actually added.
 
-		Raises
-			GitCommandError if a supplied Path did not exist. Please note that BaseIndexEntry
+		:raise OSError:
+			if a supplied Path did not exist. Please note that BaseIndexEntry
 			Objects that do not have a null sha will be added even if their paths
 			do not exist.
 		"""
@@ -734,28 +734,45 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=Non
 			del(paths[:])
 		# END rewrite paths
 
+
+		def store_path(filepath):
+			"""Store file at filepath in the database and return the base index entry"""
+			st = os.lstat(filepath)		# handles non-symlinks as well
+			stream = None
+			if stat.S_ISLNK(st.st_mode):
+				stream = StringIO(os.readlink(filepath))
+			else:
+				stream = open(filepath, 'rb')
+			# END handle stream
+			fprogress(filepath, False, filepath)
+			istream = self.repo.odb.store(IStream(Blob.type, st.st_size, stream))
+			fprogress(filepath, True, filepath)
+			
+			return BaseIndexEntry((st.st_mode, istream.sha, 0, filepath))
+		# END utility method
+
+
 		# HANDLE PATHS
 		if paths:
-			# to get suitable progress information, pipe paths to stdin
-			args = ("--add", "--replace", "--verbose", "--stdin")
-			proc = self.repo.git.update_index(*args, **{'as_process':True, 'istream':subprocess.PIPE})
-			make_exc = lambda : GitCommandError(("git-update-index",)+args, 128, proc.stderr.read())
+			assert len(entries_added) == 0
 			added_files = list()
-
 			for filepath in self._iter_expand_paths(paths):
-				self._write_path_to_stdin(proc, filepath, filepath, make_exc, 
-											fprogress, read_from_stdout=False)
-				added_files.append(filepath)
+				entries_added.append(store_path(filepath))
 			# END for each filepath
-			self._flush_stdin_and_wait(proc, ignore_stdout=True)	# ignore stdout
+			
+			# add the new entries to this instance, and write it
+			for entry in entries_added:
+				self.entries[(entry.path, 0)] = IndexEntry.from_base(entry)
 
-			# force rereading our entries once it is all done
-			self._delete_entries_cache()
-			entries_added.extend(self.entries[(f,0)] for f in added_files)
+			# finally write the changed index
+			self.write()
 		# END path handling
 
+
 		# HANDLE ENTRIES
 		if entries:
+			# TODO: Add proper IndexEntries to ourselves, and write the index
+			# just once. Currently it's done at least twice
 			null_mode_entries = [ e for e in entries if e.mode == 0 ]
 			if null_mode_entries:
 				raise ValueError("At least one Entry has a null-mode - please use index.remove to remove files for clarity")
@@ -765,37 +782,22 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=Non
 			# create objects if required, otherwise go with the existing shas
 			null_entries_indices = [ i for i,e in enumerate(entries) if e.sha == Object.NULL_HEX_SHA ]
 			if null_entries_indices:
-				# creating object ids is the time consuming part. Hence we will
-				# send progress for these now.
-				args = ("-w", "--stdin-paths")
-				proc = self.repo.git.hash_object(*args, **{'istream':subprocess.PIPE, 'as_process':True})
-				make_exc = lambda : GitCommandError(("git-hash-object",)+args, 128, proc.stderr.read())
-				obj_ids = list()
 				for ei in null_entries_indices:
-					entry = entries[ei]
-					obj_ids.append(self._write_path_to_stdin(proc, entry.path, entry,
-																make_exc, fprogress, read_from_stdout=True))
+					null_entry = entries[ei]
+					new_entry = store_path(null_entry.path)
+					
+					# update null entry
+					entries[ei] = BaseIndexEntry((null_entry.mode, new_entry.sha, null_entry.stage, null_entry.path))
 				# END for each entry index
-				assert len(obj_ids) == len(null_entries_indices), "git-hash-object did not produce all requested objects: want %i, got %i" % ( len(null_entries_indices), len(obj_ids) )
-
-				# update IndexEntries with new object id
-				for i,new_sha in zip(null_entries_indices, obj_ids):
-					e = entries[i]
-
-					new_entry = BaseIndexEntry((e.mode, new_sha, e.stage, e.path))
-					entries[i] = new_entry
-				# END for each index
 			# END null_entry handling
 
 			# REWRITE PATHS
 			# If we have to rewrite the entries, do so now, after we have generated
 			# all object sha's
 			if path_rewriter:
-				new_entries = list()
-				for e in entries:
-					new_entries.append(BaseIndexEntry((e.mode, e.sha, e.stage, path_rewriter(e))))
+				for i,e in enumerate(entries):
+					entries[i] = BaseIndexEntry((e.mode, e.sha, e.stage, path_rewriter(e)))
 				# END for each entry
-				entries = new_entries
 			# END handle path rewriting
 
 			# feed pure entries to stdin
@@ -821,7 +823,7 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=Non
 			self._flush_stdin_and_wait(proc, ignore_stdout=True)
 			entries_added.extend(entries)
 		# END if there are base entries
-
+		
 		return entries_added
 
 	def _items_to_rela_paths(self, items):
diff --git a/lib/git/repo.py b/lib/git/repo.py
@@ -723,16 +723,17 @@ def init(cls, path=None, mkdir=True, **kwargs):
 		return Repo(path)
 
 	def clone(self, path, **kwargs):
-		"""
-		Create a clone from this repository.
-
-		``path``
+		"""Create a clone from this repository.
+		:param path:
 			is the full path of the new repo (traditionally ends with ./<name>.git).
 
-		``kwargs``
-			keyword arguments to be given to the git-clone command
-
-		Returns
+		:param kwargs:
+			odbt = ObjectDatabase Type, which determines the object database
+			implementation used by the returned Repo instance
+			
+			All remaining keyword arguments are given to the git-clone command
+			
+		:return:
 			``git.Repo`` (the newly cloned repo)
 		"""
 		# special handling for windows for path at which the clone should be 
@@ -741,6 +742,7 @@ def clone(self, path, **kwargs):
 		# we at least give a proper error instead of letting git fail
 		prev_cwd = None
 		prev_path = None
+		odbt = kwargs.pop('odbt', GitCmdObjectDB)
 		if os.name == 'nt':
 			if '~' in path:
 				raise OSError("Git cannot handle the ~ character in path %r correctly" % path)
@@ -767,7 +769,7 @@ def clone(self, path, **kwargs):
 				path = prev_path
 			# END reset previous working dir
 		# END bad windows handling
-		return Repo(path)
+		return Repo(path, odbt = odbt)
 
 
 	def archive(self, ostream, treeish=None, prefix=None,  **kwargs):
diff --git a/test/git/performance/lib.py b/test/git/performance/lib.py
@@ -4,6 +4,11 @@
 import shutil
 import tempfile
 
+from git.db import (
+						GitCmdObjectDB,
+						GitDB
+					)
+
 from git import (
 	Repo
 	)
@@ -31,9 +36,14 @@ class TestBigRepoR(TestBase):
 	"""TestCase providing access to readonly 'big' repositories using the following 
 	member variables:
 	
-	* gitrepo
+	* gitrorepo
 	
-	 * Read-Only git repository - actually the repo of git itself"""
+	 * Read-Only git repository - actually the repo of git itself
+	 
+    * puregitrorepo
+    
+     * As gitrorepo, but uses the pure Python implementation
+    """
 	 
 	#{ Invariants
 	head_sha_2k = '235d521da60e4699e5bd59ac658b5b48bd76ddca'
@@ -43,20 +53,23 @@ class TestBigRepoR(TestBase):
 	@classmethod
 	def setUpAll(cls):
 		super(TestBigRepoR, cls).setUpAll()
-		cls.gitrorepo = Repo(resolve_or_fail(k_env_git_repo))
+		repo_path = resolve_or_fail(k_env_git_repo)
+		cls.gitrorepo = Repo(repo_path, odbt=GitCmdObjectDB)
+		cls.puregitrorepo = Repo(repo_path, odbt=GitDB)
 
 
 class TestBigRepoRW(TestBigRepoR):
 	"""As above, but provides a big repository that we can write to.
 	
-	Provides ``self.gitrwrepo``"""
+	Provides ``self.gitrwrepo`` and ``self.puregitrwrepo``"""
 	
 	@classmethod
 	def setUpAll(cls):
 		super(TestBigRepoRW, cls).setUpAll()
 		dirname = tempfile.mktemp()
 		os.mkdir(dirname)
-		cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True)
+		cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True, odbt=GitCmdObjectDB)
+		cls.puregitrwrepo = Repo(dirname, odbt=GitDB)
 	
 	@classmethod
 	def tearDownAll(cls):
diff --git a/test/git/performance/test_odb.py b/test/git/performance/test_odb.py
@@ -12,50 +12,58 @@
 class TestObjDBPerformance(TestBigRepoR):
 	
 	def test_random_access(self):
-		
-		# GET COMMITS
-		# TODO: use the actual db for this
-		st = time()
-		root_commit = self.gitrorepo.commit(self.head_sha_2k)
-		commits = list(root_commit.traverse())
-		nc = len(commits)
-		elapsed = time() - st
-		
-		print >> sys.stderr, "Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" % (nc, elapsed, nc / elapsed)
+		results = [ ["Iterate Commits"], ["Iterate Blobs"], ["Retrieve Blob Data"] ]
+		for repo in (self.gitrorepo, self.puregitrorepo):
+			# GET COMMITS
+			st = time()
+			root_commit = repo.commit(self.head_sha_2k)
+			commits = list(root_commit.traverse())
+			nc = len(commits)
+			elapsed = time() - st
 			
+			print >> sys.stderr, "%s: Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" % (type(repo.odb), nc, elapsed, nc / elapsed)
+			results[0].append(elapsed)
+				
+			# GET TREES
+			# walk all trees of all commits
+			st = time()
+			blobs_per_commit = list()
+			nt = 0
+			for commit in commits:
+				tree = commit.tree
+				blobs = list()
+				for item in tree.traverse():
+					nt += 1
+					if item.type == 'blob':
+						blobs.append(item)
+					# direct access for speed
+				# END while trees are there for walking
+				blobs_per_commit.append(blobs)
+			# END for each commit
+			elapsed = time() - st
 			
-		# GET TREES
-		# walk all trees of all commits
-		st = time()
-		blobs_per_commit = list()
-		nt = 0
-		for commit in commits:
-			tree = commit.tree
-			blobs = list()
-			for item in tree.traverse():
-				nt += 1
-				if item.type == 'blob':
-					blobs.append(item)
-				# direct access for speed
-			# END while trees are there for walking
-			blobs_per_commit.append(blobs)
-		# END for each commit
-		elapsed = time() - st
-		
-		print >> sys.stderr, "Retrieved %i objects from %i commits in %g s ( %f objects / s )" % (nt, len(commits), elapsed, nt / elapsed)
-		
-		# GET BLOBS
-		st = time()
-		nb = 0
-		too_many = 15000
-		for blob_list in blobs_per_commit:
-			for blob in blob_list:
-				blob.data
-			# END for each blobsha
-			nb += len(blob_list)
-			if nb > too_many:
-				break
-		# END for each bloblist
-		elapsed = time() - st
+			print >> sys.stderr, "%s: Retrieved %i objects from %i commits in %g s ( %f objects / s )" % (type(repo.odb), nt, len(commits), elapsed, nt / elapsed)
+			results[1].append(elapsed)
+			
+			# GET BLOBS
+			st = time()
+			nb = 0
+			too_many = 15000
+			for blob_list in blobs_per_commit:
+				for blob in blob_list:
+					blob.data
+				# END for each blobsha
+				nb += len(blob_list)
+				if nb > too_many:
+					break
+			# END for each bloblist
+			elapsed = time() - st
+			
+			print >> sys.stderr, "%s: Retrieved %i blob and their data in %g s ( %f blobs / s )" % (type(repo.odb), nb, elapsed, nb / elapsed)
+			results[2].append(elapsed)
+		# END for each repo type
 		
-		print >> sys.stderr, "Retrieved %i blob and their data in %g s ( %f blobs / s )" % (nb, elapsed, nb / elapsed)
+		# final results
+		for test_name, a, b in results:
+			print >> sys.stderr, "%s: %f s vs %f s, pure is %f times slower" % (test_name, a, b, b / a)
+		# END for each result
diff --git a/test/git/test_index.py b/test/git/test_index.py