Skip to content

Commit ac62760

Browse files
committed
index.add: now uses gitdb.store functionality instead of git-hash-object. The Python version is about as fast, but could support multithreading using async
1 parent f164627 commit ac62760

File tree

5 files changed

+678
-647
lines changed

5 files changed

+678
-647
lines changed

lib/git/index/base.py

Lines changed: 46 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import stat
1313
import subprocess
1414
import glob
15+
from cStringIO import StringIO
1516

1617
from typ import *
1718
from util import (
@@ -48,6 +49,10 @@
4849
)
4950

5051

52+
from gitdb.base import (
53+
IStream
54+
)
55+
5156
__all__ = ( 'IndexFile', 'CheckoutError' )
5257

5358

@@ -255,9 +260,6 @@ def write(self, file_path = None, ignore_tree_extension_data=False):
255260
256261
Returns
257262
self
258-
259-
Note
260-
Index writing based on the dulwich implementation
261263
"""
262264
lfd = LockedFD(file_path or self._file_path)
263265
stream = lfd.open(write=True, stream=True)
@@ -634,12 +636,10 @@ def _preprocess_add_items(self, items):
634636
# END for each item
635637
return (paths, entries)
636638

637-
638639
@clear_cache
639640
@default_index
640641
def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=None):
641-
"""
642-
Add files from the working tree, specific blobs or BaseIndexEntries
642+
"""Add files from the working tree, specific blobs or BaseIndexEntries
643643
to the index. The underlying index file will be written immediately, hence
644644
you should provide as many items as possible to minimize the amounts of writes
645645
@@ -695,7 +695,7 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=Non
695695
696696
:param fprogress:
697697
Function with signature f(path, done=False, item=item) called for each
698-
path to be added, once once it is about to be added where done==False
698+
path to be added, once when it is about to be added where done==False
699699
and once after it was added where done=True.
700700
item is set to the actual item we handle, either a Path or a BaseIndexEntry
701701
Please note that the processed path is not guaranteed to be present
@@ -713,8 +713,8 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=Non
713713
:return:
714714
List(BaseIndexEntries) representing the entries just actually added.
715715
716-
Raises
717-
GitCommandError if a supplied Path did not exist. Please note that BaseIndexEntry
716+
:raise OSError:
717+
if a supplied Path did not exist. Please note that BaseIndexEntry
718718
Objects that do not have a null sha will be added even if their paths
719719
do not exist.
720720
"""
@@ -734,28 +734,45 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=Non
734734
del(paths[:])
735735
# END rewrite paths
736736

737+
738+
def store_path(filepath):
739+
"""Store file at filepath in the database and return the base index entry"""
740+
st = os.lstat(filepath) # handles non-symlinks as well
741+
stream = None
742+
if stat.S_ISLNK(st.st_mode):
743+
stream = StringIO(os.readlink(filepath))
744+
else:
745+
stream = open(filepath, 'rb')
746+
# END handle stream
747+
fprogress(filepath, False, filepath)
748+
istream = self.repo.odb.store(IStream(Blob.type, st.st_size, stream))
749+
fprogress(filepath, True, filepath)
750+
751+
return BaseIndexEntry((st.st_mode, istream.sha, 0, filepath))
752+
# END utility method
753+
754+
737755
# HANDLE PATHS
738756
if paths:
739-
# to get suitable progress information, pipe paths to stdin
740-
args = ("--add", "--replace", "--verbose", "--stdin")
741-
proc = self.repo.git.update_index(*args, **{'as_process':True, 'istream':subprocess.PIPE})
742-
make_exc = lambda : GitCommandError(("git-update-index",)+args, 128, proc.stderr.read())
757+
assert len(entries_added) == 0
743758
added_files = list()
744-
745759
for filepath in self._iter_expand_paths(paths):
746-
self._write_path_to_stdin(proc, filepath, filepath, make_exc,
747-
fprogress, read_from_stdout=False)
748-
added_files.append(filepath)
760+
entries_added.append(store_path(filepath))
749761
# END for each filepath
750-
self._flush_stdin_and_wait(proc, ignore_stdout=True) # ignore stdout
762+
763+
# add the new entries to this instance, and write it
764+
for entry in entries_added:
765+
self.entries[(entry.path, 0)] = IndexEntry.from_base(entry)
751766

752-
# force rereading our entries once it is all done
753-
self._delete_entries_cache()
754-
entries_added.extend(self.entries[(f,0)] for f in added_files)
767+
# finally write the changed index
768+
self.write()
755769
# END path handling
756770

771+
757772
# HANDLE ENTRIES
758773
if entries:
774+
# TODO: Add proper IndexEntries to ourselves, and write the index
775+
# just once. Currently it's done at least twice
759776
null_mode_entries = [ e for e in entries if e.mode == 0 ]
760777
if null_mode_entries:
761778
raise ValueError("At least one Entry has a null-mode - please use index.remove to remove files for clarity")
@@ -765,37 +782,22 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=Non
765782
# create objects if required, otherwise go with the existing shas
766783
null_entries_indices = [ i for i,e in enumerate(entries) if e.sha == Object.NULL_HEX_SHA ]
767784
if null_entries_indices:
768-
# creating object ids is the time consuming part. Hence we will
769-
# send progress for these now.
770-
args = ("-w", "--stdin-paths")
771-
proc = self.repo.git.hash_object(*args, **{'istream':subprocess.PIPE, 'as_process':True})
772-
make_exc = lambda : GitCommandError(("git-hash-object",)+args, 128, proc.stderr.read())
773-
obj_ids = list()
774785
for ei in null_entries_indices:
775-
entry = entries[ei]
776-
obj_ids.append(self._write_path_to_stdin(proc, entry.path, entry,
777-
make_exc, fprogress, read_from_stdout=True))
786+
null_entry = entries[ei]
787+
new_entry = store_path(null_entry.path)
788+
789+
# update null entry
790+
entries[ei] = BaseIndexEntry((null_entry.mode, new_entry.sha, null_entry.stage, null_entry.path))
778791
# END for each entry index
779-
assert len(obj_ids) == len(null_entries_indices), "git-hash-object did not produce all requested objects: want %i, got %i" % ( len(null_entries_indices), len(obj_ids) )
780-
781-
# update IndexEntries with new object id
782-
for i,new_sha in zip(null_entries_indices, obj_ids):
783-
e = entries[i]
784-
785-
new_entry = BaseIndexEntry((e.mode, new_sha, e.stage, e.path))
786-
entries[i] = new_entry
787-
# END for each index
788792
# END null_entry handling
789793

790794
# REWRITE PATHS
791795
# If we have to rewrite the entries, do so now, after we have generated
792796
# all object sha's
793797
if path_rewriter:
794-
new_entries = list()
795-
for e in entries:
796-
new_entries.append(BaseIndexEntry((e.mode, e.sha, e.stage, path_rewriter(e))))
798+
for i,e in enumerate(entries):
799+
entries[i] = BaseIndexEntry((e.mode, e.sha, e.stage, path_rewriter(e)))
797800
# END for each entry
798-
entries = new_entries
799801
# END handle path rewriting
800802

801803
# feed pure entries to stdin
@@ -821,7 +823,7 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=Non
821823
self._flush_stdin_and_wait(proc, ignore_stdout=True)
822824
entries_added.extend(entries)
823825
# END if there are base entries
824-
826+
825827
return entries_added
826828

827829
def _items_to_rela_paths(self, items):

lib/git/repo.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -723,16 +723,17 @@ def init(cls, path=None, mkdir=True, **kwargs):
723723
return Repo(path)
724724

725725
def clone(self, path, **kwargs):
726-
"""
727-
Create a clone from this repository.
728-
729-
``path``
726+
"""Create a clone from this repository.
727+
:param path:
730728
is the full path of the new repo (traditionally ends with ./<name>.git).
731729
732-
``kwargs``
733-
keyword arguments to be given to the git-clone command
734-
735-
Returns
730+
:param kwargs:
731+
odbt = ObjectDatabase Type, which determines the object database
732+
implementation used by the returned Repo instance
733+
734+
All remaining keyword arguments are given to the git-clone command
735+
736+
:return:
736737
``git.Repo`` (the newly cloned repo)
737738
"""
738739
# special handling for windows for path at which the clone should be
@@ -741,6 +742,7 @@ def clone(self, path, **kwargs):
741742
# we at least give a proper error instead of letting git fail
742743
prev_cwd = None
743744
prev_path = None
745+
odbt = kwargs.pop('odbt', GitCmdObjectDB)
744746
if os.name == 'nt':
745747
if '~' in path:
746748
raise OSError("Git cannot handle the ~ character in path %r correctly" % path)
@@ -767,7 +769,7 @@ def clone(self, path, **kwargs):
767769
path = prev_path
768770
# END reset previous working dir
769771
# END bad windows handling
770-
return Repo(path)
772+
return Repo(path, odbt = odbt)
771773

772774

773775
def archive(self, ostream, treeish=None, prefix=None, **kwargs):

test/git/performance/lib.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
import shutil
55
import tempfile
66

7+
from git.db import (
8+
GitCmdObjectDB,
9+
GitDB
10+
)
11+
712
from git import (
813
Repo
914
)
@@ -31,9 +36,14 @@ class TestBigRepoR(TestBase):
3136
"""TestCase providing access to readonly 'big' repositories using the following
3237
member variables:
3338
34-
* gitrepo
39+
* gitrorepo
3540
36-
* Read-Only git repository - actually the repo of git itself"""
41+
* Read-Only git repository - actually the repo of git itself
42+
43+
* puregitrorepo
44+
45+
* As gitrorepo, but uses the pure Python implementation
46+
"""
3747

3848
#{ Invariants
3949
head_sha_2k = '235d521da60e4699e5bd59ac658b5b48bd76ddca'
@@ -43,20 +53,23 @@ class TestBigRepoR(TestBase):
4353
@classmethod
4454
def setUpAll(cls):
4555
super(TestBigRepoR, cls).setUpAll()
46-
cls.gitrorepo = Repo(resolve_or_fail(k_env_git_repo))
56+
repo_path = resolve_or_fail(k_env_git_repo)
57+
cls.gitrorepo = Repo(repo_path, odbt=GitCmdObjectDB)
58+
cls.puregitrorepo = Repo(repo_path, odbt=GitDB)
4759

4860

4961
class TestBigRepoRW(TestBigRepoR):
5062
"""As above, but provides a big repository that we can write to.
5163
52-
Provides ``self.gitrwrepo``"""
64+
Provides ``self.gitrwrepo`` and ``self.puregitrwrepo``"""
5365

5466
@classmethod
5567
def setUpAll(cls):
5668
super(TestBigRepoRW, cls).setUpAll()
5769
dirname = tempfile.mktemp()
5870
os.mkdir(dirname)
59-
cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True)
71+
cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True, odbt=GitCmdObjectDB)
72+
cls.puregitrwrepo = Repo(dirname, odbt=GitDB)
6073

6174
@classmethod
6275
def tearDownAll(cls):

test/git/performance/test_odb.py

Lines changed: 52 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -12,50 +12,58 @@
1212
class TestObjDBPerformance(TestBigRepoR):
1313

1414
def test_random_access(self):
15-
16-
# GET COMMITS
17-
# TODO: use the actual db for this
18-
st = time()
19-
root_commit = self.gitrorepo.commit(self.head_sha_2k)
20-
commits = list(root_commit.traverse())
21-
nc = len(commits)
22-
elapsed = time() - st
23-
24-
print >> sys.stderr, "Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" % (nc, elapsed, nc / elapsed)
15+
results = [ ["Iterate Commits"], ["Iterate Blobs"], ["Retrieve Blob Data"] ]
16+
for repo in (self.gitrorepo, self.puregitrorepo):
17+
# GET COMMITS
18+
st = time()
19+
root_commit = repo.commit(self.head_sha_2k)
20+
commits = list(root_commit.traverse())
21+
nc = len(commits)
22+
elapsed = time() - st
2523

24+
print >> sys.stderr, "%s: Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" % (type(repo.odb), nc, elapsed, nc / elapsed)
25+
results[0].append(elapsed)
26+
27+
# GET TREES
28+
# walk all trees of all commits
29+
st = time()
30+
blobs_per_commit = list()
31+
nt = 0
32+
for commit in commits:
33+
tree = commit.tree
34+
blobs = list()
35+
for item in tree.traverse():
36+
nt += 1
37+
if item.type == 'blob':
38+
blobs.append(item)
39+
# direct access for speed
40+
# END while trees are there for walking
41+
blobs_per_commit.append(blobs)
42+
# END for each commit
43+
elapsed = time() - st
2644

27-
# GET TREES
28-
# walk all trees of all commits
29-
st = time()
30-
blobs_per_commit = list()
31-
nt = 0
32-
for commit in commits:
33-
tree = commit.tree
34-
blobs = list()
35-
for item in tree.traverse():
36-
nt += 1
37-
if item.type == 'blob':
38-
blobs.append(item)
39-
# direct access for speed
40-
# END while trees are there for walking
41-
blobs_per_commit.append(blobs)
42-
# END for each commit
43-
elapsed = time() - st
44-
45-
print >> sys.stderr, "Retrieved %i objects from %i commits in %g s ( %f objects / s )" % (nt, len(commits), elapsed, nt / elapsed)
46-
47-
# GET BLOBS
48-
st = time()
49-
nb = 0
50-
too_many = 15000
51-
for blob_list in blobs_per_commit:
52-
for blob in blob_list:
53-
blob.data
54-
# END for each blobsha
55-
nb += len(blob_list)
56-
if nb > too_many:
57-
break
58-
# END for each bloblist
59-
elapsed = time() - st
45+
print >> sys.stderr, "%s: Retrieved %i objects from %i commits in %g s ( %f objects / s )" % (type(repo.odb), nt, len(commits), elapsed, nt / elapsed)
46+
results[1].append(elapsed)
47+
48+
# GET BLOBS
49+
st = time()
50+
nb = 0
51+
too_many = 15000
52+
for blob_list in blobs_per_commit:
53+
for blob in blob_list:
54+
blob.data
55+
# END for each blobsha
56+
nb += len(blob_list)
57+
if nb > too_many:
58+
break
59+
# END for each bloblist
60+
elapsed = time() - st
61+
62+
print >> sys.stderr, "%s: Retrieved %i blob and their data in %g s ( %f blobs / s )" % (type(repo.odb), nb, elapsed, nb / elapsed)
63+
results[2].append(elapsed)
64+
# END for each repo type
6065

61-
print >> sys.stderr, "Retrieved %i blob and their data in %g s ( %f blobs / s )" % (nb, elapsed, nb / elapsed)
66+
# final results
67+
for test_name, a, b in results:
68+
print >> sys.stderr, "%s: %f s vs %f s, pure is %f times slower" % (test_name, a, b, b / a)
69+
# END for each result

0 commit comments

Comments
 (0)