Skip to content

Commit 1e2b461

Browse files
committed
commit.create_from_tree now uses a pure-Python implementation; fixed message parsing, which truncated newlines although doing so was illegitimate. It's up to the reader to truncate these; nowhere in the git code could I find anyone adding newlines to commits where they are written
Added performance tests for serialization; it does about 5k commits per second when writing to tmpfs
1 parent 4b4a514 commit 1e2b461

15 files changed

+230
-82
lines changed

CHANGES

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ CHANGES
55
0.2 Beta 2
66
===========
77
* Commit objects now carry the 'encoding' information of their message. It wasn't parsed previously, and defaults to UTF-8
8+
* Commit.create_from_tree now uses a pure-python implementation, mimicking git-commit-tree
89

910
0.2
1011
=====

lib/git/cmd.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -323,12 +323,7 @@ def execute(self, command,
323323
stdout_value = proc.stdout.read().rstrip() # strip trailing "\n"
324324
else:
325325
max_chunk_size = 1024*64
326-
while True:
327-
chunk = proc.stdout.read(max_chunk_size)
328-
output_stream.write(chunk)
329-
if len(chunk) < max_chunk_size:
330-
break
331-
# END reading output stream
326+
stream_copy(proc.stdout, output_stream, max_chunk_size)
332327
stdout_value = output_stream
333328
# END stdout handling
334329
stderr_value = proc.stderr.read().rstrip() # strip trailing "\n"

lib/git/objects/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,8 @@ def data_stream(self):
125125
Returns
126126
File Object compatible stream to the uncompressed raw data of the object
127127
"""
128-
sha, type, size, stream = self.repo.git.stream_object_data(self.sha)
129-
return stream
128+
proc = self.repo.git.cat_file(self.type, self.sha, as_process=True)
129+
return utils.ProcessStreamAdapter(proc, "stdout")
130130

131131
def stream_data(self, ostream):
132132
"""

lib/git/objects/commit.py

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -91,15 +91,6 @@ def __init__(self, repo, sha, tree=None, author=None, authored_date=None, author
9191
"""
9292
super(Commit,self).__init__(repo, sha)
9393
self._set_self_from_args_(locals())
94-
95-
if parents is not None:
96-
cls = type(self)
97-
self.parents = tuple(cls(repo, p) for p in parents if not isinstance(p, cls))
98-
# END for each parent to convert
99-
100-
if self.sha and tree is not None:
101-
self.tree = Tree(repo, tree, path='')
102-
# END id to tree conversion
10394

10495
@classmethod
10596
def _get_intermediate_items(cls, commit):
@@ -350,7 +341,12 @@ def create_from_tree(cls, repo, tree, message, parent_commits=None, head=False):
350341
committer, committer_time, committer_offset,
351342
message, parent_commits, conf_encoding)
352343

353-
# serialize !
344+
stream = StringIO()
345+
new_commit._serialize(stream)
346+
streamlen = stream.tell()
347+
stream.seek(0)
348+
349+
new_commit.sha = repo.odb.to_object(cls.type, streamlen, stream, sha_as_hex=True)
354350

355351
if head:
356352
try:
@@ -377,8 +373,28 @@ def __repr__(self):
377373
#{ Serializable Implementation
378374

379375
def _serialize(self, stream):
380-
# for now, this is very inefficient and in fact shouldn't be used like this
381-
return super(Commit, self)._serialize(stream)
376+
write = stream.write
377+
write("tree %s\n" % self.tree)
378+
for p in self.parents:
379+
write("parent %s\n" % p)
380+
381+
a = self.author
382+
c = self.committer
383+
fmt = "%s %s <%s> %s %s\n"
384+
write(fmt % ("author", a.name, a.email,
385+
self.authored_date,
386+
utils.altz_to_utctz_str(self.author_tz_offset)))
387+
388+
write(fmt % ("committer", c.name, c.email,
389+
self.committed_date,
390+
utils.altz_to_utctz_str(self.committer_tz_offset)))
391+
392+
if self.encoding != self.default_encoding:
393+
write("encoding %s\n" % self.encoding)
394+
395+
write("\n")
396+
write(self.message)
397+
return self
382398

383399
def _deserialize(self, stream):
384400
""":param from_rev_list: if true, the stream format is coming from the rev-list command
@@ -416,7 +432,7 @@ def _deserialize(self, stream):
416432

417433
# a stream from our data simply gives us the plain message
418434
# The end of our message stream is marked with a newline that we strip
419-
self.message = stream.read()[:-1]
435+
self.message = stream.read()
420436
return self
421437

422438
#} END serializable implementation

lib/git/objects/utils.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
import os
1717

1818
__all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date',
19-
'ProcessStreamAdapter', 'Traversable')
19+
'ProcessStreamAdapter', 'Traversable', 'altz_to_utctz_str', 'utctz_to_altz',
20+
'verify_utctz')
2021

2122
def get_object_type_by_name(object_type_name):
2223
"""
@@ -57,14 +58,24 @@ def get_user_id():
5758
return "%s@%s" % (username, platform.node())
5859

5960

60-
def _utc_tz_to_altz(utctz):
61+
def utctz_to_altz(utctz):
6162
"""we convert utctz to the timezone in seconds, it is the format time.altzone
6263
returns. Git stores it as UTC timezon which has the opposite sign as well,
6364
which explains the -1 * ( that was made explicit here )
6465
:param utctz: git utc timezone string, i.e. +0200"""
6566
return -1 * int(float(utctz)/100*3600)
67+
68+
def altz_to_utctz_str(altz):
69+
"""As above, but inverses the operation, returning a string that can be used
70+
in commit objects"""
71+
utci = -1 * int((altz / 3600)*100)
72+
utcs = str(abs(utci))
73+
utcs = "0"*(4-len(utcs)) + utcs
74+
prefix = (utci < 0 and '-') or '+'
75+
return prefix + utcs
76+
6677

67-
def _verify_utctz(offset):
78+
def verify_utctz(offset):
6879
""":raise ValueError: if offset is incorrect
6980
:return: offset"""
7081
fmt_exc = ValueError("Invalid timezone offset format: %s" % offset)
@@ -97,11 +108,11 @@ def parse_date(string_date):
97108
if string_date.count(' ') == 1 and string_date.rfind(':') == -1:
98109
timestamp, offset = string_date.split()
99110
timestamp = int(timestamp)
100-
return timestamp, _utc_tz_to_altz(_verify_utctz(offset))
111+
return timestamp, utctz_to_altz(verify_utctz(offset))
101112
else:
102113
offset = "+0000" # local time by default
103114
if string_date[-5] in '-+':
104-
offset = _verify_utctz(string_date[-5:])
115+
offset = verify_utctz(string_date[-5:])
105116
string_date = string_date[:-6] # skip space as well
106117
# END split timezone info
107118

@@ -139,7 +150,7 @@ def parse_date(string_date):
139150
fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday,
140151
tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec,
141152
dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst))
142-
return int(time.mktime(fstruct)), _utc_tz_to_altz(offset)
153+
return int(time.mktime(fstruct)), utctz_to_altz(offset)
143154
except ValueError:
144155
continue
145156
# END exception handling
@@ -167,7 +178,7 @@ def parse_actor_and_date(line):
167178
"""
168179
m = _re_actor_epoch.search(line)
169180
actor, epoch, offset = m.groups()
170-
return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset))
181+
return (Actor._from_string(actor), int(epoch), utctz_to_altz(offset))
171182

172183

173184

lib/git/odb/utils.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def initialize(self, size=0):
137137
# END handle size
138138

139139
# read header
140-
maxb = 8192
140+
maxb = 512 # should really be enough, cgit uses 8192 I believe
141141
self._s = maxb
142142
hdr = self.read(maxb)
143143
hdrend = hdr.find("\0")
@@ -172,20 +172,24 @@ def read(self, size=-1):
172172
# Our performance now depends on StringIO. This way we don't need two large
173173
# buffers in peak times, but only one large one in the end which is
174174
# the return buffer
175-
if size > self.max_read_size:
176-
sio = StringIO()
177-
while size:
178-
read_size = min(self.max_read_size, size)
179-
data = self.read(read_size)
180-
sio.write(data)
181-
size -= len(data)
182-
if len(data) < read_size:
183-
break
184-
# END data loop
185-
sio.seek(0)
186-
return sio.getvalue()
187-
# END handle maxread
175+
# NO: We don't do it - if the user thinks its best, he is right. If he
176+
# has trouble, he will start reading in chunks. According to our tests
177+
# its still faster if we read 10 Mb at once instead of chunking it.
188178

179+
# if size > self.max_read_size:
180+
# sio = StringIO()
181+
# while size:
182+
# read_size = min(self.max_read_size, size)
183+
# data = self.read(read_size)
184+
# sio.write(data)
185+
# size -= len(data)
186+
# if len(data) < read_size:
187+
# break
188+
# # END data loop
189+
# sio.seek(0)
190+
# return sio.getvalue()
191+
# # END handle maxread
192+
#
189193
# deplete the buffer, then just continue using the decompress object
190194
# which has an own buffer. We just need this to transparently parse the
191195
# header from the zlib stream

lib/git/repo.py

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,6 @@
44
# This module is part of GitPython and is released under
55
# the BSD License: http://www.opensource.org/licenses/bsd-license.php
66

7-
import os
8-
import sys
9-
import re
10-
import gzip
11-
import StringIO
12-
137
from errors import InvalidGitRepositoryError, NoSuchPathError
148
from cmd import Git
159
from actor import Actor
@@ -19,6 +13,15 @@
1913
from config import GitConfigParser
2014
from remote import Remote
2115

16+
from odb.db import LooseObjectDB
17+
18+
import os
19+
import sys
20+
import re
21+
import gzip
22+
import StringIO
23+
24+
2225
def touch(filename):
2326
fp = open(filename, "a")
2427
fp.close()
@@ -53,7 +56,7 @@ class Repo(object):
5356
'git_dir' is the .git repository directoy, which is always set.
5457
"""
5558
DAEMON_EXPORT_FILE = 'git-daemon-export-ok'
56-
__slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git" )
59+
__slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git", "odb" )
5760

5861
# precompiled regex
5962
re_whitespace = re.compile(r'\s+')
@@ -65,27 +68,22 @@ class Repo(object):
6568
# represents the configuration level of a configuration file
6669
config_level = ("system", "global", "repository")
6770

68-
def __init__(self, path=None):
69-
"""
70-
Create a new Repo instance
71-
72-
``path``
73-
is the path to either the root git directory or the bare git repo
71+
def __init__(self, path=None, odbt = LooseObjectDB):
72+
""" Create a new Repo instance
7473
75-
Examples::
74+
:param path: is the path to either the root git directory or the bare git repo::
7675
7776
repo = Repo("/Users/mtrier/Development/git-python")
7877
repo = Repo("/Users/mtrier/Development/git-python.git")
7978
repo = Repo("~/Development/git-python.git")
8079
repo = Repo("$REPOSITORIES/Development/git-python.git")
81-
82-
Raises
83-
InvalidGitRepositoryError or NoSuchPathError
84-
85-
Returns
86-
``git.Repo``
87-
"""
88-
80+
81+
:param odbt: Object DataBase type - a type which is constructed by providing
82+
the directory containing the database objects, i.e. .git/objects. It will
83+
be used to access all object data
84+
:raise InvalidGitRepositoryError:
85+
:raise NoSuchPathError:
86+
:return: git.Repo """
8987
epath = os.path.abspath(os.path.expandvars(os.path.expanduser(path or os.getcwd())))
9088

9189
if not os.path.exists(epath):
@@ -130,6 +128,7 @@ def __init__(self, path=None):
130128

131129
self.working_dir = self._working_tree_dir or self.git_dir
132130
self.git = Git(self.working_dir)
131+
self.odb = odbt(os.path.join(self.git_dir, 'objects'))
133132

134133
def __eq__(self, rhs):
135134
if isinstance(rhs, Repo):

lib/git/utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,21 @@ def make_sha(source=''):
2727
sha1 = sha.sha(source)
2828
return sha1
2929

30+
def stream_copy(source, destination, chunk_size=512*1024):
31+
"""Copy all data from the source stream into the destination stream in chunks
32+
of size chunk_size
33+
:return: amount of bytes written"""
34+
br = 0
35+
while True:
36+
chunk = source.read(chunk_size)
37+
destination.write(chunk)
38+
br += len(chunk)
39+
if len(chunk) < chunk_size:
40+
break
41+
# END reading output stream
42+
return br
43+
44+
3045
def join_path(a, *p):
3146
"""Join path tokens together similar to os.path.join, but always use
3247
'/' instead of possibly '\' on windows."""

test/git/performance/lib.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""Contains library functions"""
22
import os
33
from test.testlib import *
4+
import shutil
5+
import tempfile
46

57
from git import (
68
Repo
@@ -25,7 +27,7 @@ def resolve_or_fail(env_var):
2527

2628
#{ Base Classes
2729

28-
class TestBigRepoReadOnly(TestBase):
30+
class TestBigRepoR(TestBase):
2931
"""TestCase providing access to readonly 'big' repositories using the following
3032
member variables:
3133
@@ -40,7 +42,24 @@ class TestBigRepoReadOnly(TestBase):
4042

4143
@classmethod
4244
def setUpAll(cls):
43-
super(TestBigRepoReadOnly, cls).setUpAll()
44-
cls.gitrepo = Repo(resolve_or_fail(k_env_git_repo))
45+
super(TestBigRepoR, cls).setUpAll()
46+
cls.gitrorepo = Repo(resolve_or_fail(k_env_git_repo))
4547

48+
49+
class TestBigRepoRW(TestBigRepoR):
50+
"""As above, but provides a big repository that we can write to.
51+
52+
Provides ``self.gitrwrepo``"""
53+
54+
@classmethod
55+
def setUpAll(cls):
56+
super(TestBigRepoRW, cls).setUpAll()
57+
dirname = tempfile.mktemp()
58+
os.mkdir(dirname)
59+
cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True)
60+
61+
@classmethod
62+
def tearDownAll(cls):
63+
shutil.rmtree(cls.gitrwrepo.working_tree_dir)
64+
4665
#} END base classes

0 commit comments

Comments
 (0)