Skip to content

Commit f0c05ea

Browse files
committed
util: pick the type of memory manager based on the python version, to have optimal results in all cases (at least the ones I can test)
pack: now works properly with a sliding memory manager. test_packedodb_pure: fixed a very memory-hungry implementation by using an iterator. This will of course reduce the measured performance a bit, but 750MB of memory is just a little bit too much for an ordinary test. Maybe it would be alright to just reduce the number of items ... but performance isn't a strength of python after all.
1 parent 21499d9 commit f0c05ea

File tree

3 files changed

+28
-18
lines changed

3 files changed

+28
-18
lines changed

git/pack.py

+19-13
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373

7474
#{ Utilities
7575

76-
def pack_object_at(data, offset, as_stream):
76+
def pack_object_at(cursor, offset, as_stream):
7777
"""
7878
:return: Tuple(abs_data_offset, PackInfo|PackStream)
7979
an object of the correct type according to the type_id of the object.
@@ -83,7 +83,7 @@ def pack_object_at(data, offset, as_stream):
8383
:param offset: offset into the data at which the object information is located
8484
:param as_stream: if True, a stream object will be returned that can read
8585
the data, otherwise you receive an info object only"""
86-
data = buffer(data, offset)
86+
data = cursor.use_region(offset).buffer()
8787
type_id, uncomp_size, data_rela_offset = pack_object_header_info(data)
8888
total_rela_offset = None # set later, actual offset until data stream begins
8989
delta_info = None
@@ -269,6 +269,10 @@ def _set_cache_(self, attr):
269269
# that we can actually write to the location - it could be a read-only
270270
# alternate for instance
271271
self._cursor = mman.make_cursor(self._indexpath).use_region()
272+
# We will assume that the index will always fully fit into memory !
273+
if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size():
274+
raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % (self._indexpath, self._cursor.file_size(), mman.window_size()))
275+
#END assert window size
272276
else:
273277
# now its time to initialize everything - if we are here, someone wants
274278
# to access the fanout table or related properties
@@ -527,13 +531,13 @@ def _set_cache_(self, attr):
527531

528532
def _iter_objects(self, start_offset, as_stream=True):
529533
"""Handle the actual iteration of objects within this pack"""
530-
data = self._cursor.map()
531-
content_size = len(data) - self.footer_size
534+
c = self._cursor
535+
content_size = c.file_size() - self.footer_size
532536
cur_offset = start_offset or self.first_object_offset
533537

534538
null = NullStream()
535539
while cur_offset < content_size:
536-
data_offset, ostream = pack_object_at(data, cur_offset, True)
540+
data_offset, ostream = pack_object_at(c, cur_offset, True)
537541
# scrub the stream to the end - this decompresses the object, but yields
538542
# the amount of compressed bytes we need to get to the next offset
539543

@@ -562,12 +566,14 @@ def version(self):
562566
def data(self):
563567
"""
564568
:return: read-only data of this pack. It provides random access and usually
565-
is a memory map"""
566-
return self._cursor.map()
569+
is a memory map.
570+
:note: This method is unsafe as it returns a window into a file which might be larger than the actual window size"""
571+
# can use map as we are starting at offset 0. Otherwise we would have to use buffer()
572+
return self._cursor.use_region().map()
567573

568574
def checksum(self):
569575
""":return: 20 byte sha1 hash on all object sha's contained in this file"""
570-
return self._cursor.map()[-20:]
576+
return self._cursor.use_region(self._cursor.file_size()-20).buffer()[:]
571577

572578
def path(self):
573579
""":return: path to the packfile"""
@@ -586,9 +592,9 @@ def collect_streams(self, offset):
586592
If the object at offset is no delta, the size of the list is 1.
587593
:param offset: specifies the first byte of the object within this pack"""
588594
out = list()
589-
data = self._cursor.map()
595+
c = self._cursor
590596
while True:
591-
ostream = pack_object_at(data, offset, True)[1]
597+
ostream = pack_object_at(c, offset, True)[1]
592598
out.append(ostream)
593599
if ostream.type_id == OFS_DELTA:
594600
offset = ostream.pack_offset - ostream.delta_info
@@ -610,14 +616,14 @@ def info(self, offset):
610616
611617
:param offset: byte offset
612618
:return: OPackInfo instance, the actual type differs depending on the type_id attribute"""
613-
return pack_object_at(self._cursor.map(), offset or self.first_object_offset, False)[1]
619+
return pack_object_at(self._cursor, offset or self.first_object_offset, False)[1]
614620

615621
def stream(self, offset):
616622
"""Retrieve an object at the given file-relative offset as stream along with its information
617623
618624
:param offset: byte offset
619625
:return: OPackStream instance, the actual type differs depending on the type_id attribute"""
620-
return pack_object_at(self._cursor.map(), offset or self.first_object_offset, True)[1]
626+
return pack_object_at(self._cursor, offset or self.first_object_offset, True)[1]
621627

622628
def stream_iter(self, start_offset=0):
623629
"""
@@ -700,7 +706,7 @@ def _object(self, sha, as_stream, index=-1):
700706
sha = self._index.sha(index)
701707
# END assure sha is present ( in output )
702708
offset = self._index.offset(index)
703-
type_id, uncomp_size, data_rela_offset = pack_object_header_info(buffer(self._pack._cursor.map(), offset))
709+
type_id, uncomp_size, data_rela_offset = pack_object_header_info(self._pack._cursor.use_region(offset).buffer())
704710
if as_stream:
705711
if type_id not in delta_types:
706712
packstream = self._pack.stream(offset)

git/test/performance/db/test_packedodb_pure.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -49,18 +49,17 @@ def test_pack_writing(self):
4949
count = 0
5050
total_size = 0
5151
st = time()
52-
objs = list()
5352
for sha in rorepo.sha_iter():
5453
count += 1
55-
objs.append(rorepo.stream(sha))
54+
rorepo.stream(sha)
5655
if count == ni:
5756
break
5857
#END gather objects for pack-writing
5958
elapsed = time() - st
60-
print >> sys.stderr, "PDB Streaming: Got %i streams from %s by sha in in %f s ( %f streams/s )" % (ni, rorepo.__class__.__name__, elapsed, ni / elapsed)
59+
print >> sys.stderr, "PDB Streaming: Got %i streams from %s by sha in in %f s ( %f streams/s )" % (count, rorepo.__class__.__name__, elapsed, count / elapsed)
6160

6261
st = time()
63-
PackEntity.write_pack(objs, ostream.write)
62+
PackEntity.write_pack((rorepo.stream(sha) for sha in rorepo.sha_iter()), ostream.write, object_count=ni)
6463
elapsed = time() - st
6564
total_kb = ostream.bytes_written() / 1000
6665
print >> sys.stderr, "PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)" % (total_kb, elapsed, total_kb/elapsed)

git/util.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import tempfile
1818
from smmap import (
1919
StaticWindowMapManager,
20+
SlidingWindowMapManager,
2021
SlidingWindowMapBuffer
2122
)
2223

@@ -72,7 +73,11 @@ def unpack_from(fmt, data, offset=0):
7273

7374
# initialize our global memory manager instance
7475
# Use it to free cached (and unused) resources.
75-
mman = StaticWindowMapManager()
76+
if sys.version_info[1] < 6:
77+
mman = StaticWindowMapManager()
78+
else:
79+
mman = SlidingWindowMapManager()
80+
#END handle mman
7681

7782
#} END globals
7883

0 commit comments

Comments
 (0)