@@ -10,10 +10,10 @@
 )
 from util import (
     zlib,
+    mman,
     LazyMixin,
     unpack_from,
     bin_to_hex,
-    file_contents_ro_filepath,
 )
 
 from fun import (
@@ -73,7 +73,7 @@
 
 #{ Utilities
 
-def pack_object_at(data, offset, as_stream):
+def pack_object_at(cursor, offset, as_stream):
     """
     :return: Tuple(abs_data_offset, PackInfo|PackStream)
         an object of the correct type according to the type_id of the object.
@@ -83,7 +83,7 @@ def pack_object_at(data, offset, as_stream):
    :param offset: offset into the data at which the object information is located
    :param as_stream: if True, a stream object will be returned that can read
        the data, otherwise you receive an info object only"""
-   data = buffer(data, offset)
+   data = cursor.use_region(offset).buffer()
    type_id, uncomp_size, data_rela_offset = pack_object_header_info(data)
    total_rela_offset = None        # set later, actual offset until data stream begins
    delta_info = None
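
Note: `pack_object_at()` now receives a cursor instead of a raw buffer. A minimal sketch of how such a cursor is obtained and driven, assuming `util.mman` is an smmap-style sliding-window map manager (the path below is made up):

    from smmap import SlidingWindowMapManager

    mman = SlidingWindowMapManager()
    cursor = mman.make_cursor("objects/pack/pack-example.pack")

    # use_region() maps a window that starts at the given absolute offset
    # and returns the cursor itself for chaining
    cursor.use_region(12)               # e.g. skip the 12 byte pack header
    data = cursor.buffer()              # read-only view, relative to that offset
    first_byte = data[0]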
@@ -247,7 +247,7 @@ class PackIndexFile(LazyMixin):
 
    # Don't use slots as we dynamically bind functions for each version, need a dict for this
    # The slots you see here are just to keep track of our instance variables
-   # __slots__ = ('_indexpath', '_fanout_table', '_data', '_version',
+   # __slots__ = ('_indexpath', '_fanout_table', '_cursor', '_version',
    #              '_sha_list_offset', '_crc_list_offset', '_pack_offset', '_pack_64_offset')
 
    # used in v2 indices
@@ -261,22 +261,27 @@ def __init__(self, indexpath):
 
    def _set_cache_(self, attr):
        if attr == "_packfile_checksum":
-           self._packfile_checksum = self._data[-40:-20]
+           self._packfile_checksum = self._cursor.map()[-40:-20]
        elif attr == "_packfile_checksum":
-           self._packfile_checksum = self._data[-20:]
-       elif attr == "_data":
+           self._packfile_checksum = self._cursor.map()[-20:]
+       elif attr == "_cursor":
            # Note: We don't lock the file when reading as we cannot be sure
            # that we can actually write to the location - it could be a read-only
            # alternate for instance
-           self._data = file_contents_ro_filepath(self._indexpath)
+           self._cursor = mman.make_cursor(self._indexpath).use_region()
+           # We will assume that the index will always fully fit into memory!
+           if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size():
+               raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % (self._indexpath, self._cursor.file_size(), mman.window_size()))
+           #END assert window size
        else:
            # now it's time to initialize everything - if we are here, someone wants
            # to access the fanout table or related properties
 
            # CHECK VERSION
-           self._version = (self._data[:4] == self.index_v2_signature and 2) or 1
+           mmap = self._cursor.map()
+           self._version = (mmap[:4] == self.index_v2_signature and 2) or 1
            if self._version == 2:
-               version_id = unpack_from(">L", self._data, 4)[0]
+               version_id = unpack_from(">L", mmap, 4)[0]
                assert version_id == self._version, "Unsupported index version: %i" % version_id
            # END assert version
 
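
The new guard encodes an assumption: the index is only usable if a single mapped window can cover the whole file. The same check in isolation, as a hypothetical helper under the smmap-style API assumed above (`window_size() == 0` meaning an unbounded, whole-file mapper):

    def assert_fits_one_window(mman, indexpath):
        # hypothetical helper mirroring the check above
        cursor = mman.make_cursor(indexpath).use_region()
        if mman.window_size() > 0 and cursor.file_size() > mman.window_size():
            raise AssertionError("index at %s exceeds the window size (%i > %i)"
                                 % (indexpath, cursor.file_size(), mman.window_size()))
        return cursor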
@@ -297,16 +302,16 @@ def _set_cache_(self, attr):
 
    def _entry_v1(self, i):
        """:return: tuple(offset, binsha, 0)"""
-       return unpack_from(">L20s", self._data, 1024 + i*24) + (0,)
+       return unpack_from(">L20s", self._cursor.map(), 1024 + i*24) + (0,)
 
    def _offset_v1(self, i):
        """see ``_offset_v2``"""
-       return unpack_from(">L", self._data, 1024 + i*24)[0]
+       return unpack_from(">L", self._cursor.map(), 1024 + i*24)[0]
 
    def _sha_v1(self, i):
        """see ``_sha_v2``"""
        base = 1024 + (i*24)+4
-       return self._data[base:base+20]
+       return self._cursor.map()[base:base+20]
 
    def _crc_v1(self, i):
        """unsupported"""
@@ -322,25 +327,25 @@ def _entry_v2(self, i):
    def _offset_v2(self, i):
        """:return: 32 or 64 bit offset into pack files. 64 bit offsets will only
            be returned if the pack is larger than 4 GiB, or 2^32"""
-       offset = unpack_from(">L", self._data, self._pack_offset + i * 4)[0]
+       offset = unpack_from(">L", self._cursor.map(), self._pack_offset + i * 4)[0]
 
        # if the high-bit is set, this indicates that we have to lookup the offset
        # in the 64 bit region of the file. The current offset ( lower 31 bits )
        # are the index into it
        if offset & 0x80000000:
-           offset = unpack_from(">Q", self._data, self._pack_64_offset + (offset & ~0x80000000) * 8)[0]
+           offset = unpack_from(">Q", self._cursor.map(), self._pack_64_offset + (offset & ~0x80000000) * 8)[0]
        # END handle 64 bit offset
 
        return offset
 
    def _sha_v2(self, i):
        """:return: sha at the given index of this file index instance"""
        base = self._sha_list_offset + i * 20
-       return self._data[base:base+20]
+       return self._cursor.map()[base:base+20]
 
    def _crc_v2(self, i):
        """:return: 4 bytes crc for the object at index i"""
-       return unpack_from(">L", self._data, self._crc_list_offset + i * 4)[0]
+       return unpack_from(">L", self._cursor.map(), self._crc_list_offset + i * 4)[0]
 
    #} END access V2
 
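
The high-bit branch in `_offset_v2()` implements the index v2 large-offset scheme: a 4 byte entry with its most significant bit set is not an offset but an index into a separate table of 8 byte offsets. A self-contained illustration with fabricated table bytes:

    from struct import pack, unpack_from

    # one 4 byte entry whose high bit is set, pointing at slot 0 of the
    # 64 bit offset table, which holds an offset of 5 GiB
    offset_table = pack(">L", 0x80000000 | 0)
    big_offsets = pack(">Q", 5 * 1024 ** 3)

    offset = unpack_from(">L", offset_table, 0)[0]
    if offset & 0x80000000:
        # the lower 31 bits index the 64 bit table
        offset = unpack_from(">Q", big_offsets, (offset & ~0x80000000) * 8)[0]
    assert offset == 5 * 1024 ** 3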
@@ -358,7 +363,7 @@ def _initialize(self):
 
    def _read_fanout(self, byte_offset):
        """Generate a fanout table from our data"""
-       d = self._data
+       d = self._cursor.map()
        out = list()
        append = out.append
        for i in range(256):
@@ -382,19 +387,19 @@ def path(self):
 
    def packfile_checksum(self):
        """:return: 20 byte sha representing the sha1 hash of the pack file"""
-       return self._data[-40:-20]
+       return self._cursor.map()[-40:-20]
 
    def indexfile_checksum(self):
        """:return: 20 byte sha representing the sha1 hash of this index file"""
-       return self._data[-20:]
+       return self._cursor.map()[-20:]
 
    def offsets(self):
        """:return: sequence of all offsets in the order in which they were written
        :note: return value can be randomly accessed, but may be immutable"""
        if self._version == 2:
            # read stream to array, convert to tuple
            a = array.array('I')    # 4 byte unsigned int, long are 8 byte on 64 bit it appears
-           a.fromstring(buffer(self._data, self._pack_offset, self._pack_64_offset - self._pack_offset))
+           a.fromstring(buffer(self._cursor.map(), self._pack_offset, self._pack_64_offset - self._pack_offset))
 
            # network byte order to something array likes more
            if sys.byteorder == 'little':
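
`offsets()` reads the on-disk table, which is big-endian, into a native array and byteswaps it on little-endian hosts. A tiny demonstration with fabricated offsets:

    import sys
    from array import array
    from struct import pack

    raw = pack(">LLL", 12, 170, 4096)   # three big-endian uint32 offsets

    a = array('I')                      # 4 byte unsigned int
    a.fromstring(raw)                   # Python 2 spelling, as in the code above
    if sys.byteorder == 'little':
        a.byteswap()                    # network order -> native order
    assert list(a) == [12, 170, 4096]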
@@ -501,7 +506,7 @@ class PackFile(LazyMixin):
    for some reason - one clearly doesn't want to read 10GB at once in that
    case"""
 
-   __slots__ = ('_packpath', '_data', '_size', '_version')
+   __slots__ = ('_packpath', '_cursor', '_size', '_version')
    pack_signature = 0x5041434b         # 'PACK'
    pack_version_default = 2
 
@@ -513,32 +518,26 @@ def __init__(self, packpath):
        self._packpath = packpath
 
    def _set_cache_(self, attr):
-       if attr == '_data':
-           self._data = file_contents_ro_filepath(self._packpath)
-
-           # read the header information
-           type_id, self._version, self._size = unpack_from(">LLL", self._data, 0)
-
-           # TODO: figure out whether we should better keep the lock, or maybe
-           # add a .keep file instead ?
-       else: # must be '_size' or '_version'
-           # read header info - we do that just with a file stream
-           type_id, self._version, self._size = unpack(">LLL", open(self._packpath).read(12))
-       # END handle header
+       # we fill the whole cache, whichever attribute gets queried first
+       self._cursor = mman.make_cursor(self._packpath).use_region()
 
+       # read the header information
+       type_id, self._version, self._size = unpack_from(">LLL", self._cursor.map(), 0)
+
+       # TODO: figure out whether we should better keep the lock, or maybe
+       # add a .keep file instead ?
        if type_id != self.pack_signature:
            raise ParseError("Invalid pack signature: %i" % type_id)
-       #END assert type id
 
    def _iter_objects(self, start_offset, as_stream=True):
        """Handle the actual iteration of objects within this pack"""
-       data = self._data
-       content_size = len(data) - self.footer_size
+       c = self._cursor
+       content_size = c.file_size() - self.footer_size
        cur_offset = start_offset or self.first_object_offset
 
        null = NullStream()
        while cur_offset < content_size:
-           data_offset, ostream = pack_object_at(data, cur_offset, True)
+           data_offset, ostream = pack_object_at(c, cur_offset, True)
            # scrub the stream to the end - this decompresses the object, but yields
            # the amount of compressed bytes we need to get to the next offset
 
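
The rewritten `_set_cache_()` always parses the fixed 12 byte pack preamble: a 4 byte signature ('PACK'), a 4 byte version and a 4 byte object count (bound to `self._size` here). A standalone sketch with a fabricated header:

    from struct import pack, unpack_from

    PACK_SIGNATURE = 0x5041434b         # 'PACK'

    header = pack(">LLL", PACK_SIGNATURE, 2, 42)    # version 2, 42 objects

    type_id, version, num_objects = unpack_from(">LLL", header, 0)
    if type_id != PACK_SIGNATURE:
        raise ValueError("Invalid pack signature: %i" % type_id)
    assert (version, num_objects) == (2, 42)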
@@ -567,12 +566,14 @@ def version(self):
    def data(self):
        """
        :return: read-only data of this pack. It provides random access and usually
-           is a memory map"""
-       return self._data
+           is a memory map.
+       :note: This method is unsafe as it returns a window into a file which might be larger than the actual window size"""
+       # can use map as we are starting at offset 0. Otherwise we would have to use buffer()
+       return self._cursor.use_region().map()
 
    def checksum(self):
        """:return: 20 byte sha1 hash on all object sha's contained in this file"""
-       return self._data[-20:]
+       return self._cursor.use_region(self._cursor.file_size() - 20).buffer()[:]
 
    def path(self):
        """:return: path to the packfile"""
@@ -591,8 +592,9 @@ def collect_streams(self, offset):
        If the object at offset is no delta, the size of the list is 1.
        :param offset: specifies the first byte of the object within this pack"""
        out = list()
+       c = self._cursor
        while True:
-           ostream = pack_object_at(self._data, offset, True)[1]
+           ostream = pack_object_at(c, offset, True)[1]
            out.append(ostream)
            if ostream.type_id == OFS_DELTA:
                offset = ostream.pack_offset - ostream.delta_info
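
For reference, this loop walks an OFS_DELTA chain: each delta stream records the distance back to its base, so the next offset is `pack_offset - delta_info`. A toy walk over fabricated offsets:

    OFS_DELTA, BLOB = 6, 3              # git object type ids
    # fabricated pack layout: offset -> (type_id, delta_info)
    objects = {400: (OFS_DELTA, 150), 250: (OFS_DELTA, 238), 12: (BLOB, None)}

    offset, chain = 400, []
    while True:
        type_id, delta_info = objects[offset]
        chain.append(offset)
        if type_id != OFS_DELTA:
            break
        offset -= delta_info            # base lies delta_info bytes earlier
    assert chain == [400, 250, 12]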
@@ -614,14 +616,14 @@ def info(self, offset):
 
        :param offset: byte offset
        :return: OPackInfo instance, the actual type differs depending on the type_id attribute"""
-       return pack_object_at(self._data, offset or self.first_object_offset, False)[1]
+       return pack_object_at(self._cursor, offset or self.first_object_offset, False)[1]
 
    def stream(self, offset):
        """Retrieve an object at the given file-relative offset as stream along with its information
 
        :param offset: byte offset
        :return: OPackStream instance, the actual type differs depending on the type_id attribute"""
-       return pack_object_at(self._data, offset or self.first_object_offset, True)[1]
+       return pack_object_at(self._cursor, offset or self.first_object_offset, True)[1]
 
    def stream_iter(self, start_offset=0):
        """
@@ -704,7 +706,7 @@ def _object(self, sha, as_stream, index=-1):
            sha = self._index.sha(index)
        # END assure sha is present ( in output )
        offset = self._index.offset(index)
-       type_id, uncomp_size, data_rela_offset = pack_object_header_info(buffer(self._pack._data, offset))
+       type_id, uncomp_size, data_rela_offset = pack_object_header_info(self._pack._cursor.use_region(offset).buffer())
        if as_stream:
            if type_id not in delta_types:
                packstream = self._pack.stream(offset)
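
`pack_object_header_info()` itself is unchanged here; it decodes git's variable-length object header from the start of that buffer: the type sits in bits 4-6 of the first byte, the size accumulates in 4+7+7+... bit groups, and the MSB marks continuation. A hedged re-implementation for illustration (not the library's actual function):

    def object_header_info(data):
        # illustrative decoder for git's pack object header
        byte = ord(data[0])
        type_id = (byte >> 4) & 7           # bits 4..6 hold the type
        size = byte & 15                    # low 4 bits start the size
        shift, i = 4, 1
        while byte & 0x80:                  # MSB set -> another size byte follows
            byte = ord(data[i])
            size += (byte & 0x7f) << shift
            shift += 7
            i += 1
        return type_id, size, i             # i: offset where the object data begins

    # fabricated header: a blob (type 3) of 200 bytes
    assert object_header_info('\xb8\x0c') == (3, 200, 2)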