10
10
from pandas .core .frame import _arrays_to_mgr
11
11
import pymongo
12
12
from pymongo .errors import OperationFailure
13
+ import copy
13
14
14
15
from ..date import DateRange , to_pandas_closed_closed , mktz , datetime_to_ms , CLOSED_CLOSED , to_dt
15
16
from ..decorators import mongo_retry
68
69
# Single-character field names keep the stored BSON documents compact.
COLUMNS = 'cs'    # sub-document mapping column name -> column data
DATA = 'd'        # lz4-compressed packed values for one column
DTYPE = 't'       # dtype string describing the column's binary layout
IMAGE_TIME = 't'  # timestamp of the initial image (key inside the image doc)
ROWMASK = 'm'     # packed bitmask of rows where the column has a value

COUNT = 'c'       # number of ticks serialized into the document
78
80
79
81
class TickStore (object ):
80
82
81
- chunk_size = 100000
82
-
83
83
@classmethod
def initialize_library(cls, arctic_lib, **kwargs):
    """Prepare a freshly-created library: build the indexes queries rely on."""
    store = TickStore(arctic_lib)
    store._ensure_index()
@@ -91,7 +91,16 @@ def _ensure_index(self):
91
91
(START , pymongo .ASCENDING )], background = True )
92
92
collection .create_index ([(START , pymongo .ASCENDING )], background = True )
93
93
94
- def __init__ (self , arctic_lib ):
94
+ def __init__ (self , arctic_lib , chunk_size = 100000 ):
95
+ """
96
+ Parameters
97
+ ----------
98
+ arctic_lib : TickStore
99
+ Arctic Library
100
+ chunk_size : int
101
+ Number of ticks to store in a document before splitting to another document.
102
+ if the library was obtained through get_library then set with: self._chuck_size = 10000
103
+ """
95
104
self ._arctic_lib = arctic_lib
96
105
97
106
# Do we allow reading from secondaries
@@ -100,6 +109,8 @@ def __init__(self, arctic_lib):
100
109
# The default collections
101
110
self ._collection = arctic_lib .get_top_level_collection ()
102
111
112
+ self ._chunk_size = chunk_size
113
+
103
114
def __getstate__ (self ):
104
115
return {'arctic_lib' : self ._arctic_lib }
105
116
@@ -334,7 +345,7 @@ def _set_or_promote_dtype(self, column_dtypes, c, dtype):
334
345
335
346
def _prepend_image (self , document , im , rtn_length , column_dtypes , column_set , columns ):
336
347
image = im [IMAGE ]
337
- first_dt = im ['t' ]
348
+ first_dt = im [IMAGE_TIME ]
338
349
if not first_dt .tzinfo :
339
350
first_dt = first_dt .replace (tzinfo = mktz ('UTC' ))
340
351
document [INDEX ] = np .insert (document [INDEX ], 0 , np .uint64 (datetime_to_ms (first_dt )))
@@ -354,7 +365,7 @@ def _prepend_image(self, document, im, rtn_length, column_dtypes, column_set, co
354
365
for field in set (document ).difference (set (image )):
355
366
if field == INDEX :
356
367
continue
357
- logger .debug ("Field %s is missing from image!" , field )
368
+ logger .debug ("Field %s is missing from image!" % field )
358
369
if document [field ] is not None :
359
370
val = np .nan
360
371
document [field ] = np .insert (document [field ], 0 , document [field ].dtype .type (val ))
@@ -450,16 +461,21 @@ def _assert_nonoverlapping_data(self, symbol, start, end):
450
461
raise OverlappingDataException ("Document already exists with start:{} end:{} in the range of our start:{} end:{}" .format (
451
462
doc [START ], doc [END ], start , end ))
452
463
453
- def write (self , symbol , data ):
464
+ def write (self , symbol , data , initial_image = None ):
454
465
"""
455
466
Writes a list of market data events.
456
467
457
468
Parameters
458
469
----------
459
470
symbol : `str`
460
471
symbol name for the item
461
- data : list of dicts
472
+ data : list of dicts or a pandas.DataFrame
462
473
List of ticks to store to the tick-store.
474
+ if a list of dicts, each dict must contain a 'index' datetime
475
+ if a pandas.DataFrame the index must be a Timestamp that can be converted to a datetime
476
+ initial_image : dict
477
+ Dict of the initial image at the start of the document. If this contains a 'index' entry it is
478
+ assumed to be the time of the timestamp of the index
463
479
"""
464
480
pandas = False
465
481
# Check for overlapping data
@@ -475,38 +491,42 @@ def write(self, symbol, data):
475
491
self ._assert_nonoverlapping_data (symbol , to_dt (start ), to_dt (end ))
476
492
477
493
if pandas :
478
- buckets = self ._pandas_to_buckets (data , symbol )
494
+ buckets = self ._pandas_to_buckets (data , symbol , initial_image )
479
495
else :
480
- buckets = self ._to_buckets (data , symbol )
496
+ buckets = self ._to_buckets (data , symbol , initial_image )
481
497
self ._write (buckets )
482
498
483
499
def _write(self, buckets):
    """Insert the bucket documents into the collection and log throughput.

    Parameters
    ----------
    buckets : list of dict
        Bucket documents produced by `_to_buckets` / `_pandas_to_buckets`.
    """
    start = dt.now()
    mongo_retry(self._collection.insert_many)(buckets)
    t = (dt.now() - start).total_seconds()
    # Approximation: assumes every bucket is full (the last one usually isn't).
    ticks = len(buckets) * self._chunk_size
    # Guard against a zero-duration clock reading (coarse timers can return
    # 0.0 for tiny writes), which previously raised ZeroDivisionError.
    rate = int(ticks / t) if t > 0 else float('inf')
    # Lazy logger args: the message is only formatted if DEBUG is enabled.
    logger.debug("%d buckets in %s: approx %s ticks/sec", len(buckets), t, rate)
489
505
490
- def _pandas_to_buckets (self , x , symbol ):
506
+ def _pandas_to_buckets (self , x , symbol , initial_image ):
491
507
rtn = []
492
- for i in range (0 , len (x ), self .chunk_size ):
493
- rtn .append (self ._pandas_to_bucket (x [i :i + self .chunk_size ], symbol ))
508
+ for i in range (0 , len (x ), self ._chunk_size ):
509
+ bucket , initial_image = TickStore ._pandas_to_bucket (x [i :i + self ._chunk_size ], symbol , initial_image )
510
+ rtn .append (bucket )
494
511
return rtn
495
512
496
- def _to_buckets (self , x , symbol ):
513
+ def _to_buckets (self , x , symbol , initial_image ):
497
514
rtn = []
498
- for i in range (0 , len (x ), self .chunk_size ):
499
- rtn .append (self ._to_bucket (x [i :i + self .chunk_size ], symbol ))
515
+ for i in range (0 , len (x ), self ._chunk_size ):
516
+ bucket , initial_image = TickStore ._to_bucket (x [i :i + self ._chunk_size ], symbol , initial_image )
517
+ rtn .append (bucket )
500
518
return rtn
501
519
502
- def _to_ms (self , date ):
520
+ @staticmethod
521
+ def _to_ms (date ):
503
522
if isinstance (date , dt ):
504
523
if not date .tzinfo :
505
- logger .warning ('WARNING: treating naive datetime as London in write path' )
524
+ logger .warning ('WARNING: treating naive datetime as UTC in write path' )
506
525
return datetime_to_ms (date )
507
526
return date
508
527
509
- def _str_dtype (self , dtype ):
528
+ @staticmethod
529
+ def _str_dtype (dtype ):
510
530
"""
511
531
Represent dtypes without byte order, as earlier Java tickstore code doesn't support explicit byte order.
512
532
"""
@@ -522,8 +542,8 @@ def _str_dtype(self, dtype):
522
542
else :
523
543
raise UnhandledDtypeException ("Bad dtype '%s'" % dtype )
524
544
525
-
526
- def _ensure_supported_dtypes (self , array ):
545
+ @ staticmethod
546
+ def _ensure_supported_dtypes (array ):
527
547
# We only support these types for now, as we need to read them in Java
528
548
if (array .dtype .kind ) == 'i' :
529
549
array = array .astype ('<i8' )
@@ -538,42 +558,68 @@ def _ensure_supported_dtypes(self, array):
538
558
array = array .astype (array .dtype .newbyteorder ('<' ))
539
559
return array
540
560
541
- def _pandas_to_bucket (self , df , symbol ):
542
- start = to_dt (df .index [0 ].to_datetime ())
561
+ @staticmethod
562
+ def _pandas_compute_final_image (df , image , end ):
563
+ # Compute the final image with forward fill of df applied to the image
564
+ final_image = copy .copy (image )
565
+ last_values = df .ffill ().tail (1 ).to_dict ()
566
+ last_dict = {i : list (a .values ())[0 ] for i , a in last_values .items ()}
567
+ final_image .update (last_dict )
568
+ final_image ['index' ] = end
569
+ return final_image
570
+
571
@staticmethod
def _pandas_to_bucket(df, symbol, initial_image):
    """Serialize one chunk of DataFrame `df` into a bucket document.

    Parameters
    ----------
    df : pandas.DataFrame
        Tick data for this chunk; the index supplies the timestamps.
    symbol : str
        Symbol the bucket is stored under.
    initial_image : dict or None
        State of the world at the start of the chunk.  If it contains an
        'index' key, that is taken as the image's timestamp.  The image is
        stored as-is (consistent with `_to_bucket`).

    Returns
    -------
    (rtn, final_image) : the document ready for insertion, and the image
    to seed the next chunk with (empty dict when no image was supplied).
    """
    rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(df)}
    end = to_dt(df.index[-1].to_datetime())
    start = to_dt(df.index[0].to_datetime())
    if initial_image:
        # The image may carry its own timestamp, which can pre-date the data;
        # the bucket's START must cover it.  (Removed a dead local that built
        # an 'index'-stripped copy of the image but never used it.)
        image_start = initial_image.get('index', start)
        start = min(start, image_start)
        rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
        final_image = TickStore._pandas_compute_final_image(df, initial_image, end)
    else:
        final_image = {}
    rtn[END] = end
    rtn[START] = start

    logger.warning("NB treating all values as 'exists' - no longer sparse")
    # Every row is marked present for every column.
    rowmask = Binary(lz4.compressHC(np.packbits(np.ones(len(df), dtype='uint8'))))

    recs = df.to_records(convert_datetime64=False)
    for col in df:
        array = TickStore._ensure_supported_dtypes(recs[col])
        col_data = {DATA: Binary(lz4.compressHC(array.tostring())),
                    ROWMASK: rowmask,
                    DTYPE: TickStore._str_dtype(array.dtype)}
        rtn[COLUMNS][col] = col_data
    # Index is delta-encoded ms-since-epoch: first value absolute, rest diffs.
    rtn[INDEX] = Binary(lz4.compressHC(np.concatenate(
        ([recs['index'][0].astype('datetime64[ms]').view('uint64')],
         np.diff(recs['index'].astype('datetime64[ms]').view('uint64')))).tostring()))
    return rtn, final_image
564
605
565
- def _to_bucket (self , ticks , symbol ):
606
+ @staticmethod
607
+ def _to_bucket (ticks , symbol , initial_image ):
608
+ rtn = {SYMBOL : symbol , VERSION : CHUNK_VERSION_NUMBER , COLUMNS : {}, COUNT : len (ticks )}
566
609
data = {}
567
610
rowmask = {}
568
611
start = to_dt (ticks [0 ]['index' ])
569
612
end = to_dt (ticks [- 1 ]['index' ])
613
+ final_image = copy .copy (initial_image ) if initial_image else {}
570
614
for i , t in enumerate (ticks ):
615
+ if initial_image :
616
+ final_image .update (t )
571
617
for k , v in iteritems (t ):
572
618
try :
573
619
if k != 'index' :
574
620
rowmask [k ][i ] = 1
575
621
else :
576
- v = self ._to_ms (v )
622
+ v = TickStore ._to_ms (v )
577
623
data [k ].append (v )
578
624
except KeyError :
579
625
if k != 'index' :
@@ -583,21 +629,22 @@ def _to_bucket(self, ticks, symbol):
583
629
584
630
rowmask = dict ([(k , Binary (lz4 .compressHC (np .packbits (v ).tostring ())))
585
631
for k , v in iteritems (rowmask )])
586
-
587
- rtn = {START : start , END : end , SYMBOL : symbol }
588
- rtn [VERSION ] = CHUNK_VERSION_NUMBER
589
- rtn [COUNT ] = len (ticks )
590
- rtn [COLUMNS ] = {}
591
632
for k , v in iteritems (data ):
592
633
if k != 'index' :
593
634
v = np .array (v )
594
- v = self ._ensure_supported_dtypes (v )
635
+ v = TickStore ._ensure_supported_dtypes (v )
595
636
rtn [COLUMNS ][k ] = {DATA : Binary (lz4 .compressHC (v .tostring ())),
596
- DTYPE : self ._str_dtype (v .dtype ),
637
+ DTYPE : TickStore ._str_dtype (v .dtype ),
597
638
ROWMASK : rowmask [k ]}
598
639
640
+ if initial_image :
641
+ image_start = initial_image .get ('index' , start )
642
+ start = min (start , image_start )
643
+ rtn [IMAGE_DOC ] = {IMAGE_TIME : image_start , IMAGE : initial_image }
644
+ rtn [END ] = end
645
+ rtn [START ] = start
599
646
rtn [INDEX ] = Binary (lz4 .compressHC (np .concatenate (([data ['index' ][0 ]], np .diff (data ['index' ]))).tostring ()))
600
- return rtn
647
+ return rtn , final_image
601
648
602
649
def max_date (self , symbol ):
603
650
"""
0 commit comments