Skip to content

Commit e9a1814

Browse files
author
reasto
committed
Add initial_image as optional parameter on tickstore write() - pandas-dev#98
1 parent 517d402 commit e9a1814

File tree

5 files changed

+144
-77
lines changed

5 files changed

+144
-77
lines changed

arctic/tickstore/tickstore.py

+86-41
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas.core.frame import _arrays_to_mgr
1111
import pymongo
1212
from pymongo.errors import OperationFailure
13+
import copy
1314

1415
from ..date import DateRange, to_pandas_closed_closed, mktz, datetime_to_ms, CLOSED_CLOSED, to_dt
1516
from ..decorators import mongo_retry
@@ -78,8 +79,6 @@
7879

7980
class TickStore(object):
8081

81-
chunk_size = 100000
82-
8382
@classmethod
8483
def initialize_library(cls, arctic_lib, **kwargs):
8584
TickStore(arctic_lib)._ensure_index()
@@ -91,7 +90,16 @@ def _ensure_index(self):
9190
(START, pymongo.ASCENDING)], background=True)
9291
collection.create_index([(START, pymongo.ASCENDING)], background=True)
9392

94-
def __init__(self, arctic_lib):
93+
def __init__(self, arctic_lib, chunk_size=100000):
94+
"""
95+
Parameters
96+
----------
97+
arctic_lib : TickStore
98+
Arctic Library
99+
chunk_size : int
100+
Number of ticks to store in a document before splitting to another document.
101+
if the library was obtained through get_library then set with: self._chunk_size = 10000
102+
"""
95103
self._arctic_lib = arctic_lib
96104

97105
# Do we allow reading from secondaries
@@ -100,6 +108,8 @@ def __init__(self, arctic_lib):
100108
# The default collections
101109
self._collection = arctic_lib.get_top_level_collection()
102110

111+
self._chunk_size = chunk_size
112+
103113
def __getstate__(self):
104114
return {'arctic_lib': self._arctic_lib}
105115

@@ -334,7 +344,7 @@ def _set_or_promote_dtype(self, column_dtypes, c, dtype):
334344

335345
def _prepend_image(self, document, im, rtn_length, column_dtypes, column_set, columns):
336346
image = im[IMAGE]
337-
first_dt = im['t']
347+
first_dt = im[DTYPE]
338348
if not first_dt.tzinfo:
339349
first_dt = first_dt.replace(tzinfo=mktz('UTC'))
340350
document[INDEX] = np.insert(document[INDEX], 0, np.uint64(datetime_to_ms(first_dt)))
@@ -354,7 +364,7 @@ def _prepend_image(self, document, im, rtn_length, column_dtypes, column_set, co
354364
for field in set(document).difference(set(image)):
355365
if field == INDEX:
356366
continue
357-
logger.debug("Field %s is missing from image!", field)
367+
logger.debug("Field %s is missing from image!" % field)
358368
if document[field] is not None:
359369
val = np.nan
360370
document[field] = np.insert(document[field], 0, document[field].dtype.type(val))
@@ -450,16 +460,21 @@ def _assert_nonoverlapping_data(self, symbol, start, end):
450460
raise OverlappingDataException("Document already exists with start:{} end:{} in the range of our start:{} end:{}".format(
451461
doc[START], doc[END], start, end))
452462

453-
def write(self, symbol, data):
463+
def write(self, symbol, data, initial_image=None):
454464
"""
455465
Writes a list of market data events.
456466
457467
Parameters
458468
----------
459469
symbol : `str`
460470
symbol name for the item
461-
data : list of dicts
471+
data : list of dicts or a pandas.DataFrame
462472
List of ticks to store to the tick-store.
473+
if a list of dicts, each dict must contain a 'index' datetime
474+
if a pandas.DataFrame the index must be a Timestamp that can be converted to a datetime
475+
initial_image : dict
476+
Dict of the initial image at the start of the document. If this contains an 'index' entry it is
477+
assumed to be the timestamp of the image
463478
"""
464479
pandas = False
465480
# Check for overlapping data
@@ -475,38 +490,41 @@ def write(self, symbol, data):
475490
self._assert_nonoverlapping_data(symbol, to_dt(start), to_dt(end))
476491

477492
if pandas:
478-
buckets = self._pandas_to_buckets(data, symbol)
493+
buckets = self._pandas_to_buckets(data, symbol, initial_image)
479494
else:
480-
buckets = self._to_buckets(data, symbol)
495+
buckets = self._to_buckets(data, symbol, initial_image)
481496
self._write(buckets)
482497

483498
def _write(self, buckets):
484499
start = dt.now()
485500
mongo_retry(self._collection.insert_many)(buckets)
486501
t = (dt.now() - start).total_seconds()
487-
ticks = len(buckets) * self.chunk_size
488-
print("%d buckets in %s: approx %s ticks/sec" % (len(buckets), t, int(ticks / t)))
502+
ticks = len(buckets) * self._chunk_size
489503

490-
def _pandas_to_buckets(self, x, symbol):
504+
def _pandas_to_buckets(self, x, symbol, initial_image):
491505
rtn = []
492-
for i in range(0, len(x), self.chunk_size):
493-
rtn.append(self._pandas_to_bucket(x[i:i + self.chunk_size], symbol))
506+
for i in range(0, len(x), self._chunk_size):
507+
bucket, initial_image = TickStore._pandas_to_bucket(x[i:i + self._chunk_size], symbol, initial_image)
508+
rtn.append(bucket)
494509
return rtn
495510

496-
def _to_buckets(self, x, symbol):
511+
def _to_buckets(self, x, symbol, initial_image):
497512
rtn = []
498-
for i in range(0, len(x), self.chunk_size):
499-
rtn.append(self._to_bucket(x[i:i + self.chunk_size], symbol))
513+
for i in range(0, len(x), self._chunk_size):
514+
bucket, initial_image = TickStore._to_bucket(x[i:i + self._chunk_size], symbol, initial_image)
515+
rtn.append(bucket)
500516
return rtn
501517

502-
def _to_ms(self, date):
518+
@staticmethod
519+
def _to_ms(date):
503520
if isinstance(date, dt):
504521
if not date.tzinfo:
505-
logger.warning('WARNING: treating naive datetime as London in write path')
522+
logger.warning('WARNING: treating naive datetime as UTC in write path')
506523
return datetime_to_ms(date)
507524
return date
508525

509-
def _str_dtype(self, dtype):
526+
@staticmethod
527+
def _str_dtype(dtype):
510528
"""
511529
Represent dtypes without byte order, as earlier Java tickstore code doesn't support explicit byte order.
512530
"""
@@ -522,8 +540,8 @@ def _str_dtype(self, dtype):
522540
else:
523541
raise UnhandledDtypeException("Bad dtype '%s'" % dtype)
524542

525-
526-
def _ensure_supported_dtypes(self, array):
543+
@staticmethod
544+
def _ensure_supported_dtypes(array):
527545
# We only support these types for now, as we need to read them in Java
528546
if (array.dtype.kind) == 'i':
529547
array = array.astype('<i8')
@@ -538,42 +556,68 @@ def _ensure_supported_dtypes(self, array):
538556
array = array.astype(array.dtype.newbyteorder('<'))
539557
return array
540558

541-
def _pandas_to_bucket(self, df, symbol):
542-
start = to_dt(df.index[0].to_datetime())
559+
@staticmethod
560+
def _pandas_compute_final_image(df, image, end):
561+
# Compute the final image with forward fill of df applied to the image
562+
final_image = copy.copy(image)
563+
last_values = df.ffill().tail(1).to_dict()
564+
last_dict = {i: a.values()[0] for i, a in last_values.items()}
565+
final_image.update(last_dict)
566+
final_image['index'] = end
567+
return final_image
568+
569+
@staticmethod
570+
def _pandas_to_bucket(df, symbol, initial_image):
571+
rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(df)}
543572
end = to_dt(df.index[-1].to_datetime())
544-
rtn = {START: start, END: end, SYMBOL: symbol}
545-
rtn[VERSION] = CHUNK_VERSION_NUMBER
546-
rtn[COUNT] = len(df)
547-
rtn[COLUMNS] = {}
573+
if initial_image :
574+
if 'index' in initial_image:
575+
start = min(to_dt(df.index[0].to_datetime()), initial_image['index'])
576+
else:
577+
start = to_dt(df.index[0].to_datetime())
578+
image_start = initial_image.get('index', start)
579+
image = {k: v for k, v in initial_image.items() if k != 'index'}
580+
rtn[IMAGE_DOC] = {DTYPE: image_start, START: 0, IMAGE: initial_image}
581+
final_image = TickStore._pandas_compute_final_image(df, initial_image, end)
582+
else:
583+
start = to_dt(df.index[0].to_datetime())
584+
final_image = {}
585+
rtn[END] = end
586+
rtn[START] = start
548587

549588
logger.warning("NB treating all values as 'exists' - no longer sparse")
550589
rowmask = Binary(lz4.compressHC(np.packbits(np.ones(len(df), dtype='uint8'))))
551590

552591
recs = df.to_records(convert_datetime64=False)
553592
for col in df:
554-
array = self._ensure_supported_dtypes(recs[col])
593+
array = TickStore._ensure_supported_dtypes(recs[col])
555594
col_data = {}
556595
col_data[DATA] = Binary(lz4.compressHC(array.tostring()))
557596
col_data[ROWMASK] = rowmask
558-
col_data[DTYPE] = self._str_dtype(array.dtype)
597+
col_data[DTYPE] = TickStore._str_dtype(array.dtype)
559598
rtn[COLUMNS][col] = col_data
560599
rtn[INDEX] = Binary(lz4.compressHC(np.concatenate(([recs['index'][0].astype('datetime64[ms]').view('uint64')],
561600
np.diff(recs['index'].astype('datetime64[ms]').view('uint64')))
562601
).tostring()))
563-
return rtn
602+
return rtn, final_image
564603

565-
def _to_bucket(self, ticks, symbol):
604+
@staticmethod
605+
def _to_bucket(ticks, symbol, initial_image):
606+
rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(ticks)}
566607
data = {}
567608
rowmask = {}
568609
start = to_dt(ticks[0]['index'])
569610
end = to_dt(ticks[-1]['index'])
611+
final_image = copy.copy(initial_image) if initial_image else {}
570612
for i, t in enumerate(ticks):
613+
if initial_image:
614+
final_image.update(t)
571615
for k, v in iteritems(t):
572616
try:
573617
if k != 'index':
574618
rowmask[k][i] = 1
575619
else:
576-
v = self._to_ms(v)
620+
v = TickStore._to_ms(v)
577621
data[k].append(v)
578622
except KeyError:
579623
if k != 'index':
@@ -583,21 +627,22 @@ def _to_bucket(self, ticks, symbol):
583627

584628
rowmask = dict([(k, Binary(lz4.compressHC(np.packbits(v).tostring())))
585629
for k, v in iteritems(rowmask)])
586-
587-
rtn = {START: start, END: end, SYMBOL: symbol}
588-
rtn[VERSION] = CHUNK_VERSION_NUMBER
589-
rtn[COUNT] = len(ticks)
590-
rtn[COLUMNS] = {}
591630
for k, v in iteritems(data):
592631
if k != 'index':
593632
v = np.array(v)
594-
v = self._ensure_supported_dtypes(v)
633+
v = TickStore._ensure_supported_dtypes(v)
595634
rtn[COLUMNS][k] = {DATA: Binary(lz4.compressHC(v.tostring())),
596-
DTYPE: self._str_dtype(v.dtype),
635+
DTYPE: TickStore._str_dtype(v.dtype),
597636
ROWMASK: rowmask[k]}
598637

638+
if initial_image:
639+
image_start = initial_image.get('index', start)
640+
start = min(start, image_start)
641+
rtn[IMAGE_DOC] = {DTYPE: image_start, START: 0, IMAGE: final_image}
642+
rtn[END] = end
643+
rtn[START] = start
599644
rtn[INDEX] = Binary(lz4.compressHC(np.concatenate(([data['index'][0]], np.diff(data['index']))).tostring()))
600-
return rtn
645+
return rtn, final_image
601646

602647
def max_date(self, symbol):
603648
"""

tests/integration/tickstore/test_ts_delete.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def test_delete(tickstore_lib):
2121
'index': dt(2013, 1, 30, tzinfo=mktz('Europe/London'))
2222
},
2323
]
24-
tickstore_lib.chunk_size = 1
24+
tickstore_lib._chunk_size = 1
2525
tickstore_lib.write('SYM', DUMMY_DATA)
2626
tickstore_lib.delete('SYM')
2727
with pytest.raises(NoDataFoundException):
@@ -45,7 +45,7 @@ def test_delete_daterange(tickstore_lib):
4545
'index': dt(2013, 2, 1, tzinfo=mktz('Europe/London'))
4646
},
4747
]
48-
tickstore_lib.chunk_size = 1
48+
tickstore_lib._chunk_size = 1
4949
tickstore_lib.write('SYM', DUMMY_DATA)
5050

5151
# Delete with a date-range

tests/integration/tickstore/test_ts_read.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def test_read_all_cols_all_dtypes(tickstore_lib, chunk_size):
112112
'index': dt(1970, 1, 1, 0, 0, 1, tzinfo=mktz('UTC')),
113113
},
114114
]
115-
tickstore_lib.chunk_size = 3
115+
tickstore_lib._chunk_size = chunk_size
116116
tickstore_lib.write('sym', data)
117117
df = tickstore_lib.read('sym', columns=None)
118118

@@ -167,7 +167,7 @@ def test_date_range(tickstore_lib):
167167
tickstore_lib.delete('SYM')
168168

169169
# Chunk every 3 symbols and lets have some fun
170-
tickstore_lib.chunk_size = 3
170+
tickstore_lib._chunk_size = 3
171171
tickstore_lib.write('SYM', DUMMY_DATA)
172172

173173
with patch.object(tickstore_lib._collection, 'find', side_effect=tickstore_lib._collection.find) as f:
@@ -222,7 +222,7 @@ def test_date_range_end_not_in_range(tickstore_lib):
222222
},
223223
]
224224

225-
tickstore_lib.chunk_size = 1
225+
tickstore_lib._chunk_size = 1
226226
tickstore_lib.write('SYM', DUMMY_DATA)
227227
with patch.object(tickstore_lib._collection, 'find', side_effect=tickstore_lib._collection.find) as f:
228228
df = tickstore_lib.read('SYM', date_range=DateRange(20130101, dt(2013, 1, 2, 9, 0)), columns=None)
@@ -251,7 +251,7 @@ def test_date_range_default_timezone(tickstore_lib, tz_name):
251251
]
252252

253253
with patch('arctic.date._mktz.DEFAULT_TIME_ZONE_NAME', tz_name):
254-
tickstore_lib.chunk_size = 1
254+
tickstore_lib._chunk_size = 1
255255
tickstore_lib.write('SYM', DUMMY_DATA)
256256
df = tickstore_lib.read('SYM', date_range=DateRange(20130101, 20130701), columns=None)
257257
assert len(df) == 2
@@ -281,7 +281,7 @@ def test_date_range_no_bounds(tickstore_lib):
281281
},
282282
]
283283

284-
tickstore_lib.chunk_size = 1
284+
tickstore_lib._chunk_size = 1
285285
tickstore_lib.write('SYM', DUMMY_DATA)
286286

287287
# 1) No start, no end
@@ -315,7 +315,7 @@ def test_date_range_BST(tickstore_lib):
315315
'index': dt(2013, 6, 1, 13, 00, tzinfo=mktz('Europe/London'))
316316
},
317317
]
318-
tickstore_lib.chunk_size = 1
318+
tickstore_lib._chunk_size = 1
319319
tickstore_lib.write('SYM', DUMMY_DATA)
320320

321321
df = tickstore_lib.read('SYM', columns=None)
@@ -363,7 +363,7 @@ def test_read_out_of_order(tickstore_lib):
363363
'index': dt(2013, 6, 1, 13, 00, tzinfo=mktz('UTC'))
364364
},
365365
]
366-
tickstore_lib.chunk_size = 3
366+
tickstore_lib._chunk_size = 3
367367
tickstore_lib.write('SYM', DUMMY_DATA)
368368
tickstore_lib.read('SYM', columns=None)
369369
assert len(tickstore_lib.read('SYM', columns=None, date_range=DateRange(dt(2013, 6, 1, tzinfo=mktz('UTC')), dt(2013, 6, 2, tzinfo=mktz('UTC'))))) == 3
@@ -380,7 +380,7 @@ def test_read_longs(tickstore_lib):
380380
'index': dt(2013, 6, 1, 13, 00, tzinfo=mktz('Europe/London'))
381381
},
382382
]
383-
tickstore_lib.chunk_size = 3
383+
tickstore_lib._chunk_size = 3
384384
tickstore_lib.write('SYM', DUMMY_DATA)
385385
tickstore_lib.read('SYM', columns=None)
386386
read = tickstore_lib.read('SYM', columns=None, date_range=DateRange(dt(2013, 6, 1), dt(2013, 6, 2)))

tests/integration/tickstore/test_ts_write.py

-25
Original file line numberDiff line numberDiff line change
@@ -79,28 +79,3 @@ def test_ts_write_pandas(tickstore_lib):
7979

8080
read = tickstore_lib.read('SYM', columns=None)
8181
assert_frame_equal(read, data, check_names=False)
82-
83-
84-
def test_to_bucket(tickstore_lib):
85-
bucket = tickstore_lib._to_bucket(DUMMY_DATA, 'SYM')
86-
assert bucket[SYMBOL] == 'SYM'
87-
assert bucket[START] == dt(2013, 1, 1, tzinfo=mktz('Europe/London'))
88-
assert bucket[END] == dt(2013, 7, 5, tzinfo=mktz('Europe/London'))
89-
assert bucket[COUNT] == 5
90-
91-
92-
def test_pandas_to_bucket(tickstore_lib):
93-
df = read_str_as_pandas(""" index | near
94-
2012-09-08 17:06:11 | 1.0
95-
2012-10-08 17:06:11 | 2.0
96-
2012-10-09 17:06:11 | 2.5
97-
2012-11-08 17:06:11 | 3.0""")
98-
df = df.tz_localize('UTC')
99-
bucket = tickstore_lib._pandas_to_bucket(df, 'SYM')
100-
assert bucket[SYMBOL] == 'SYM'
101-
assert bucket[START] == dt(2012, 9, 8, 17, 6, 11, tzinfo=mktz('UTC'))
102-
assert bucket[END] == dt(2012, 11, 8, 17, 6, 11, tzinfo=mktz('UTC'))
103-
assert bucket[COUNT] == 4
104-
assert len(bucket[COLUMNS]) == 1
105-
assert 'near' in bucket[COLUMNS]
106-

0 commit comments

Comments
 (0)