Skip to content

Commit b273a34

Browse files
committed
update based on code review comments
1 parent 79a55b2 commit b273a34

File tree

2 files changed

+36
-32
lines changed

2 files changed

+36
-32
lines changed

arctic/chunkstore/chunkstore.py

+36-30
Original file line numberDiff line numberDiff line change
@@ -32,22 +32,20 @@ def initialize_library(cls, arctic_lib, **kwargs):
3232

3333
@mongo_retry
3434
def _ensure_index(self):
35-
collection = self._collection.symbols
36-
collection.create_index([("symbol", pymongo.ASCENDING),
37-
("_id", pymongo.DESCENDING)],
38-
background=True)
39-
40-
collection = self._collection
41-
collection.create_index([('symbol', pymongo.HASHED)], background=True)
42-
collection.create_index([('symbol', pymongo.ASCENDING),
43-
('sha', pymongo.ASCENDING)],
44-
unique=True,
45-
background=True)
46-
collection.create_index([('symbol', pymongo.ASCENDING),
47-
('parent', pymongo.ASCENDING),
48-
('start', pymongo.ASCENDING),
49-
('end', pymongo.ASCENDING)],
50-
unique=True, background=True)
35+
self._symbols.create_index([("symbol", pymongo.ASCENDING)],
36+
unique=True,
37+
background=True)
38+
39+
self._collection.create_index([('symbol', pymongo.HASHED)],
40+
background=True)
41+
self._collection.create_index([('symbol', pymongo.ASCENDING),
42+
('sha', pymongo.ASCENDING)],
43+
unique=True,
44+
background=True)
45+
self._collection.create_index([('symbol', pymongo.ASCENDING),
46+
('start', pymongo.ASCENDING),
47+
('end', pymongo.ASCENDING)],
48+
unique=True, background=True)
5149

5250
@mongo_retry
5351
def __init__(self, arctic_lib, chunker=DateChunker()):
@@ -123,7 +121,6 @@ def read(self, symbol, chunk_range=None):
123121
raise NoDataFoundException('No data found for %s in library %s' % (symbol, self._collection.get_name()))
124122

125123
spec = {'symbol': symbol,
126-
'parent': sym['_id'],
127124
}
128125

129126
if chunk_range is not None:
@@ -159,7 +156,7 @@ def write(self, symbol, item, chunk_size):
159156
A chunk size that is understood by the specified chunker
160157
"""
161158

162-
doc = {'_id': bson.ObjectId()}
159+
doc = {}
163160
doc['symbol'] = symbol
164161
doc['chunk_size'] = chunk_size
165162

@@ -170,6 +167,12 @@ def write(self, symbol, item, chunk_size):
170167
else:
171168
raise Exception("Can only chunk Series and DataFrames")
172169

170+
previous_shas = []
171+
if self._get_symbol_info(symbol):
172+
previous_shas = set([x['sha'] for x in self._collection.find({'symbol': symbol},
173+
projection={'sha': True, '_id': False},
174+
)])
175+
173176
records = []
174177
ranges = []
175178
dtype = None
@@ -184,12 +187,6 @@ def write(self, symbol, item, chunk_size):
184187
if record.dtype.hasobject:
185188
raise UnhandledDtypeException()
186189

187-
sym = self._get_symbol_info(symbol)
188-
if sym:
189-
# if the symbol already exists, we are basically overwriting it
190-
# clean up the data before we orphan the symbol chunks
191-
self.delete(symbol)
192-
193190
doc['dtype'] = str(dtype)
194191
doc['shape'] = (-1,) + item.shape[1:]
195192
doc['dtype_metadata'] = dict(dtype.metadata or {})
@@ -210,16 +207,25 @@ def write(self, symbol, item, chunk_size):
210207
chunk['end'] = end
211208
chunk['symbol'] = symbol
212209
chunk['sha'] = checksum(symbol, chunk)
213-
bulk.find({'symbol': symbol, 'sha': chunk['sha'], 'start': chunk['start']}
214-
).upsert().update_one({'$set': chunk, '$addToSet': {'parent': doc['_id']}})
210+
if chunk['sha'] not in previous_shas:
211+
bulk.find({'symbol': symbol, 'sha': chunk['sha']},
212+
).upsert().update_one({'$set': chunk})
213+
else:
214+
# already exists, don't need to update in mongo
215+
previous_shas = previous_shas.remove(chunk['sha'])
215216
if seg_count != 0:
216217
bulk.execute()
217218

218219
doc['chunk_count'] = seg_count
219220
doc['append_size'] = 0
220221
doc['append_count'] = 0
221222

222-
mongo_retry(self._symbols.insert_one)(doc)
223+
if previous_shas:
224+
mongo_retry(self._collection.delete_many)({'sha': {'$in': list(previous_shas)}})
225+
226+
mongo_retry(self._symbols.update_one)({'symbol': symbol},
227+
{'$set': doc},
228+
upsert=True)
223229

224230
def append(self, symbol, item):
225231
"""
@@ -294,7 +300,7 @@ def append(self, symbol, item):
294300
segment['start'] = start
295301
segment['end'] = end
296302
self._collection.update_one({'symbol': symbol, 'sha': checksum(symbol, segment)},
297-
{'$set': segment, '$addToSet': {'parent': sym['_id']}},
303+
{'$set': segment},
298304
upsert=True)
299305

300306
self._symbols.replace_one({'symbol': symbol}, sym)
@@ -362,10 +368,10 @@ def update(self, symbol, item):
362368
if orig_start is None:
363369
# new chunk
364370
bulk.find({'symbol': symbol, 'sha': sha, 'start': segment['start']}
365-
).upsert().update_one({'$set': segment, '$addToSet': {'parent': sym['_id']}})
371+
).upsert().update_one({'$set': segment})
366372
else:
367373
bulk.find({'symbol': symbol, 'start': orig_start}
368-
).update_one({'$set': segment, '$addToSet': {'parent': sym['_id']}})
374+
).update_one({'$set': segment})
369375
if len(chunks) > 0:
370376
bulk.execute()
371377

tests/integration/chunkstore/test_chunkstore.py

-2
Original file line numberDiff line numberDiff line change
@@ -579,5 +579,3 @@ def helper(chunkstore_lib, chunk_size, name, df, append):
579579

580580
for chunk_size in ['D', 'M', 'Y']:
581581
helper(chunkstore_lib, chunk_size, 'test_monthly_' + chunk_size, df, append)
582-
583-

0 commit comments

Comments
 (0)