Commit 9611531

Fix issue pandas-dev#292 - Metadata size not accounted for in ChunkStore size chunking
When ChunkStore chunks data for size (due to MongoDB's 16MB document limit), it was not including the size of the metadata in the size of the document. In general this is fine because the metadata is very small, but it can be quite large when array masks are included in the metadata (which happens when there are columns of strings with missing values).
1 parent 75c9c84 commit 9611531
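For context (this is not part of the commit), a minimal sketch of the arithmetic the fix changes; MAX_CHUNK_SIZE, data, and metadata below are assumed stand-ins for the values ChunkStore works with, not its actual API:

import bson

MAX_CHUNK_SIZE = 15 * 1024 * 1024  # assumed cap, kept below MongoDB's 16MB document limit

def chunk_count_before(data, metadata):
    # pre-fix: metadata size ignored, so a full-size data slice plus a
    # large metadata dict could push the final BSON document past the limit
    return int(len(data) / MAX_CHUNK_SIZE + 1)

def chunk_count_after(data, metadata):
    # post-fix: reserve room for the BSON-encoded metadata in every slice
    metadata_len = len(bson.BSON.encode(metadata))
    return int(len(data) / (MAX_CHUNK_SIZE - metadata_len) + 1)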

File tree

3 files changed: +15 -10 lines


CHANGES.md (+1)

@@ -2,6 +2,7 @@
 
 ### 1.34
 
+  * Bugfix: #292 Account for metadata size during size chunking in ChunkStore
   * Feature: #283 Support for all pandas frequency strings in ChunkStore DateChunker
   * Feature: #286 Add has_symbol to ChunkStore and support for partial symbol matching in list_symbols
 

arctic/chunkstore/chunkstore.py (+13 -7)

@@ -1,6 +1,7 @@
 import logging
 import pymongo
 import hashlib
+import bson
 
 from bson.binary import Binary
 from pandas import DataFrame, Series

@@ -283,10 +284,13 @@ def write(self, symbol, item, chunker=DateChunker(), **kwargs):
         doc[METADATA] = {'columns': data[METADATA][COLUMNS] if COLUMNS in data[METADATA] else ''}
         doc[CHUNK_SIZE] = chunk_size
 
-        size_chunked = len(data[DATA]) > MAX_CHUNK_SIZE
-        for i in xrange(int(len(data[DATA]) / MAX_CHUNK_SIZE + 1)):
-            chunk = {DATA: Binary(data[DATA][i * MAX_CHUNK_SIZE: (i + 1) * MAX_CHUNK_SIZE])}
-            chunk[METADATA] = data[METADATA]
+        metadata_len = len(bson.BSON.encode(data[METADATA]))
+
+        size_chunked = len(data[DATA]) + metadata_len > MAX_CHUNK_SIZE
+        for i in xrange(int(len(data[DATA]) / (MAX_CHUNK_SIZE - metadata_len) + 1)):
+            chunk = {DATA: Binary(data[DATA][i * (MAX_CHUNK_SIZE - metadata_len): (i + 1) * (MAX_CHUNK_SIZE - metadata_len)])}
+            if i is 0:
+                chunk[METADATA] = data[METADATA]
             if size_chunked:
                 chunk[SEGMENT] = i
             else:

@@ -363,7 +367,8 @@ def __update(self, sym, item, combine_method=None, chunk_range=None):
 
         # remove old segments for this chunk in case we now have less
         # segments than we did before
-        chunk_count = int(len(data[DATA]) / MAX_CHUNK_SIZE + 1)
+        metadata_len = len(bson.BSON.encode(data[METADATA]))
+        chunk_count = int(len(data[DATA]) / (MAX_CHUNK_SIZE - metadata_len) + 1)
         seg_count = self._collection.count({SYMBOL: symbol, START: start, END: end})
         if seg_count > chunk_count:
             # if chunk count is 1, the segment id will be -1, not 1

@@ -374,8 +379,9 @@ def __update(self, sym, item, combine_method=None, chunk_range=None):
 
         size_chunked = chunk_count > 1
         for i in xrange(chunk_count):
-            chunk = {DATA: Binary(data[DATA][i * MAX_CHUNK_SIZE: (i + 1) * MAX_CHUNK_SIZE])}
-            chunk[METADATA] = data[METADATA]
+            chunk = {DATA: Binary(data[DATA][i * (MAX_CHUNK_SIZE - metadata_len): (i + 1) * (MAX_CHUNK_SIZE - metadata_len)])}
+            if i is 0:
+                chunk[METADATA] = data[METADATA]
             if size_chunked:
                 chunk[SEGMENT] = i
             else:
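A standalone sketch (build_segments, payload, meta, and max_chunk_size are assumed names, not ChunkStore's API) of the segmenting scheme the new write/__update code follows: every payload slice leaves room for the BSON-encoded metadata, and only segment 0 carries the metadata itself:

import bson
from bson.binary import Binary

def build_segments(payload, meta, max_chunk_size):
    """Split raw payload bytes into segment documents that each fit under the cap."""
    metadata_len = len(bson.BSON.encode(meta))
    step = max_chunk_size - metadata_len          # usable payload bytes per document
    segments = []
    for i in range(int(len(payload) / step + 1)):
        seg = {'data': Binary(payload[i * step: (i + 1) * step]),
               'segment': i}
        if i == 0:                                # metadata is stored once, on the first segment
            seg['metadata'] = meta
        segments.append(seg)
    return segments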

arctic/serialization/numpy_arrays.py (+1 -3)

@@ -77,8 +77,6 @@ def docify(self, df):
         df: DataFrame
             The Pandas DataFrame to encode
         """
-        doc = SON({DATA: {}, METADATA: {}})
-
         dtypes = {}
         masks = {}
         lengths = {}

@@ -108,12 +106,12 @@ def docify(self, df):
             start += len(d)
             data += d
 
+        doc = SON({DATA: data, METADATA: {}})
         doc[METADATA] = {COLUMNS: columns,
                          MASK: masks,
                          LENGTHS: lengths,
                          DTYPE: dtypes
                          }
-        doc[DATA] = data
 
         return doc
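For context (not in the diff), a rough illustration of why the metadata that docify produces can grow large: a string column with missing values gets a per-row mask, so the mask alone scales with the row count. The representation below is a simplified assumption, not arctic's exact encoding:

import pandas as pd

df = pd.DataFrame({'sym': ['a', None, 'c'] * 1000000})   # 3M rows, one string column
mask = df['sym'].isnull().values                          # one boolean flag per row
print(mask.nbytes)                                        # ~3 MB of mask data before any compression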
