Commit 9611531

Fix issue pandas-dev#292 - Metadata size not accounted for in ChunkStore size chunking
When ChunkStore chunks data for size (due to MongoDB's 16MB document limit), it was not including the size of the metadata in the size of the document. In general this is fine because the metadata is very small, but it can be quite large when array masks are included in the metadata (which happens when there are columns of strings with missing values).
1 parent 75c9c84 commit 9611531
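For context (this is not part of the commit), a minimal sketch of the arithmetic the fix changes; MAX_CHUNK_SIZE, data, and metadata below are assumed stand-ins for the values ChunkStore works with, not its actual API:

import bson

MAX_CHUNK_SIZE = 15 * 1024 * 1024  # assumed cap, kept below MongoDB's 16MB document limit

def chunk_count_before(data, metadata):
    # pre-fix: metadata size ignored, so a full-size data slice plus a
    # large metadata dict could push the final BSON document past the limit
    return int(len(data) / MAX_CHUNK_SIZE + 1)

def chunk_count_after(data, metadata):
    # post-fix: reserve room for the BSON-encoded metadata in every slice
    metadata_len = len(bson.BSON.encode(metadata))
    return int(len(data) / (MAX_CHUNK_SIZE - metadata_len) + 1)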

File tree

3 files changed: +15 -10 lines


CHANGES.md (+1)

@@ -2,6 +2,7 @@
 
 ### 1.34
 
+  * Bugfix: #292 Account for metadata size during size chunking in ChunkStore
   * Feature: #283 Support for all pandas frequency strings in ChunkStore DateChunker
   * Feature: #286 Add has_symbol to ChunkStore and support for partial symbol matching in list_symbols
 

arctic/chunkstore/chunkstore.py (+13 -7)

@@ -1,6 +1,7 @@
 import logging
 import pymongo
 import hashlib
+import bson
 
 from bson.binary import Binary
 from pandas import DataFrame, Series

@@ -283,10 +284,13 @@ def write(self, symbol, item, chunker=DateChunker(), **kwargs):
         doc[METADATA] = {'columns': data[METADATA][COLUMNS] if COLUMNS in data[METADATA] else ''}
         doc[CHUNK_SIZE] = chunk_size
 
-        size_chunked = len(data[DATA]) > MAX_CHUNK_SIZE
-        for i in xrange(int(len(data[DATA]) / MAX_CHUNK_SIZE + 1)):
-            chunk = {DATA: Binary(data[DATA][i * MAX_CHUNK_SIZE: (i + 1) * MAX_CHUNK_SIZE])}
-            chunk[METADATA] = data[METADATA]
+        metadata_len = len(bson.BSON.encode(data[METADATA]))
+
+        size_chunked = len(data[DATA]) + metadata_len > MAX_CHUNK_SIZE
+        for i in xrange(int(len(data[DATA]) / (MAX_CHUNK_SIZE - metadata_len) + 1)):
+            chunk = {DATA: Binary(data[DATA][i * (MAX_CHUNK_SIZE - metadata_len): (i + 1) * (MAX_CHUNK_SIZE - metadata_len)])}
+            if i is 0:
+                chunk[METADATA] = data[METADATA]
             if size_chunked:
                 chunk[SEGMENT] = i
             else:

@@ -363,7 +367,8 @@ def __update(self, sym, item, combine_method=None, chunk_range=None):
 
         # remove old segments for this chunk in case we now have less
         # segments than we did before
-        chunk_count = int(len(data[DATA]) / MAX_CHUNK_SIZE + 1)
+        metadata_len = len(bson.BSON.encode(data[METADATA]))
+        chunk_count = int(len(data[DATA]) / (MAX_CHUNK_SIZE - metadata_len) + 1)
         seg_count = self._collection.count({SYMBOL: symbol, START: start, END: end})
         if seg_count > chunk_count:
             # if chunk count is 1, the segment id will be -1, not 1

@@ -374,8 +379,9 @@ def __update(self, sym, item, combine_method=None, chunk_range=None):
 
         size_chunked = chunk_count > 1
         for i in xrange(chunk_count):
-            chunk = {DATA: Binary(data[DATA][i * MAX_CHUNK_SIZE: (i + 1) * MAX_CHUNK_SIZE])}
-            chunk[METADATA] = data[METADATA]
+            chunk = {DATA: Binary(data[DATA][i * (MAX_CHUNK_SIZE - metadata_len): (i + 1) * (MAX_CHUNK_SIZE - metadata_len)])}
+            if i is 0:
+                chunk[METADATA] = data[METADATA]
             if size_chunked:
                 chunk[SEGMENT] = i
             else:
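A standalone sketch (build_segments, payload, meta, and max_chunk_size are assumed names, not ChunkStore's API) of the segmenting scheme the new write/__update code follows: every payload slice leaves room for the BSON-encoded metadata, and only segment 0 carries the metadata itself:

import bson
from bson.binary import Binary

def build_segments(payload, meta, max_chunk_size):
    """Split raw payload bytes into segment documents that each fit under the cap."""
    metadata_len = len(bson.BSON.encode(meta))
    step = max_chunk_size - metadata_len          # usable payload bytes per document
    segments = []
    for i in range(int(len(payload) / step + 1)):
        seg = {'data': Binary(payload[i * step: (i + 1) * step]),
               'segment': i}
        if i == 0:                                # metadata is stored once, on the first segment
            seg['metadata'] = meta
        segments.append(seg)
    return segments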

arctic/serialization/numpy_arrays.py (+1 -3)

@@ -77,8 +77,6 @@ def docify(self, df):
         df: DataFrame
             The Pandas DataFrame to encode
         """
-        doc = SON({DATA: {}, METADATA: {}})
-
         dtypes = {}
         masks = {}
         lengths = {}

@@ -108,12 +106,12 @@ def docify(self, df):
             start += len(d)
             data += d
 
+        doc = SON({DATA: data, METADATA: {}})
         doc[METADATA] = {COLUMNS: columns,
                          MASK: masks,
                          LENGTHS: lengths,
                          DTYPE: dtypes
                          }
-        doc[DATA] = data
 
         return doc
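For context (not in the diff), a rough illustration of why the metadata that docify produces can grow large: a string column with missing values gets a per-row mask, so the mask alone scales with the row count. The representation below is a simplified assumption, not arctic's exact encoding:

import pandas as pd

df = pd.DataFrame({'sym': ['a', None, 'c'] * 1000000})   # 3M rows, one string column
mask = df['sym'].isnull().values                          # one boolean flag per row
print(mask.nbytes)                                        # ~3 MB of mask data before any compression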
