
Commit d677ed1

Merge pull request pandas-dev#533 from manahl/issue-490
Issue pandas-dev#490: Make arctic compatible with numpy 1.14
2 parents cb2024b + 15bbc3b commit d677ed1

File tree

10 files changed: +162 -33 lines changed

CHANGES.md
Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 
 ### 1.64
   * Bugfix: #531 arctic_prune_versions: clean broken snapshot references before pruning
+  * Feature: #490 add support to numpy 1.14
 
 ### 1.63 (2018-04-06)
   * Bugfix: #521 Clang 6.0 compiler support on macOS

arctic/serialization/numpy_arrays.py
Lines changed: 5 additions & 3 deletions

@@ -139,15 +139,17 @@ def objify(self, doc, columns=None):
         for col in cols:
             d = decompress(doc[DATA][doc[METADATA][LENGTHS][col][0]: doc[METADATA][LENGTHS][col][1] + 1])
-            d = np.fromstring(d, doc[METADATA][DTYPE][col])
+            # d is read-only but that's not an issue since DataFrame will copy the data anyway.
+            d = np.frombuffer(d, doc[METADATA][DTYPE][col])
 
             if MASK in doc[METADATA] and col in doc[METADATA][MASK]:
                 mask_data = decompress(doc[METADATA][MASK][col])
-                mask = np.fromstring(mask_data, 'bool')
+                mask = np.frombuffer(mask_data, 'bool')
                 d = ma.masked_array(d, mask)
             data[col] = d
 
-        return pd.DataFrame(data, columns=cols)[cols]
+        # Copy so the returned DataFrame owns writable data rather than the read-only buffers.
+        return pd.DataFrame(data, columns=cols, copy=True)[cols]
 
 
 class FrametoArraySerializer(Serializer):
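Why the frombuffer switch is safe here, as a minimal sketch with toy data (the buffer and column name below are illustrative, not from the commit):

```python
import numpy as np
import pandas as pd

raw = np.arange(3, dtype='int64').tobytes()

# np.frombuffer wraps the bytes object directly, so the resulting array is
# read-only; the removed np.fromstring returned a fresh writable copy instead.
arr = np.frombuffer(raw, dtype='int64')
assert not arr.flags.writeable

# copy=True forces the DataFrame to take its own writable copy of the data,
# which is why objify can safely hand the frame to callers that mutate it.
df = pd.DataFrame({'col': arr}, copy=True)
df['col'] = 7  # mutates the DataFrame's copy, not the original buffer
```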

arctic/store/_ndarray_store.py
Lines changed: 48 additions & 19 deletions

@@ -62,6 +62,44 @@ def _attempt_update_unchanged(symbol, unchanged_segment_ids, collection, version
             symbol, previous_version['version'], result.matched_count, len(unchanged_segment_ids)))
 
 
+def _resize_with_dtype(arr, dtype):
+    """
+    This function will transform arr into an array with the same type as dtype. It will do this by
+    filling new columns with zeros (or NaNs, if it is a float column). Also, columns that are not
+    in the new dtype will be dropped.
+    """
+    structured_arrays = dtype.names is not None and arr.dtype.names is not None
+    old_columns = set(arr.dtype.names or [])
+    new_columns = set(dtype.names or [])
+
+    # In numpy 1.9 the ndarray.astype method used to handle changes in the number of fields. The code
+    # below should replicate the behaviour the old astype used to have.
+    #
+    # One may be tempted to use np.lib.recfunctions.stack_arrays to implement both this step and the
+    # concatenate that follows, but it is 2x slower and it requires providing your own default values
+    # (instead of np.zeros).
+    #
+    # Numpy 1.14 supports doing new_arr[old_columns] = arr[old_columns], which is faster than the code
+    # below (in benchmarks it seems to be even slightly faster than using the old astype). However,
+    # that is not supported by numpy 1.9.2.
+    if structured_arrays and (old_columns != new_columns):
+        new_arr = np.zeros(arr.shape, dtype)
+        for c in old_columns & new_columns:
+            new_arr[c] = arr[c]
+
+        # missing float columns should default to nan rather than zero
+        _is_float_type = lambda _dtype: _dtype.type in (np.float32, np.float64)
+        _is_void_float_type = lambda _dtype: _dtype.type == np.void and _is_float_type(_dtype.subdtype[0])
+        _is_float_or_void_float_type = lambda _dtype: _is_float_type(_dtype) or _is_void_float_type(_dtype)
+        _is_float = lambda column: _is_float_or_void_float_type(dtype.fields[column][0])
+        for new_column in filter(_is_float, new_columns - old_columns):
+            new_arr[new_column] = np.nan
+    else:
+        new_arr = arr.astype(dtype)
+
+    return new_arr
+
+
 class NdarrayStore(object):
     """Chunked store for arbitrary ndarrays, supporting append.
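A minimal hand-rolled sketch of the transformation `_resize_with_dtype` performs on a structured array (the dtypes below are illustrative):

```python
import numpy as np

old = np.array([(1, 2.5)], dtype=[('a', '<i8'), ('b', '<f8')])
new_dtype = np.dtype([('a', '<i8'), ('c', '<f8')])

# Shared column 'a' is copied across, dropped column 'b' disappears,
# and the new float column 'c' defaults to NaN rather than zero.
resized = np.zeros(old.shape, new_dtype)
resized['a'] = old['a']
resized['c'] = np.nan
# resized == array([(1, nan)], dtype=[('a', '<i8'), ('c', '<f8')])
```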
@@ -210,23 +248,18 @@ def _do_read(self, collection, version, symbol, index_range=None):
         else:
             segment_count = version.get('segment_count', None)
 
-        segments = []
+        data = bytearray()
         i = -1
         for i, x in enumerate(collection.find(spec, sort=[('segment', pymongo.ASCENDING)],)):
-            segments.append(decompress(x['data']) if x['compressed'] else x['data'])
-
-        data = b''.join(segments)
-
-        # free up memory from initial copy of data
-        del segments
+            data.extend(decompress(x['data']) if x['compressed'] else x['data'])
 
         # Check that the correct number of segments has been returned
         if segment_count is not None and i + 1 != segment_count:
             raise OperationFailure("Incorrect number of segments returned for {}:{}. Expected: {}, but got {}. {}".format(
                 symbol, version['version'], segment_count, i + 1, collection.database.name + '.' + collection.name))
 
         dtype = self._dtype(version['dtype'], version.get('dtype_metadata', {}))
-        rtn = np.fromstring(data, dtype=dtype).reshape(version.get('shape', (-1)))
+        rtn = np.frombuffer(data, dtype=dtype).reshape(version.get('shape', (-1)))
         return rtn
 
     def _promote_types(self, dtype, dtype_str):
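A minimal sketch of the buffer-accumulation pattern the new `_do_read` relies on (toy segments, not real Mongo documents): `bytearray.extend` copies each segment into one growing buffer, replacing both the segment list and the `b''.join` copy, and `np.frombuffer` over a mutable bytearray yields a writable array:

```python
import numpy as np

segments = [b'\x01\x00', b'\x02\x00']  # two little-endian uint16 values

# One growing buffer instead of a list of segments plus b''.join:
# the segment bytes are copied only once, on extend.
data = bytearray()
for seg in segments:
    data.extend(seg)

# frombuffer over a mutable bytearray yields a writable array,
# unlike frombuffer over an immutable bytes object.
arr = np.frombuffer(data, dtype='<u2')
assert arr.flags.writeable
assert arr.tolist() == [1, 2]
```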
@@ -247,6 +280,9 @@ def append(self, arctic_lib, version, symbol, item, previous_version, dtype=None
 
         if not dtype:
             dtype = item.dtype
+
+        if (self._dtype(previous_version['dtype']).fields is None) != (dtype.fields is None):
+            raise ValueError("type changes to or from structured array not supported")
 
         if previous_version['up_to'] == 0:
             dtype = dtype

@@ -263,17 +299,10 @@ def append(self, arctic_lib, version, symbol, item, previous_version, dtype=None
         version['dtype_metadata'] = dict(dtype.metadata or {})
         version['type'] = self.TYPE
 
-        old_arr = self._do_read(collection, previous_version, symbol).astype(dtype)
-        # missing float columns should default to nan rather than zero
-        old_dtype = self._dtype(previous_version['dtype'])
-        if dtype.names is not None and old_dtype.names is not None:
-            new_columns = set(dtype.names) - set(old_dtype.names)
-            _is_float_type = lambda _dtype: _dtype.type in (np.float32, np.float64)
-            _is_void_float_type = lambda _dtype: _dtype.type == np.void and _is_float_type(_dtype.subdtype[0])
-            _is_float_or_void_float_type = lambda _dtype: _is_float_type(_dtype) or _is_void_float_type(_dtype)
-            _is_float = lambda column: _is_float_or_void_float_type(dtype.fields[column][0])
-            for new_column in filter(_is_float, new_columns):
-                old_arr[new_column] = np.nan
+        # _resize_with_dtype would drop columns read from the previous version if they were not found
+        # in the new append. However, _promote_types raises an exception in that case, so this code
+        # is not reached with dropped columns.
+        old_arr = _resize_with_dtype(self._do_read(collection, previous_version, symbol), dtype)
 
         item = np.concatenate([old_arr, item])
         version['up_to'] = len(item)
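A minimal sketch of the new guard in isolation (illustrative dtypes): `dtype.fields` is `None` for plain dtypes and a mapping for structured ones, so comparing `fields is None` on both sides detects a plain-to-structured switch in either direction:

```python
import numpy as np

previous_dtype = np.dtype('int64')        # a plain array was stored previously
append_dtype = np.dtype([('a', '<i8')])   # a structured array is being appended

# Exactly one side is structured, so the append is rejected up front
# instead of failing later inside the concatenate.
if (previous_dtype.fields is None) != (append_dtype.fields is None):
    raise ValueError("type changes to or from structured array not supported")
```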

arctic/store/_pandas_ndarray_store.py
Lines changed: 4 additions & 2 deletions

@@ -51,7 +51,8 @@ def _segment_index(self, recarr, existing_index, start, new_segments):
                                  dtype=INDEX_DTYPE)
         # append to existing index if exists
         if existing_index:
-            existing_index_arr = np.fromstring(decompress(existing_index), dtype=INDEX_DTYPE)
+            # existing_index_arr is read-only but it's never written to
+            existing_index_arr = np.frombuffer(decompress(existing_index), dtype=INDEX_DTYPE)
             if start > 0:
                 existing_index_arr = existing_index_arr[existing_index_arr['index'] < start]
             index = np.concatenate((existing_index_arr, index))

@@ -74,7 +75,8 @@ def _index_range(self, version, symbol, date_range=None, **kwargs):
            with the date_range. As the segment index is (id -> last datetime)
            we need to take care in choosing the correct chunks. """
         if date_range and 'segment_index' in version:
-            index = np.fromstring(decompress(version['segment_index']), dtype=INDEX_DTYPE)
+            # index is read-only but it's never written to
+            index = np.frombuffer(decompress(version['segment_index']), dtype=INDEX_DTYPE)
             dtcol = self._datetime64_index(index)
             if dtcol and len(index):
                 dts = index[dtcol]
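A minimal sketch (illustrative data) of why the read-only flag does not leak past these call sites: boolean masking and `np.concatenate` both allocate fresh, writable arrays:

```python
import numpy as np

ro = np.frombuffer(np.arange(4, dtype='int64').tobytes(), dtype='int64')
assert not ro.flags.writeable

# Masking copies the selected elements; concatenate allocates a new array.
# Neither operation propagates the read-only flag of the frombuffer input.
filtered = ro[ro < 2]
combined = np.concatenate((filtered, np.array([9], dtype='int64')))
assert filtered.flags.writeable and combined.flags.writeable
```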

arctic/tickstore/tickstore.py
Lines changed: 9 additions & 4 deletions

@@ -435,7 +435,8 @@ def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_i
         rtn = {}
         if doc[VERSION] != 3:
             raise ArcticException("Unhandled document version: %s" % doc[VERSION])
-        rtn[INDEX] = np.cumsum(np.fromstring(decompress(doc[INDEX]), dtype='uint64'))
+        # np.cumsum copies the read-only array created with frombuffer
+        rtn[INDEX] = np.cumsum(np.frombuffer(decompress(doc[INDEX]), dtype='uint64'))
         doc_length = len(rtn[INDEX])
         column_set.update(doc[COLUMNS].keys())

@@ -444,7 +445,8 @@ def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_i
         for c in column_set:
             try:
                 coldata = doc[COLUMNS][c]
-                mask = np.fromstring(decompress(coldata[ROWMASK]), dtype='uint8')
+                # the | operation below will make a copy of this read-only array
+                mask = np.frombuffer(decompress(coldata[ROWMASK]), dtype='uint8')
                 union_mask = union_mask | mask
             except KeyError:
                 rtn[c] = None

@@ -460,10 +462,13 @@ def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_i
             try:
                 coldata = doc[COLUMNS][c]
                 dtype = np.dtype(coldata[DTYPE])
-                values = np.fromstring(decompress(coldata[DATA]), dtype=dtype)
+                # values ends up being copied by pandas before being returned to the user. However, we
+                # copy it into a bytearray here for safety.
+                values = np.frombuffer(bytearray(decompress(coldata[DATA])), dtype=dtype)
                 self._set_or_promote_dtype(column_dtypes, c, dtype)
                 rtn[c] = self._empty(rtn_length, dtype=column_dtypes[c])
-                rowmask = np.unpackbits(np.fromstring(decompress(coldata[ROWMASK]),
+                # unpackbits will make a copy of the read-only array created by frombuffer
+                rowmask = np.unpackbits(np.frombuffer(decompress(coldata[ROWMASK]),
                                                       dtype='uint8'))[:doc_length].astype('bool')
                 rowmask = rowmask[union_mask]
                 rtn[c][rowmask] = values
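A minimal sketch (illustrative bits) of the rowmask path: `np.unpackbits` returns a brand-new array, so the read-only `frombuffer` input is never mutated by the slicing and `astype` that follow:

```python
import numpy as np

packed = np.packbits(np.array([1, 0, 1, 1, 0, 0, 0, 0], dtype='uint8')).tobytes()

# frombuffer over bytes is read-only, but unpackbits allocates fresh output,
# so everything downstream operates on a writable copy.
rowmask = np.unpackbits(np.frombuffer(packed, dtype='uint8'))[:4].astype('bool')
assert rowmask.tolist() == [True, False, True, True]
```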

setup.py
Lines changed: 1 addition & 1 deletion

@@ -139,7 +139,7 @@ def extensions():
     ext_modules=defer_cythonize(extensions),
     setup_requires=["six",
                     "cython",
-                    "numpy<=1.13.3",
+                    "numpy",
                     "setuptools-git",
                     ],
     install_requires=["cython",

tests/integration/store/test_ndarray_store_append.py
Lines changed: 52 additions & 0 deletions

@@ -1,6 +1,7 @@
 import bson
 import numpy as np
 from numpy.testing import assert_equal
+import pytest
 
 from arctic.store._ndarray_store import NdarrayStore, _APPEND_COUNT
 from arctic.store.version_store import register_versioned_storage

@@ -46,6 +47,40 @@ def test_promote_types2(library):
     assert np.all(ndarr.astype([('abc', np.promote_types('float64', 'int64'))]) == saved_arr)
 
 
+def test_promote_types_smaller_sizes(library):
+    library.write('MYARR', np.ones(100, dtype='int64'))
+    library.append('MYARR', np.ones(100, dtype='int32'))
+    saved_arr = library.read('MYARR').data
+    assert np.all(np.ones(200, dtype='int64') == saved_arr)
+
+
+def test_promote_types_larger_sizes(library):
+    library.write('MYARR', np.ones(100, dtype='int32'))
+    library.append('MYARR', np.ones(100, dtype='int64'))
+    saved_arr = library.read('MYARR').data
+    assert np.all(np.ones(200, dtype='int64') == saved_arr)
+
+
+def test_promote_field_types_smaller_sizes(library):
+    arr = np.array([(3, 7)], dtype=[('a', '<i8'), ('b', '<i8')])
+    library.write('MYARR', arr)
+    arr = np.array([(9, 8)], dtype=[('a', '<i4'), ('b', '<i8')])
+    library.append('MYARR', arr)
+    saved_arr = library.read('MYARR').data
+    expected = np.array([(3, 7), (9, 8)], dtype=[('a', '<i8'), ('b', '<i8')])
+    assert np.all(saved_arr == expected)
+
+
+def test_promote_field_types_larger_sizes(library):
+    arr = np.array([(3, 7)], dtype=[('a', '<i4'), ('b', '<i8')])
+    library.write('MYARR', arr)
+    arr = np.array([(9, 8)], dtype=[('a', '<i8'), ('b', '<i8')])
+    library.append('MYARR', arr)
+    saved_arr = library.read('MYARR').data
+    expected = np.array([(3, 7), (9, 8)], dtype=[('a', '<i8'), ('b', '<i8')])
+    assert np.all(saved_arr == expected)
+
+
 def test_append_ndarray_with_field_shape(library):
     ndarr = np.empty(10, dtype=[('A', 'int64'), ('B', 'float64', (2,))])
     ndarr['A'] = 1

@@ -131,6 +166,15 @@ def test_append_too_large_ndarray(library):
     assert np.all(np.concatenate([ndarr, ndarr]) == saved_arr)
 
 
+def test_empty_field_append_keeps_all_columns(library):
+    ndarr = np.array([(3, 5)], dtype=[('a', '<i'), ('b', '<i')])
+    ndarr2 = np.array([], dtype=[('a', '<i')])
+    library.write('MYARR', ndarr)
+    library.append('MYARR', ndarr2)
+    saved_arr = library.read('MYARR').data
+    assert np.all(saved_arr == np.array([(3, 5)], dtype=[('a', '<i'), ('b', '<i')]))
+
+
 def test_empty_append_promotes_dtype(library):
     ndarr = np.array(["a", "b", "c"])
     ndarr2 = np.array([])

@@ -160,6 +204,14 @@ def test_empty_append_promotes_dtype3(library):
     assert np.all(saved_arr == np.hstack((ndarr2, ndarr2)))
 
 
+def test_convert_to_structured_array(library):
+    arr = np.ones(100, dtype='int64')
+    library.write('MYARR', arr)
+    arr = np.array([(6,)], dtype=[('a', '<i8')])
+    with pytest.raises(ValueError):
+        library.append('MYARR', arr)
+
+
 def test_empty_append_concat_and_rewrite(library):
     ndarr = np.array([])
     ndarr2 = np.array(["a", "b", "c"])

tests/integration/tickstore/test_ts_read.py
Lines changed: 29 additions & 0 deletions

@@ -45,6 +45,35 @@ def test_read(tickstore_lib):
     assert tickstore_lib._collection.find_one()['c'] == 2
 
 
+def test_read_data_is_modifiable(tickstore_lib):
+    data = [{'ASK': 1545.25,
+             'ASKSIZE': 1002.0,
+             'BID': 1545.0,
+             'BIDSIZE': 55.0,
+             'CUMVOL': 2187387.0,
+             'DELETED_TIME': 0,
+             'INSTRTYPE': 'FUT',
+             'PRICE': 1545.0,
+             'SIZE': 1.0,
+             'TICK_STATUS': 0,
+             'TRADEHIGH': 1561.75,
+             'TRADELOW': 1537.25,
+             'index': 1185076787070},
+            {'CUMVOL': 354.0,
+             'DELETED_TIME': 0,
+             'PRICE': 1543.75,
+             'SIZE': 354.0,
+             'TRADEHIGH': 1543.75,
+             'TRADELOW': 1543.75,
+             'index': 1185141600600}]
+    tickstore_lib.write('FEED::SYMBOL', data)
+
+    df = tickstore_lib.read('FEED::SYMBOL', columns=['BID', 'ASK', 'PRICE'])
+
+    df[['BID', 'ASK', 'PRICE']] = 7
+    assert np.all(df[['BID', 'ASK', 'PRICE']].values == np.array([[7, 7, 7], [7, 7, 7]]))
+
+
 def test_read_allow_secondary(tickstore_lib):
     data = [{'ASK': 1545.25,
              'ASKSIZE': 1002.0,

tests/unit/serialization/test_numpy_arrays.py
Lines changed: 9 additions & 0 deletions

@@ -92,3 +92,12 @@ def test_multi_column_fail():
     with pytest.raises(Exception) as e:
         n.deserialize(a, columns=['A', 'B'])
     assert('Duplicate' in str(e))
+
+
+def test_dataframe_writable_after_objify():
+    f = FrameConverter()
+    df = pd.DataFrame(data={'one': [5, 6, 2]})
+    df = f.objify(f.docify(df))
+    df['one'] = 7
+
+    assert np.all(df['one'].values == np.array([7, 7, 7]))

tests/unit/tickstore/test_tickstore.py
Lines changed: 4 additions & 4 deletions

@@ -94,7 +94,7 @@ def test_tickstore_to_bucket_with_image():
     assert get_coldata(bucket[COLUMNS]['B']) == ([27.2], [0, 1, 0, 0, 0, 0, 0, 0])
     assert get_coldata(bucket[COLUMNS]['D']) == ([0], [1, 0, 0, 0, 0, 0, 0, 0])
     index = [dt.fromtimestamp(int(i/1000)).replace(tzinfo=mktz(tz)) for i in
-             list(np.cumsum(np.fromstring(decompress(bucket[INDEX]), dtype='uint64')))]
+             list(np.cumsum(np.frombuffer(decompress(bucket[INDEX]), dtype='uint64')))]
     assert index == [i['index'] for i in data]
     assert bucket[COLUMNS]['A'][DTYPE] == 'int64'
     assert bucket[COLUMNS]['B'][DTYPE] == 'float64'

@@ -128,8 +128,8 @@ def test_tickstore_to_bucket_always_forwards_image():
 def get_coldata(coldata):
     """ return values and rowmask """
     dtype = np.dtype(coldata[DTYPE])
-    values = np.fromstring(decompress(coldata[DATA]), dtype=dtype)
-    rowmask = np.unpackbits(np.fromstring(decompress(coldata[ROWMASK]), dtype='uint8'))
+    values = np.frombuffer(decompress(coldata[DATA]), dtype=dtype)
+    rowmask = np.unpackbits(np.frombuffer(decompress(coldata[ROWMASK]), dtype='uint8'))
     return list(values), list(rowmask)
 
 
@@ -159,7 +159,7 @@ def test_tickstore_pandas_to_bucket_image():
     assert values[0] == 1 and values[2] == 1
     assert rowmask == [1, 1, 1, 0, 0, 0, 0, 0]
     index = [dt.fromtimestamp(int(i/1000)).replace(tzinfo=mktz(tz)) for i in
-             list(np.cumsum(np.fromstring(decompress(bucket[INDEX]), dtype='uint64')))]
+             list(np.cumsum(np.frombuffer(decompress(bucket[INDEX]), dtype='uint64')))]
     assert index == tick_index
     assert bucket[COLUMNS]['A'][DTYPE] == 'int64'
     assert bucket[COLUMNS]['B'][DTYPE] == 'float64'
