
Commit 8f1d873

Merge pull request pandas-dev#33 from manahl/pandas_daterange
Pandas DateRange query support
2 parents a3668f1 + ef79873 commit 8f1d873

7 files changed: +385 −24 lines changed

CHANGES.md

Lines changed: 5 additions & 0 deletions

@@ -1,6 +1,11 @@
 ## Changelog
 
+### 1.7 (2015-09-18)
+
+  * Feature: Add support for reading a subset of a pandas DataFrame
+    in VersionStore.read by passing in an arctic.date.DateRange
+
 ### 1.6 (2015-09-16)
 
   * Feature: Add support for multi-index Bitemporal DataFrame storage.
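A hedged usage sketch of the new feature (the Mongo host, library name 'user.eod', and symbol 'SYM' are illustrative, not taken from this commit):

from datetime import datetime as dt

from arctic import Arctic
from arctic.date import DateRange

library = Arctic('localhost')['user.eod']  # hypothetical library holding a DataFrame under 'SYM'
subset = library.read('SYM', date_range=DateRange(dt(2015, 1, 1), dt(2015, 6, 1))).data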

README.md

Lines changed: 1 addition & 1 deletion

@@ -120,7 +120,7 @@ It wouldn't be possible without the work of the AHL Data Engineering Team includ
 * [Richard Bounds](https://github.com/richardbounds)
 * [James Blackburn](https://github.com/jamesblackburn)
 * [Vlad Mereuta](https://github.com/vmereuta)
-* Tom Taylor
+* [Tom Taylor](https://github.com/TomTaylorLondon)
 * Tope Olukemi
 * Drake Siard
 * [Slavi Marinov](https://github.com/slavi)

arctic/store/_ndarray_store.py

Lines changed: 93 additions & 7 deletions

@@ -37,7 +37,70 @@ def _promote(type1, type2):
 
 
 class NdarrayStore(object):
-    """Chunked store for arbitrary ndarrays, supporting append."""
+    """Chunked store for arbitrary ndarrays, supporting append.
+
+    For the simple example:
+        dat = np.empty(10)
+        library.write('test', dat)   # version 1
+        library.append('test', dat)  # version 2
+
+    version documents:
+
+    [
+     {u'_id': ObjectId('55fa9a7781f12654382e58b8'),
+      u'symbol': u'test',
+      u'version': 1,
+      u'type': u'ndarray',
+      u'up_to': 10,  # no. of rows included in the data for this version
+      u'append_count': 0,
+      u'append_size': 0,
+      u'base_sha': Binary('........', 0),
+      u'dtype': u'float64',
+      u'dtype_metadata': {},
+      u'segment_count': 1,  # only 1 segment included in this version
+      u'sha': Binary('.........', 0),
+      u'shape': [-1],
+      },
+
+     {u'_id': ObjectId('55fa9aa981f12654382e58ba'),
+      u'symbol': u'test',
+      u'version': 2,
+      u'type': u'ndarray',
+      u'up_to': 20,  # no. of rows included in the data for this version
+      u'append_count': 1,  # 1 append operation so far
+      u'append_size': 80,  # 80 bytes appended
+      u'base_sha': Binary('.........', 0),  # equal to sha for version 1
+      u'base_version_id': ObjectId('55fa9a7781f12654382e58b8'),  # _id of version 1
+      u'dtype': u'float64',
+      u'dtype_metadata': {},
+      u'segment_count': 2,  # 2 segments included in this version
+      },
+    ]
+
+    segment documents:
+
+    [
+     # first chunk written:
+     {u'_id': ObjectId('55fa9a778b376a68efdd10e3'),
+      u'compressed': True,  # data is lz4 compressed on write()
+      u'data': Binary('...........', 0),
+      u'parent': [ObjectId('55fa9a7781f12654382e58b8')],
+      u'segment': 9,  # 10 rows in the data up to this segment, so last row is 9
+      u'sha': Binary('.............', 0),  # checksum of (symbol, {'data':.., 'compressed':.., 'segment':...})
+      u'symbol': u'test'},
+
+     # second chunk appended:
+     {u'_id': ObjectId('55fa9aa98b376a68efdd10e6'),
+      u'compressed': False,  # no initial compression for append()
+      u'data': Binary('...........', 0),
+      u'parent': [ObjectId('55fa9a7781f12654382e58b8')],
+      u'segment': 19,  # 20 rows in the data up to this segment, so last row is 19
+      u'sha': Binary('............', 0),  # checksum of (symbol, {'data':.., 'compressed':.., 'segment':...})
+      u'symbol': u'test'},
+    ]
+    """
     TYPE = 'ndarray'
 
     @classmethod
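A minimal sketch of the write/append walkthrough in the docstring above, assuming a mongod on localhost and the public Arctic API of this release (the library name 'user.test' is made up):

import numpy as np
from arctic import Arctic

store = Arctic('localhost')              # connects to mongod on localhost:27017
store.initialize_library('user.test')    # hypothetical library name
library = store['user.test']

dat = np.empty(10)
library.write('test', dat)    # version 1: up_to=10, segment_count=1
library.append('test', dat)   # version 2: up_to=20, append_count=1, segment_count=2
print(library.read('test').data.shape)   # -> (20,)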
@@ -117,16 +180,22 @@ def read(self, arctic_lib, version, symbol, read_preference=None, **kwargs):
         return self._do_read(collection, version, symbol, index_range=index_range)
 
     def _do_read(self, collection, version, symbol, index_range=None):
+        '''
+        index_range is a 2-tuple of integers - a [from, to) range of segments to be read.
+        Either from or to can be None, indicating no bound.
+        '''
         from_index = index_range[0] if index_range else None
-        to_index = index_range[1] if index_range and index_range[1] is not None \
-                                  and index_range[1] < version['up_to'] else version['up_to']
+        to_index = version['up_to']
+        if index_range and index_range[1] and index_range[1] < version['up_to']:
+            to_index = index_range[1]
         segment_count = None
 
         spec = {'symbol': symbol,
                 'parent': version.get('base_version_id', version['_id']),
-                'segment': {'$lt': to_index}}
+                'segment': {'$lt': to_index}
+                }
         if from_index:
-            spec['segment'] = {'$lt': version['up_to'], '$gte': from_index}
+            spec['segment']['$gte'] = from_index
         else:
             segment_count = version.get('segment_count', None)
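To make the query construction concrete, here is a hedged re-run of the logic above with made-up inputs (the symbol and ObjectIds mirror the docstring example; nothing is read from a real database):

from bson import ObjectId

# Hypothetical inputs: reading segments [5, 15) of version 2 of 'test'
version = {'_id': ObjectId('55fa9aa981f12654382e58ba'),
           'base_version_id': ObjectId('55fa9a7781f12654382e58b8'),
           'up_to': 20}
index_range = (5, 15)   # [from, to) segment range

from_index = index_range[0] if index_range else None
to_index = version['up_to']
if index_range and index_range[1] and index_range[1] < version['up_to']:
    to_index = index_range[1]

spec = {'symbol': 'test',
        'parent': version.get('base_version_id', version['_id']),
        'segment': {'$lt': to_index}}
if from_index:
    spec['segment']['$gte'] = from_index

# A single Mongo filter now covers the half-open segment range:
print(spec['segment'])   # -> {'$lt': 15, '$gte': 5}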

@@ -389,5 +458,22 @@ def _do_write(self, collection, version, symbol, item, previous_version, segment
 
         self.check_written(collection, symbol, version)
 
-    def _segment_index(self, item, existing_index, start, new_segments):
-        pass
+    def _segment_index(self, new_data, existing_index, start, new_segments):
+        """
+        Generate a segment index which can be used to subselect data in _index_range.
+        This function must handle both generation of the index and appending to an existing index.
+
+        Parameters:
+        -----------
+        new_data: new data being written (or appended)
+        existing_index: index field from the versions document of the previous version
+        start: first (0-based) offset of the new data
+        new_segments: list of offsets. Each offset is the row index of the
+            last row of a particular chunk relative to the start of the _original_ item.
+            array(new_segments) - start = array(offsets in new_data)
+
+        Returns:
+        --------
+        Library-specific index metadata to be stored in the version document.
+        """
+        pass  # numpy arrays have no index
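A tiny worked example of the start/new_segments convention, using hypothetical numbers for the append case from the docstring above (10 original rows, 10 appended in one chunk):

start = 10            # first 0-based offset of the appended data within the item
new_segments = [19]   # last row of the new chunk, relative to the original item

# Offsets of those rows within the appended data itself:
offsets_in_new_data = [s - start for s in new_segments]
print(offsets_in_new_data)   # -> [9]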

arctic/store/_pandas_ndarray_store.py

Lines changed: 108 additions & 2 deletions

@@ -1,18 +1,27 @@
 import logging
 
-from _ndarray_store import NdarrayStore
+from bson.binary import Binary
 from pandas import DataFrame, MultiIndex, Series, DatetimeIndex, Panel
 from pandas.tslib import Timestamp, get_timezone
 import numpy as np
 
+from .._compression import compress, decompress
+from ..exceptions import ArcticException
+from ._ndarray_store import NdarrayStore
+from ..date._util import to_pandas_closed_closed
+
 log = logging.getLogger(__name__)
 
+DTN64_DTYPE = 'datetime64[ns]'
+
+INDEX_DTYPE = [('datetime', DTN64_DTYPE), ('index', 'i8')]
+
 
 def _to_primitive(arr):
     if arr.dtype.hasobject:
         if len(arr) > 0:
             if isinstance(arr[0], Timestamp):
-                return arr.astype('datetime64[ns]')
+                return arr.astype(DTN64_DTYPE)
         return np.array(list(arr))
     return arr
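A minimal sketch of how INDEX_DTYPE round-trips through lz4 compression and BSON, using the arctic._compression helpers imported above (the values are illustrative; lz4 must be installed):

import numpy as np
from bson.binary import Binary
from arctic._compression import compress, decompress

INDEX_DTYPE = [('datetime', 'datetime64[ns]'), ('index', 'i8')]
arr = np.array([(np.datetime64('2015-01-10'), 9)], dtype=INDEX_DTYPE)

blob = Binary(compress(arr.tostring()))   # as stored in the version document
roundtrip = np.fromstring(decompress(blob), dtype=INDEX_DTYPE)
assert (roundtrip == arr).all()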

@@ -102,6 +111,103 @@ def can_convert_to_records_without_objects(self, df, symbol):
         else:
             return True
 
+    def _segment_index(self, recarr, existing_index, start, new_segments):
+        """
+        Generate an index of datetime64 -> item offset.
+
+        Parameters:
+        -----------
+        recarr: new data being written (or appended)
+        existing_index: index field from the versions document of the previous version
+        start: first (0-based) offset of the new data
+        new_segments: list of offsets. Each offset is the row index of the
+            last row of a particular chunk relative to the start of the _original_ item.
+            array(new_segments) - start = array(offsets in recarr)
+
+        Returns:
+        --------
+        Binary(compress(array([(datetime, index)])))
+        Where index is the 0-based index of the datetime in the DataFrame
+        """
+        # find the name of the first datetime64 column
+        idx_col = self._datetime64_index(recarr)
+        # if one exists let's create the index on it
+        if idx_col is not None:
+            new_segments = np.array(new_segments, dtype='i8')
+            last_rows = recarr[new_segments - start]
+            # create numpy index
+            index = np.core.records.fromarrays([last_rows[idx_col]]
+                                               + [new_segments, ],
+                                               dtype=INDEX_DTYPE)
+            # append to the existing index if one exists
+            if existing_index:
+                existing_index_arr = np.fromstring(decompress(existing_index), dtype=INDEX_DTYPE)
+                if start > 0:
+                    existing_index_arr = existing_index_arr[existing_index_arr['index'] < start]
+                index = np.concatenate((existing_index_arr, index))
+            return Binary(compress(index.tostring()))
+        elif existing_index:
+            raise ArcticException("Could not find datetime64 index in item but existing data contains one")
+        return None
+
+    def _datetime64_index(self, recarr):
+        """ Given a np.recarray find the name of the first datetime64 column """
+        # TODO: Handle multi-indexes
+        names = recarr.dtype.names
+        for name in names:
+            if recarr[name].dtype == DTN64_DTYPE:
+                return name
+        return None
+
+    def _index_range(self, version, symbol, date_range=None, **kwargs):
+        """ Given a version, read the segment_index and return the chunks associated
+        with the date_range. As the segment index is (id -> last datetime)
+        we need to take care in choosing the correct chunks. """
+        if date_range and 'segment_index' in version:
+            index = np.fromstring(decompress(version['segment_index']), dtype=INDEX_DTYPE)
+            dtcol = self._datetime64_index(index)
+            if dtcol and len(index):
+                dts = index[dtcol]
+                start, end = _start_end(date_range, dts)
+                if start > dts[-1]:
+                    return -1, -1
+                idxstart = min(np.searchsorted(dts, start), len(dts))
+                idxend = min(np.searchsorted(dts, end), len(dts))
+                return index['index'][idxstart], index['index'][idxend] + 1
+        return super(PandasStore, self)._index_range(version, symbol, **kwargs)
+
+    def _daterange(self, recarr, date_range):
+        """ Given a recarr, slice out the given arctic.date.DateRange if a
+        datetime64 index exists """
+        idx = self._datetime64_index(recarr)
+        if idx and len(recarr):
+            dts = recarr[idx]
+            mask = Series(np.zeros(len(dts)), index=dts)
+            start, end = _start_end(date_range, dts)
+            mask[start:end] = 1.0
+            return recarr[mask.values.astype(bool)]
+        return recarr
+
+    def read(self, arctic_lib, version, symbol, read_preference=None, date_range=None, **kwargs):
+        item = super(PandasStore, self).read(arctic_lib, version, symbol, read_preference,
+                                             date_range=date_range, **kwargs)
+        if date_range:
+            item = self._daterange(item, date_range)
+        return item
+
+
+def _start_end(date_range, dts):
+    """
+    Return tuple: [start, end] of np.datetime64 dates that are inclusive of the passed
+    in datetimes.
+    """
+    # FIXME: timezones
+    assert len(dts)
+    date_range = to_pandas_closed_closed(date_range)
+    start = np.datetime64(date_range.start) if date_range.start else dts[0]
+    end = np.datetime64(date_range.end) if date_range.end else dts[-1]
+    return start, end
+
 
 class PandasSeriesStore(PandasStore):
     TYPE = 'pandasseries'
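A worked sketch of the segment index and the range lookup above, with illustrative data (two 10-row chunks of daily timestamps; nothing here comes from a real library):

import numpy as np

INDEX_DTYPE = [('datetime', 'datetime64[ns]'), ('index', 'i8')]

# 20 daily timestamps split into two chunks whose last rows are 9 and 19
dts = np.arange('2015-01-01', '2015-01-21', dtype='datetime64[D]').astype('datetime64[ns]')
segments = np.array([9, 19], dtype='i8')
index = np.core.records.fromarrays([dts[segments], segments], dtype=INDEX_DTYPE)

# _index_range-style lookup for 2015-01-05 .. 2015-01-12:
start, end = np.datetime64('2015-01-05'), np.datetime64('2015-01-12')
col = index['datetime']
idxstart = min(np.searchsorted(col, start), len(col))
idxend = min(np.searchsorted(col, end), len(col))
print(index['index'][idxstart], index['index'][idxend] + 1)   # -> 9 20

# Both chunks are fetched (segment 9 covers rows 0-9, segment 19 rows 10-19);
# _daterange then masks the rows down to the requested dates.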

arctic/store/version_store.py

Lines changed: 7 additions & 2 deletions

@@ -285,7 +285,7 @@ def _write_handler(self, version, symbol, data, **kwargs):
             handler = self._bson_handler
         return handler
 
-    def read(self, symbol, as_of=None, from_version=None, allow_secondary=None, **kwargs):
+    def read(self, symbol, as_of=None, date_range=None, from_version=None, allow_secondary=None, **kwargs):
         """
         Read data for the named symbol. Returns a VersionedItem object with
         a data and metadata element (as passed into write).
@@ -299,6 +299,9 @@ def read(self, symbol, as_of=None, from_version=None, allow_secondary=None, **kw
             `int` : specific version number
             `str` : snapshot name which contains the version
             `datetime.datetime` : the version of the data that existed as_of the requested point in time
+        date_range: `arctic.date.DateRange`
+            DateRange to read data for. Applies to Pandas data with a DateTime index;
+            returns only the part of the data that falls in the DateRange.
         allow_secondary : `bool` or `None`
             Override the default behavior for allowing reads from secondary members of a cluster:
             `None` : use the settings from the top-level `Arctic` object used to query this version store.
@@ -312,7 +315,8 @@ def read(self, symbol, as_of=None, from_version=None, allow_secondary=None, **kw
         try:
             read_preference = self._read_preference(allow_secondary)
             _version = self._read_metadata(symbol, as_of=as_of, read_preference=read_preference)
-            return self._do_read(symbol, _version, from_version, read_preference=read_preference, **kwargs)
+            return self._do_read(symbol, _version, from_version,
+                                 date_range=date_range, read_preference=read_preference, **kwargs)
         except (OperationFailure, AutoReconnect) as e:
             # Log the exception so we know how often this is happening
             log_exception('read', e, 1)
@@ -321,6 +325,7 @@ def read(self, symbol, as_of=None, from_version=None, allow_secondary=None, **kw
             _version = mongo_retry(self._read_metadata)(symbol, as_of=as_of,
                                                         read_preference=ReadPreference.PRIMARY)
             return self._do_read_retry(symbol, _version, from_version,
+                                       date_range=date_range,
                                        read_preference=ReadPreference.PRIMARY,
                                        **kwargs)
         except Exception, e:
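Two more hedged variations on the usage sketch after the CHANGES.md entry (the same hypothetical library and symbol are assumed):

from datetime import datetime as dt

from arctic import Arctic
from arctic.date import DateRange

library = Arctic('localhost')['user.eod']   # hypothetical library holding 'SYM'

# An open-ended range: a missing bound means unbounded on that side.
since_march = library.read('SYM', date_range=DateRange(start=dt(2015, 3, 1))).data

# date_range composes with as_of: slice a historical version of the data.
feb_v1 = library.read('SYM', as_of=1, date_range=DateRange(dt(2015, 2, 1), dt(2015, 2, 28))).data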
