
Commit e6695da

Merge pull request pandas-dev#363 from manahl/add-concat-flag
Raise if appending out-of-order items; Add concat flag
2 parents 5621ce4 + b52c8ab

4 files changed (+79 -11 lines)

CHANGES.md

+3 -1
@@ -2,14 +2,16 @@
 
 ### 1.44
   * Feature: Expose compressHC from internal arctic LZ4 and remove external LZ4 dependency
+  * Feature: Appending data older than what already exists in the library will raise. Use `concat=True` to append only the
+    new bits
 
 ### 1.43 (2017-05-30)
   * Bugfix: #350 remove deprecated pandas calls
   * Bugfix: #360 version incorrect in empty append in VersionStore
   * Feature: #365 add generic BSON store
 
 ### 1.42 (2017-05-12)
-  * Bugfix: #346 fixed daterange subsetting error on very large dateframes in version store
+  * Bugfix: #346 fixed daterange subsetting error on very large dataframes in version store
   * Bugfix: #351 $size queries can't use indexes, use alternative queries
 
 ### 1.41 (2017-04-20)
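
In practice the new behaviour looks like the following sketch, assuming `library` is an Arctic `VersionStore` library; the symbol name and sample data are illustrative only. Appending data whose index starts at or before the last stored timestamp now raises, unless `concat=True` is passed, in which case only the non-overlapping tail is written.

```python
from datetime import datetime as dt
import pandas as pd

# Illustrative only: 'EXAMPLE' and the sample data are not part of this commit.
existing = pd.Series([1.0, 2.0], index=[dt(2012, 1, 1), dt(2012, 1, 2)])
library.write('EXAMPLE', existing)

# Overlaps the stored index (starts at the stored end), so a plain append now raises.
overlap = pd.Series([2.5, 3.0], index=[dt(2012, 1, 2), dt(2012, 1, 3)])
# library.append('EXAMPLE', overlap)  # raises UnorderedDataException

# With concat=True only the rows strictly after the stored end are appended.
library.append('EXAMPLE', overlap, concat=True)  # writes only the 2012-01-03 row
```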

arctic/store/_pandas_ndarray_store.py

+25 -5
@@ -1,16 +1,16 @@
 import ast
 import logging
 
+import numpy as np
 from bson.binary import Binary
 from pandas import DataFrame, Series, Panel
-import numpy as np
 
+from arctic.exceptions import UnorderedDataException
 from arctic.serialization.numpy_records import SeriesSerializer, DataFrameSerializer
+from ._ndarray_store import NdarrayStore
 from .._compression import compress, decompress
 from ..date._util import to_pandas_closed_closed
 from ..exceptions import ArcticException
-from ._ndarray_store import NdarrayStore
-
 
 log = logging.getLogger(__name__)
 
@@ -116,6 +116,24 @@ def get_info(self, version):
         ret['dtype'] = ast.literal_eval(version['dtype'])
         return ret
 
+    def read_segment_last_dt(self, version):
+        if 'segment_index' in version:
+            index = np.fromstring(decompress(version['segment_index']), dtype=INDEX_DTYPE)
+            dt_index = self._datetime64_index(index)
+            if dt_index:
+                return index[dt_index][-1]
+        return None
+
+    def slice_overlap_item_or_raise(self, item, previous_version, concat):
+        """If new item has overlap dt with previous version, keep only new bits if concat=True; raise if concat=False"""
+        prev_version_last_dt = self.read_segment_last_dt(previous_version)
+        if prev_version_last_dt and len(item) > 0 and item.index[0] <= prev_version_last_dt:
+            if concat:
+                item = item[item.index > prev_version_last_dt]
+            else:
+                raise UnorderedDataException(
+                    "new data {} before to symbol ending {}".format(item.index[0], prev_version_last_dt))
+        return item
 
 def _start_end(date_range, dts):
     """
@@ -152,7 +170,8 @@ def write(self, arctic_lib, version, symbol, item, previous_version):
         item, md = self.SERIALIZER.serialize(item)
         super(PandasSeriesStore, self).write(arctic_lib, version, symbol, item, previous_version, dtype=md)
 
-    def append(self, arctic_lib, version, symbol, item, previous_version, **kwargs):
+    def append(self, arctic_lib, version, symbol, item, previous_version, concat=False, **kwargs):
+        item = self.slice_overlap_item_or_raise(item, previous_version, concat)
         item, md = self.SERIALIZER.serialize(item)
         super(PandasSeriesStore, self).append(arctic_lib, version, symbol, item, previous_version, dtype=md, **kwargs)
 
@@ -176,7 +195,8 @@ def write(self, arctic_lib, version, symbol, item, previous_version):
         item, md = self.SERIALIZER.serialize(item)
         super(PandasDataFrameStore, self).write(arctic_lib, version, symbol, item, previous_version, dtype=md)
 
-    def append(self, arctic_lib, version, symbol, item, previous_version, **kwargs):
+    def append(self, arctic_lib, version, symbol, item, previous_version, concat=False, **kwargs):
+        item = self.slice_overlap_item_or_raise(item, previous_version, concat)
         item, md = self.SERIALIZER.serialize(item)
         super(PandasDataFrameStore, self).append(arctic_lib, version, symbol, item, previous_version, dtype=md, **kwargs)
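
The heart of the change is `slice_overlap_item_or_raise`, which compares the incoming item's first timestamp with the last timestamp recorded in the previous version's segment index. A minimal standalone sketch of that comparison, using plain pandas objects in place of the store internals (the function name, the `prev_last_dt` argument standing in for the segment-index lookup, and the local exception class are all illustrative), might look like this:

```python
import pandas as pd


class UnorderedDataException(Exception):
    """Stand-in for arctic.exceptions.UnorderedDataException."""


def slice_overlap_or_raise(item, prev_last_dt, concat=False):
    """Drop rows that overlap the stored data when concat=True, otherwise raise."""
    if prev_last_dt is not None and len(item) > 0 and item.index[0] <= prev_last_dt:
        if concat:
            # Keep only the rows strictly after the last stored timestamp.
            return item[item.index > prev_last_dt]
        raise UnorderedDataException(
            "new data {} starts at or before the symbol's last entry {}".format(
                item.index[0], prev_last_dt))
    return item


# Stored data is assumed to end at 2012-01-02; the incoming series overlaps by one row.
incoming = pd.Series([2.0, 3.0], index=pd.to_datetime(['2012-01-02', '2012-01-03']))
print(slice_overlap_or_raise(incoming, pd.Timestamp('2012-01-02'), concat=True))
# Only the 2012-01-03 row survives; with concat=False this call would raise.
```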

tests/integration/store/test_pandas_store.py

+46
@@ -344,6 +344,52 @@ def test_dataframe_append_should_add_new_columns_and_reorder(library):
     assert_frame_equal(expected, actual)
 
 
+def test_series_append_concat(library):
+    s1 = Series(data=[1.0], index=[dt(2012, 1, 1)])
+    s2 = Series([1.0, 2.0], [dt(2012, 1, 1), dt(2012, 1, 2)])
+    s2.index.name = 'index'
+    s2.name = 'values'
+    library.write('TEST_1', s1)
+    library.append('TEST_1', s2, concat=True)
+    result = library.read('TEST_1').data
+    assert_series_equal(s2, result)
+
+
+def test_series_append_concat_only_appends_end(library):
+    s1 = Series([1.0], [dt(2012, 1, 1)])
+    s2 = Series([2.0, 2.0], [dt(2012, 1, 1), dt(2012, 1, 2)])
+    library.write('TEST_1', s1)
+    library.append('TEST_1', s2, concat=True)
+
+    result = library.read('TEST_1').data
+    expected = Series([1.0, 2.0], [dt(2012, 1, 1), dt(2012, 1, 2)])
+    expected.index.name = 'index'
+    expected.name = 'values'
+    assert_series_equal(expected, result)
+
+
+def test_frame_append_concat(library):
+    df1 = DataFrame(data=[1.0], index=[dt(2012, 1, 1)], columns=['a'])
+    df2 = DataFrame([1.0, 2.0], [dt(2012, 1, 1), dt(2012, 1, 2)], columns=['a'])
+    df2.index.name = 'index'
+    library.write('TEST_1', df1)
+    library.append('TEST_1', df2, concat=True)
+    result = library.read('TEST_1').data
+    assert_frame_equal(df2, result)
+
+
+def test_frame_append_concat_only_appends_end(library):
+    df1 = DataFrame([1.0], [dt(2012, 1, 1)], columns=['a'])
+    df2 = DataFrame([2.0, 2.0], [dt(2012, 1, 1), dt(2012, 1, 2)], columns=['a'])
+    library.write('TEST_1', df1)
+    library.append('TEST_1', df2, concat=True)
+
+    result = library.read('TEST_1').data
+    expected = DataFrame([1.0, 2.0], [dt(2012, 1, 1), dt(2012, 1, 2)], columns=['a'])
+    expected.index.name = 'index'
+    assert_frame_equal(expected, result)
+
+
 # -- auto generated tests --- #
 def dataframe(columns, length, index):
     df = DataFrame(np.ones((length, columns)), columns=list(string.ascii_lowercase[:columns]))
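
The tests above only exercise the `concat=True` path. A hedged companion sketch for the default path (the test name is illustrative; it assumes the same `library` fixture plus `pytest`, and that `UnorderedDataException` is importable from `arctic.exceptions` as in the store change above):

```python
import pytest
from datetime import datetime as dt
from pandas import Series

from arctic.exceptions import UnorderedDataException


def test_series_append_overlap_raises(library):
    s1 = Series([1.0, 2.0], [dt(2012, 1, 1), dt(2012, 1, 2)])
    library.write('TEST_1', s1)

    # Starts at the stored end, so without concat=True the append is rejected.
    s2 = Series([2.5, 3.0], [dt(2012, 1, 2), dt(2012, 1, 3)])
    with pytest.raises(UnorderedDataException):
        library.append('TEST_1', s2)
```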

tests/integration/store/test_version_store.py

+5 -5
@@ -35,11 +35,11 @@
                     2012-11-08 17:06:11.040 |  3.0""")
 
 ts1_append = read_str_as_pandas("""      times | near
-                   2012-09-08 17:06:11.040 |  1.0
-                   2012-10-08 17:06:11.040 |  2.0
-                   2012-10-09 17:06:11.040 |  2.5
-                   2012-11-08 17:06:11.040 |  3.0
-                   2012-11-09 17:06:11.040 |  3.0""")
+                   2012-11-09 17:06:11.040 |  1.0
+                   2012-11-10 17:06:11.040 |  2.0
+                   2012-11-11 17:06:11.040 |  2.5
+                   2012-11-12 17:06:11.040 |  3.0
+                   2012-11-13 17:06:11.040 |  3.0""")
 
 
 symbol = 'TS1'
