Skip to content

Commit a3cacb0

Browse files
authored
Merge pull request pandas-dev#535 from manahl/issue-534
Check dtype before concatenating on write (issue pandas-dev#534)
2 parents d677ed1 + 12bfcab commit a3cacb0

File tree

5 files changed

+271
-0
lines changed

5 files changed

+271
-0
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
## Changelog
22

33
### 1.64
4+
* Bugfix: #534 VersionStore: overwriting a symbol with a different dtype (but the same data format) no longer
5+
raises an exception
46
* Bugfix: #531 arctic_prune_versions: clean broken snapshot references before pruning
57
* Feature: #490 add support to numpy 1.14
68

arctic/fixtures/arctic.py

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import base64
12
import getpass
23
import logging
34

5+
import bson
46
import pytest as pytest
57

68
from .. import arctic as m
@@ -33,6 +35,250 @@ def arctic_secondary(mongo_server, arctic):
3335
arctic = m.Arctic(mongo_host=mongo_server.api, allow_secondary=True)
3436
return arctic
3537

38+
@pytest.fixture(scope="function")
def multicolumn_store_with_uncompressed_write(mongo_server):
    """
    The database state created by this fixture is equivalent to the following operations using arctic 1.40
    or previous:

    arctic.initialize_library('arctic_test.TEST', m.VERSION_STORE, segment='month')
    library = arctic.get_library('arctic_test.TEST')
    df = pd.DataFrame([[1,2], [3,4]], index=['x','y'], columns=[['a','w'], ['a','v']])
    library.write('pandas', df)

    Differently from newer versions, the last write creates an uncompressed chunk.
    """
    mongo_server.api.drop_database('arctic_test')

    library_name = 'arctic_test.TEST'
    arctic = m.Arctic(mongo_host=mongo_server.api)
    arctic.initialize_library(library_name, m.VERSION_STORE, segment='month')

    db = mongo_server.api.arctic_test
    # Raw segment documents: segment 0 is compressed, segment 1 is the
    # uncompressed chunk left behind by the arctic<=1.40 write path.
    db.TEST.insert_many([
        {
            'parent': [bson.ObjectId('5ad0dc065c911d1188b512d8')],
            'data': bson.Binary(b'\x11\x00\x00\x002x\x01\x00\x01\x00\x80\x02\x00\x00\x00\x00\x00\x00\x00', 0),
            'symbol': 'pandas',
            'sha': bson.Binary(b'\xaa\\`\x0e\xc2D-\xc1_\xf7\xfd\x12\xfa\xd2\x17\x05`\x00\x98\xe2', 0),
            'compressed': True,
            '_id': bson.ObjectId('5ad0dc067934ecad404070be'),
            'segment': 0
        },
        {
            'parent': [bson.ObjectId('5ad0dc065c911d1188b512d8')],
            'data': bson.Binary(b'y\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00', 0),
            'symbol': 'pandas',
            'sha': bson.Binary(b'\xfe=WQ\xb5\xfdL\xb7\xcavd\x85o\x04]\x04\xdb\xa8]3', 0),
            'compressed': False,
            '_id': bson.ObjectId('5ad0dc077934ecad404070bf'),
            'segment': 1
        }
    ])
    db.TEST.ARCTIC.update_one({"_id": "ARCTIC_META"}, {"$set": {"_id": "ARCTIC_META", "TYPE": "VersionStore", "QUOTA": 10737418240}})
    # The same two version documents are written to 'changes' and, below, to
    # 'versions' (they are inserted with identical contents).
    db.TEST.changes.insert_many([
        {
            'append_count': 0,
            'dtype_metadata': {
                'index': ['index'],
                'columns': ["('a', 'a')", "('w', 'v')"]
            },
            'segment_count': 1,
            'dtype': '[(\'index\', \'S1\'), ("(\'a\', \'a\')", \'<i8\'), ("(\'w\', \'v\')", \'<i8\')]',
            'symbol': 'pandas',
            'up_to': 1,
            'metadata': None,
            'sha': bson.Binary(b'\xf2\x15h\x9d\x925\x95\xa5\x0e\x95J\xc4x\xfc\xfc\xd5\x80\xe0\x1d\xef', 0),
            'shape': [-1],
            'version': 1,
            'base_sha': bson.Binary(b'\xf2\x15h\x9d\x925\x95\xa5\x0e\x95J\xc4x\xfc\xfc\xd5\x80\xe0\x1d\xef', 0),
            '_id': bson.ObjectId('5ad0dc065c911d1188b512d8'),
            'type': 'pandasdf',
            'append_size': 0
        },
        {
            'append_count': 1,
            'dtype_metadata': {
                'index': ['index'],
                'columns': ["('a', 'a')", "('w', 'v')"]
            },
            'segment_count': 2,
            'sha': bson.Binary(b'1\x83[ZO\xec\x080D\x80f\xe4@\xe4\xd3\x94yG\xe2\x08', 0),
            'dtype': '[(\'index\', \'S1\'), ("(\'a\', \'a\')", \'<i8\'), ("(\'w\', \'v\')", \'<i8\')]',
            'symbol': 'pandas',
            'up_to': 2,
            'metadata': None,
            'base_version_id': bson.ObjectId('5ad0dc065c911d1188b512d8'),
            'shape': [-1],
            'version': 2,
            'base_sha': bson.Binary(b'\xf2\x15h\x9d\x925\x95\xa5\x0e\x95J\xc4x\xfc\xfc\xd5\x80\xe0\x1d\xef', 0),
            '_id': bson.ObjectId('5ad0dc075c911d1188b512d9'),
            'type': 'pandasdf',
            'append_size': 17
        }
    ])
    # Version counter for the symbol ('version_nums' is the collection the
    # store uses to allocate version numbers).
    db.TEST.version_nums.insert_one({'symbol': 'pandas', '_id': bson.ObjectId('5ad0dc067934ecad404070bd'), 'version': 2})
    db.TEST.versions.insert_many([
        {
            'append_count': 0,
            'dtype_metadata': {
                'index': ['index'],
                'columns': ["('a', 'a')", "('w', 'v')"]
            },
            'segment_count': 1,
            'dtype': '[(\'index\', \'S1\'), ("(\'a\', \'a\')", \'<i8\'), ("(\'w\', \'v\')", \'<i8\')]',
            'symbol': 'pandas',
            'up_to': 1,
            'metadata': None,
            'sha': bson.Binary(b'\xf2\x15h\x9d\x925\x95\xa5\x0e\x95J\xc4x\xfc\xfc\xd5\x80\xe0\x1d\xef', 0),
            'shape': [-1],
            'version': 1,
            'base_sha': bson.Binary(b'\xf2\x15h\x9d\x925\x95\xa5\x0e\x95J\xc4x\xfc\xfc\xd5\x80\xe0\x1d\xef', 0),
            '_id': bson.ObjectId('5ad0dc065c911d1188b512d8'),
            'type': 'pandasdf',
            'append_size': 0
        },
        {
            'append_count': 1,
            'dtype_metadata': {
                'index': ['index'],
                'columns': ["('a', 'a')", "('w', 'v')"]
            },
            'segment_count': 2,
            'sha': bson.Binary(b'1\x83[ZO\xec\x080D\x80f\xe4@\xe4\xd3\x94yG\xe2\x08', 0),
            'dtype': '[(\'index\', \'S1\'), ("(\'a\', \'a\')", \'<i8\'), ("(\'w\', \'v\')", \'<i8\')]',
            'symbol': 'pandas',
            'up_to': 2,
            'metadata': None,
            'base_version_id': bson.ObjectId('5ad0dc065c911d1188b512d8'),
            'shape': [-1],
            'version': 2,
            'base_sha': bson.Binary(b'\xf2\x15h\x9d\x925\x95\xa5\x0e\x95J\xc4x\xfc\xfc\xd5\x80\xe0\x1d\xef', 0),
            '_id': bson.ObjectId('5ad0dc075c911d1188b512d9'),
            'type': 'pandasdf',
            'append_size': 17
        }
    ])

    return {'symbol': 'pandas', 'store': arctic.get_library('arctic_test.TEST')}
164+
165+
166+
@pytest.fixture(scope="function")
def ndarray_store_with_uncompressed_write(mongo_server):
    """
    The database state created by this fixture is equivalent to the following operations using arctic 1.40
    or previous:

    arctic.initialize_library('arctic_test.TEST', m.VERSION_STORE, segment='month')
    library = arctic.get_library('arctic_test.TEST')
    arr = np.arange(2).astype([('abc', 'int64')])
    library.write('MYARR', arr[:1])
    library.write('MYARR', arr)

    Differently from newer versions, the last write creates an uncompressed chunk.
    """
    mongo_server.api.drop_database('arctic_test')

    library_name = 'arctic_test.TEST'
    arctic = m.Arctic(mongo_host=mongo_server.api)
    arctic.initialize_library(library_name, m.VERSION_STORE, segment='month')

    db = mongo_server.api.arctic_test
    # Raw segment documents: segment 0 is compressed, segment 1 is the
    # uncompressed chunk left behind by the arctic<=1.40 append path.
    db.TEST.insert_many([
        {
            "_id": bson.ObjectId("5ad0742ca0949de6727cf994"),
            "segment": 0,
            "sha": bson.Binary(base64.b64decode("Fk+quqPVSDfaajYJkOAvnDyXtGQ="), 0),
            "symbol": "MYARR",
            "data": bson.Binary(base64.b64decode("CAAAAIAAAAAAAAAAAA=="), 0),
            "compressed": True,
            "parent": [bson.ObjectId("5ad0742c5c911d4d80ee2ea3")]
        },
        {
            "_id": bson.ObjectId("5ad0742ca0949de6727cf995"),
            "sha": bson.Binary(base64.b64decode("eqpp8VOJBttTz0j5H+QGtOQ+r44="), 0),
            "symbol": "MYARR",
            "segment": 1,
            "data": bson.Binary(base64.b64decode("AQAAAAAAAAA="), 0),
            "compressed": False,
            "parent": [bson.ObjectId("5ad0742c5c911d4d80ee2ea3")]
        }
    ])
    db.TEST.ARCTIC.update_one({"_id": "ARCTIC_META"}, {"$set": {"_id": "ARCTIC_META", "TYPE": "VersionStore", "QUOTA": 10737418240}})
    # The same two version documents are written to 'changes' and, below, to
    # 'versions' (they are inserted with identical contents).
    db.TEST.changes.insert_many([
        {
            "_id": bson.ObjectId("5ad0742c5c911d4d80ee2ea3"),
            "append_count": 0,
            "dtype_metadata": {},
            "segment_count": 1,
            "dtype": "[('abc', '<i8')]",
            "symbol": "MYARR",
            "up_to": 1,
            "append_size": 0,
            "sha": bson.Binary(base64.b64decode("Bf5AV1MWbxJVWefJrFWGVPEHx+k="), 0),
            "shape": [-1],
            "version": 1,
            "base_sha": bson.Binary(base64.b64decode("Bf5AV1MWbxJVWefJrFWGVPEHx+k="), 0),
            "type": "ndarray",
            "metadata": None
        },
        {
            "_id": bson.ObjectId("5ad0742c5c911d4d80ee2ea4"),
            "append_count": 1,
            "dtype_metadata": {},
            "segment_count": 2,
            "base_version_id": bson.ObjectId("5ad0742c5c911d4d80ee2ea3"),
            "dtype": "[('abc', '<i8')]",
            "symbol": "MYARR",
            "up_to": 2,
            "append_size": 8,
            "sha": bson.Binary(base64.b64decode("Ax7oBxVFw1/9wKog2gfOLjbOVD8="), 0),
            "shape": [-1],
            "version": 2,
            "base_sha": bson.Binary(base64.b64decode("Bf5AV1MWbxJVWefJrFWGVPEHx+k="), 0),
            "type": "ndarray",
            "metadata": None
        }
    ])
    # BUGFIX: this previously inserted into 'versions_nums' (extra 's'), which
    # is not the collection used elsewhere -- the multicolumn fixture above
    # writes the version counter to 'version_nums'. Keep the two consistent so
    # the counter document is actually found by the store.
    db.TEST.version_nums.insert_one({"_id": bson.ObjectId("5ad0742ca0949de6727cf993"), "symbol": "MYARR", "version": 2})
    db.TEST.versions.insert_many([
        {
            "_id": bson.ObjectId("5ad0742c5c911d4d80ee2ea3"),
            "append_count": 0,
            "dtype_metadata": {},
            "segment_count": 1,
            "dtype": "[('abc', '<i8')]",
            "symbol": "MYARR",
            "up_to": 1,
            "append_size": 0,
            "sha": bson.Binary(base64.b64decode("Bf5AV1MWbxJVWefJrFWGVPEHx+k="), 0),
            "shape": [-1],
            "version": 1,
            "base_sha": bson.Binary(base64.b64decode("Bf5AV1MWbxJVWefJrFWGVPEHx+k="), 0),
            "type": "ndarray",
            "metadata": None
        },
        {
            "_id": bson.ObjectId("5ad0742c5c911d4d80ee2ea4"),
            "append_count": 1,
            "dtype_metadata": {},
            "segment_count": 2,
            "base_version_id": bson.ObjectId("5ad0742c5c911d4d80ee2ea3"),
            "dtype": "[('abc', '<i8')]",
            "symbol": "MYARR",
            "up_to": 2,
            "append_size": 8,
            "sha": bson.Binary(base64.b64decode("Ax7oBxVFw1/9wKog2gfOLjbOVD8="), 0),
            "shape": [-1],
            "version": 2,
            "base_sha": bson.Binary(base64.b64decode("Bf5AV1MWbxJVWefJrFWGVPEHx+k="), 0),
            "type": "ndarray",
            "metadata": None
        }
    ])

    return {'symbol': 'MYARR', 'store': arctic.get_library('arctic_test.TEST')}
281+
36282

37283
@pytest.fixture(scope="function")
38284
def library_name():

arctic/store/_ndarray_store.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,7 @@ def write(self, arctic_lib, version, symbol, item, previous_version, dtype=None)
456456

457457
if previous_version:
458458
if 'sha' in previous_version \
459+
and previous_version['dtype'] == version['dtype'] \
459460
and self.checksum(item[:previous_version['up_to']]) == previous_version['sha']:
460461
# The first n rows are identical to the previous version, so just append.
461462
# Do a 'dirty' append (i.e. concat & start from a new base version) for safety

tests/integration/store/test_ndarray_store.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,18 @@
1515
register_versioned_storage(NdarrayStore)
1616

1717

18+
def test_write_new_column_name_to_arctic_1_40_data(ndarray_store_with_uncompressed_write):
    """Overwriting legacy (arctic<=1.40) data with a renamed field must round-trip."""
    fixture = ndarray_store_with_uncompressed_write
    library, sym = fixture['store'], fixture['symbol']

    stored = library.read(sym).data
    # Same layout (single int64 field) but a different field name -> new dtype.
    replacement = np.array(list(stored) + [(2,)], dtype=[('fgh', '<i8')])

    library.write(sym, replacement)

    assert np.all(library.read(sym).data == replacement)
28+
29+
1830
def test_save_read_simple_ndarray(library):
1931
ndarr = np.ones(1000)
2032
library.write('MYARR', ndarr)

tests/integration/store/test_pandas_store.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,16 @@
2222
register_versioned_storage(PandasDataFrameStore)
2323

2424

25+
def test_write_multi_column_to_arctic_1_40_data(multicolumn_store_with_uncompressed_write):
    """Overwriting legacy (arctic<=1.40) multi-column data must round-trip."""
    fixture = multicolumn_store_with_uncompressed_write
    library = fixture['store']
    sym = fixture['symbol']

    # Three rows instead of the stored two, same MultiIndex column layout.
    frame = pd.DataFrame(
        [[1, 2], [3, 4], [5, 6]],
        index=['x', 'y', 'z'],
        columns=[[u'a', 'w'], ['a', 'v']],
    )
    library.write(sym, frame)

    assert np.all(library.read(sym).data == frame)
33+
34+
2535
def test_save_read_pandas_series(library):
2636
s = Series(data=[1, 2, 3], index=[4, 5, 6])
2737
library.write('pandas', s)

0 commit comments

Comments
 (0)