Skip to content

Commit 75c9c84

Browse files
authored
Merge pull request pandas-dev#287 from manahl/has_symbol
support for has_symbol and updates to list_symbols
2 parents c7402a3 + cca780f commit 75c9c84

File tree

4 files changed

+91
-51
lines changed

4 files changed

+91
-51
lines changed

CHANGES.md

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
### 1.34
44

55
* Feature: #283 Support for all pandas frequency strings in ChunkStore DateChunker
6+
* Feature: #286 Add has_symbol to ChunkStore and support for partial symbol matching in list_symbols
67

78
### 1.33 (2016-11-07)
89

arctic/chunkstore/chunkstore.py

+25-2
Original file line numberDiff line numberDiff line change
@@ -136,15 +136,23 @@ def delete(self, symbol, chunk_range=None):
136136
self._collection.delete_many(query)
137137
self._collection.symbols.delete_many(query)
138138

139-
def list_symbols(self):
139+
def list_symbols(self, partial_match=None):
140140
"""
141141
Returns all symbols in the library, optionally filtered by a partial (substring) match on the symbol name
142142
143+
Parameters
144+
----------
145+
partial_match: None or str
146+
if not None, return only the symbols whose name contains this string
147+
143148
Returns
144149
-------
145150
list of str
146151
"""
147-
return self._symbols.distinct(SYMBOL)
152+
symbols = self._symbols.distinct(SYMBOL)
153+
if partial_match is None:
154+
return symbols
155+
return [x for x in symbols if partial_match in x]
148156

149157
def _get_symbol_info(self, symbol):
150158
return self._symbols.find_one({SYMBOL: symbol})
@@ -574,3 +582,18 @@ def stats(self):
574582
'size': res['chunks']['size'] + res['symbols']['size'],
575583
}
576584
return res
585+
586+
def has_symbol(self, symbol):
587+
'''
588+
Check if symbol exists in collection
589+
590+
Parameters
591+
----------
592+
symbol: str
593+
The symbol to look up in the collection
594+
595+
Returns
596+
-------
597+
bool
598+
'''
599+
return self._get_symbol_info(symbol) is not None

arctic/store/version_store.py

+47-49
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from ..date import mktz, datetime_to_ms, ms_to_datetime
1212
from ..decorators import mongo_retry
1313
from ..exceptions import NoDataFoundException, DuplicateSnapshotException, \
14-
OptimisticLockException, ArcticException
14+
OptimisticLockException
1515
from ..hooks import log_exception
1616
from ._pickle_store import PickleStore
1717
from ._version_store_utils import cleanup
@@ -105,7 +105,7 @@ def __str__(self):
105105

106106
def __repr__(self):
107107
return str(self)
108-
108+
109109
def _read_preference(self, allow_secondary):
110110
""" Return the mongo read preference given an 'allow_secondary' argument
111111
"""
@@ -136,7 +136,7 @@ def list_symbols(self, all_symbols=False, snapshot=None, regex=None, **kwargs):
136136
"""
137137
query = {}
138138
if regex is not None:
139-
query ['symbol'] = {'$regex' : regex}
139+
query['symbol'] = {'$regex': regex}
140140
if kwargs:
141141
for k, v in six.iteritems(kwargs):
142142
query['metadata.' + k] = v
@@ -154,19 +154,19 @@ def list_symbols(self, all_symbols=False, snapshot=None, regex=None, **kwargs):
154154
# Match based on user criteria first
155155
pipeline.append({'$match': query})
156156
pipeline.extend([
157-
# Id is by insert time which matches version order
158-
{'$sort': {'_id':-1}},
159-
# Group by 'symbol'
160-
{'$group': {'_id': '$symbol',
161-
'deleted': {'$first': '$metadata.deleted'},
162-
},
163-
},
164-
# Don't include symbols which are part of some snapshot, but really deleted...
165-
{'$match': {'deleted': {'$ne': True}}},
166-
{'$project': {'_id': 0,
167-
'symbol': '$_id',
168-
}
169-
}])
157+
# Id is by insert time which matches version order
158+
{'$sort': {'_id':-1}},
159+
# Group by 'symbol'
160+
{'$group': {'_id': '$symbol',
161+
'deleted': {'$first': '$metadata.deleted'},
162+
},
163+
},
164+
# Don't include symbols which are part of some snapshot, but really deleted...
165+
{'$match': {'deleted': {'$ne': True}}},
166+
{'$project': {'_id': 0,
167+
'symbol': '$_id',
168+
}
169+
}])
170170

171171
results = self._versions.aggregate(pipeline)
172172
return sorted([x['symbol'] for x in results])
@@ -371,8 +371,6 @@ def get_info(self, symbol, as_of=None):
371371
return handler.get_info(version)
372372
return {}
373373

374-
375-
376374
def _do_read(self, symbol, version, from_version=None, **kwargs):
377375
if version.get('deleted'):
378376
raise NoDataFoundException("No data found for %s in library %s" % (symbol, self._arctic_lib.get_name()))
@@ -429,8 +427,8 @@ def _read_metadata(self, symbol, as_of=None, read_preference=None):
429427
if not as_of.tzinfo:
430428
as_of = as_of.replace(tzinfo=mktz())
431429
_version = versions_coll.find_one({'symbol': symbol,
432-
'_id': {'$lt': bson.ObjectId.from_datetime(as_of + timedelta(seconds=1))}},
433-
sort=[('_id', pymongo.DESCENDING)])
430+
'_id': {'$lt': bson.ObjectId.from_datetime(as_of + timedelta(seconds=1))}},
431+
sort=[('_id', pymongo.DESCENDING)])
434432
else:
435433
# Backward compatibility - as of is a version number
436434
_version = versions_coll.find_one({'symbol': symbol, 'version': as_of})
@@ -484,7 +482,7 @@ def append(self, symbol, data, metadata=None, prune_previous_version=True, upser
484482
# If the version numbers aren't in line, then we've lost some data.
485483
next_ver = self._version_nums.find_one({'symbol': symbol})['version']
486484
if next_ver != previous_version['version']:
487-
logger.error('''version_nums is out of sync with previous version document.
485+
logger.error('''version_nums is out of sync with previous version document.
488486
This probably means that either a version document write has previously failed, or the previous version has been deleted.
489487
There will be a gap in the data.''')
490488

@@ -508,8 +506,8 @@ def append(self, symbol, data, metadata=None, prune_previous_version=True, upser
508506

509507
# Get the next version number - check there hasn't been a concurrent write
510508
next_ver = self._version_nums.find_one_and_update({'symbol': symbol, 'version': next_ver},
511-
{'$inc': {'version': 1}},
512-
upsert=False, new=True)
509+
{'$inc': {'version': 1}},
510+
upsert=False, new=True)
513511
if next_ver is None:
514512
raise OptimisticLockException()
515513

@@ -549,7 +547,7 @@ def write(self, symbol, data, metadata=None, prune_previous_version=True, **kwar
549547
Default: True
550548
kwargs :
551549
passed through to the write handler
552-
550+
553551
Returns
554552
-------
555553
VersionedItem named tuple containing the metadata and version number
@@ -559,13 +557,13 @@ def write(self, symbol, data, metadata=None, prune_previous_version=True, **kwar
559557
version = {'_id': bson.ObjectId()}
560558
version['symbol'] = symbol
561559
version['version'] = self._version_nums.find_one_and_update({'symbol': symbol},
562-
{'$inc': {'version': 1}},
563-
upsert=True, new=True)['version']
560+
{'$inc': {'version': 1}},
561+
upsert=True, new=True)['version']
564562
version['metadata'] = metadata
565563

566564
previous_version = self._versions.find_one({'symbol': symbol, 'version': {'$lt': version['version']}},
567-
sort=[('version', pymongo.DESCENDING)],
568-
)
565+
sort=[('version', pymongo.DESCENDING)],
566+
)
569567

570568
handler = self._write_handler(version, symbol, data, **kwargs)
571569
mongo_retry(handler.write)(self._arctic_lib, version, symbol, data, previous_version, **kwargs)
@@ -590,34 +588,34 @@ def _prune_previous_versions(self, symbol, keep_mins=120):
590588
# Find all non-snapshotted versions older than a version that's at least keep_mins minutes old
591589
# Based on documents available on the secondary
592590
versions_find = mongo_retry(self._versions.with_options(read_preference=ReadPreference.SECONDARY_PREFERRED if keep_mins > 0 else
593-
ReadPreference.PRIMARY)
591+
ReadPreference.PRIMARY)
594592
.find)
595593
versions = list(versions_find({ # Find versions of this symbol
596-
'symbol': symbol,
597-
# Not snapshotted
598-
'$or': [{'parent': {'$exists': False}}, {'parent': {'$size': 0}}],
599-
# At least 'keep_mins' old
600-
'_id': {'$lt': bson.ObjectId.from_datetime(
594+
'symbol': symbol,
595+
# Not snapshotted
596+
'$or': [{'parent': {'$exists': False}}, {'parent': {'$size': 0}}],
597+
# At least 'keep_mins' old
598+
'_id': {'$lt': bson.ObjectId.from_datetime(
601599
dt.utcnow()
602-
# Add one second as the ObjectId str has random fuzz
600+
# Add one second as the ObjectId str has random fuzz
603601
+ timedelta(seconds=1)
604602
- timedelta(minutes=keep_mins))
605-
}
606-
},
607-
# Using version number here instead of _id as there's a very unlikely case
608-
# where the versions are created on different hosts or processes at exactly
609-
# the same time.
610-
sort=[('version', pymongo.DESCENDING)],
611-
# Keep one, that's at least 10 mins old, around
612-
# (cope with replication delay)
613-
skip=1,
614-
projection=['_id', 'type'],
615-
))
603+
}
604+
},
605+
# Using version number here instead of _id as there's a very unlikely case
606+
# where the versions are created on different hosts or processes at exactly
607+
# the same time.
608+
sort=[('version', pymongo.DESCENDING)],
609+
# Keep one, that's at least 10 mins old, around
610+
# (cope with replication delay)
611+
skip=1,
612+
projection=['_id', 'type'],
613+
))
616614
if not versions:
617615
return
618616
version_ids = [v['_id'] for v in versions]
619617

620-
#Find any version_ids that are the basis of other, 'current' versions - don't prune these.
618+
# Find any version_ids that are the basis of other, 'current' versions - don't prune these.
621619
base_versions = set([x['base_version_id'] for x in mongo_retry(self._versions.find)({
622620
'symbol': symbol,
623621
'_id': {'$nin': version_ids},
@@ -728,8 +726,8 @@ def snapshot(self, snap_name, metadata=None, skip_symbols=None, versions=None):
728726
snapshot = {'_id': bson.ObjectId()}
729727
snapshot['name'] = snap_name
730728
snapshot['metadata'] = metadata
731-
732-
skip_symbols = set() if skip_symbols is None else set(skip_symbols)
729+
730+
skip_symbols = set() if skip_symbols is None else set(skip_symbols)
733731

734732
if versions is None:
735733
versions = {sym: None for sym in set(self.list_symbols()) - skip_symbols}

tests/integration/chunkstore/test_chunkstore.py

+18
Original file line numberDiff line numberDiff line change
@@ -1187,3 +1187,21 @@ def test_quarterly_data(chunkstore_lib):
11871187
count += 1
11881188

11891189
assert(count == 4)
1190+
1191+
1192+
def test_list_symbols(chunkstore_lib):
1193+
df = DataFrame(data={'data': np.random.randint(0, 100, size=366)},
1194+
index=pd.date_range('2016-01-01', '2016-12-31'))
1195+
df.index.name = 'date'
1196+
1197+
chunkstore_lib.write('rabbit', df)
1198+
chunkstore_lib.write('dragon', df)
1199+
chunkstore_lib.write('snake', df)
1200+
chunkstore_lib.write('wolf', df)
1201+
chunkstore_lib.write('bear', df)
1202+
1203+
assert('dragon' in chunkstore_lib.list_symbols())
1204+
assert(set(['rabbit', 'dragon', 'bear']) == set(chunkstore_lib.list_symbols(partial_match='r')))
1205+
1206+
assert(chunkstore_lib.has_symbol('dragon'))
1207+
assert(chunkstore_lib.has_symbol('marmot') is False)

0 commit comments

Comments
 (0)