Skip to content

Commit 39faa23

Browse files
committed
BUG: Selection from a HDFStore with a fixed format and start and/or stop specified now returns the selected range instead of ignoring start/stop
closes pandas-dev#8287
1 parent 8662cb9 commit 39faa23

File tree

3 files changed

+132
-44
lines changed

3 files changed

+132
-44
lines changed

doc/source/whatsnew/v0.18.2.txt

+2
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ Other enhancements
7979
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
8080
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
8181

82+
8283
.. _whatsnew_0182.api:
8384

8485
API changes
@@ -207,6 +208,7 @@ Bug Fixes
207208
- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`)
208209
- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
209210
- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`)
211+
- Bug in selection from a ``HDFStore`` with a fixed format where ``start`` and/or ``stop`` were ignored; the selected range is now returned (:issue:`8287`)
210212

211213

212214
- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`)

pandas/io/pytables.py

+79-41
Original file line numberDiff line numberDiff line change
@@ -1314,12 +1314,20 @@ def __init__(self, store, s, func, where, nrows, start=None, stop=None,
13141314
self.s = s
13151315
self.func = func
13161316
self.where = where
1317-
self.nrows = nrows or 0
1318-
self.start = start or 0
13191317

1320-
if stop is None:
1321-
stop = self.nrows
1322-
self.stop = min(self.nrows, stop)
1318+
# set start/stop if they are not set if we are a table
1319+
if self.s.is_table:
1320+
if nrows is None:
1321+
nrows = 0
1322+
if start is None:
1323+
start = 0
1324+
if stop is None:
1325+
stop = nrows
1326+
stop = min(nrows, stop)
1327+
1328+
self.nrows = nrows
1329+
self.start = start
1330+
self.stop = stop
13231331

13241332
self.coordinates = None
13251333
if iterator or chunksize is not None:
@@ -2303,14 +2311,23 @@ def f(values, freq=None, tz=None):
23032311
return klass
23042312

23052313
def validate_read(self, kwargs):
2306-
if kwargs.get('columns') is not None:
2314+
"""
2315+
remove table keywords from kwargs and return
2316+
raise if any keywords are passed which are not-None
2317+
"""
2318+
kwargs = copy.copy(kwargs)
2319+
2320+
columns = kwargs.pop('columns', None)
2321+
if columns is not None:
23072322
raise TypeError("cannot pass a column specification when reading "
23082323
"a Fixed format store. this store must be "
23092324
"selected in its entirety")
2310-
if kwargs.get('where') is not None:
2325+
where = kwargs.pop('where', None)
2326+
if where is not None:
23112327
raise TypeError("cannot pass a where specification when reading "
23122328
"from a Fixed format store. this store must be "
23132329
"selected in its entirety")
2330+
return kwargs
23142331

23152332
@property
23162333
def is_exists(self):
@@ -2329,11 +2346,11 @@ def get_attrs(self):
23292346
def write(self, obj, **kwargs):
23302347
self.set_attrs()
23312348

2332-
def read_array(self, key):
2349+
def read_array(self, key, start=None, stop=None):
23332350
""" read an array for the specified node (off of group """
23342351
import tables
23352352
node = getattr(self.group, key)
2336-
data = node[:]
2353+
data = node[start:stop]
23372354
attrs = node._v_attrs
23382355

23392356
transposed = getattr(attrs, 'transposed', False)
@@ -2363,17 +2380,17 @@ def read_array(self, key):
23632380
else:
23642381
return ret
23652382

2366-
def read_index(self, key):
2383+
def read_index(self, key, **kwargs):
23672384
variety = _ensure_decoded(getattr(self.attrs, '%s_variety' % key))
23682385

23692386
if variety == u('multi'):
2370-
return self.read_multi_index(key)
2387+
return self.read_multi_index(key, **kwargs)
23712388
elif variety == u('block'):
2372-
return self.read_block_index(key)
2389+
return self.read_block_index(key, **kwargs)
23732390
elif variety == u('sparseint'):
2374-
return self.read_sparse_intindex(key)
2391+
return self.read_sparse_intindex(key, **kwargs)
23752392
elif variety == u('regular'):
2376-
_, index = self.read_index_node(getattr(self.group, key))
2393+
_, index = self.read_index_node(getattr(self.group, key), **kwargs)
23772394
return index
23782395
else: # pragma: no cover
23792396
raise TypeError('unrecognized index variety: %s' % variety)
@@ -2411,19 +2428,19 @@ def write_block_index(self, key, index):
24112428
self.write_array('%s_blengths' % key, index.blengths)
24122429
setattr(self.attrs, '%s_length' % key, index.length)
24132430

2414-
def read_block_index(self, key):
2431+
def read_block_index(self, key, **kwargs):
24152432
length = getattr(self.attrs, '%s_length' % key)
2416-
blocs = self.read_array('%s_blocs' % key)
2417-
blengths = self.read_array('%s_blengths' % key)
2433+
blocs = self.read_array('%s_blocs' % key, **kwargs)
2434+
blengths = self.read_array('%s_blengths' % key, **kwargs)
24182435
return BlockIndex(length, blocs, blengths)
24192436

24202437
def write_sparse_intindex(self, key, index):
24212438
self.write_array('%s_indices' % key, index.indices)
24222439
setattr(self.attrs, '%s_length' % key, index.length)
24232440

2424-
def read_sparse_intindex(self, key):
2441+
def read_sparse_intindex(self, key, **kwargs):
24252442
length = getattr(self.attrs, '%s_length' % key)
2426-
indices = self.read_array('%s_indices' % key)
2443+
indices = self.read_array('%s_indices' % key, **kwargs)
24272444
return IntIndex(length, indices)
24282445

24292446
def write_multi_index(self, key, index):
@@ -2448,27 +2465,28 @@ def write_multi_index(self, key, index):
24482465
label_key = '%s_label%d' % (key, i)
24492466
self.write_array(label_key, lab)
24502467

2451-
def read_multi_index(self, key):
2468+
def read_multi_index(self, key, **kwargs):
24522469
nlevels = getattr(self.attrs, '%s_nlevels' % key)
24532470

24542471
levels = []
24552472
labels = []
24562473
names = []
24572474
for i in range(nlevels):
24582475
level_key = '%s_level%d' % (key, i)
2459-
name, lev = self.read_index_node(getattr(self.group, level_key))
2476+
name, lev = self.read_index_node(getattr(self.group, level_key),
2477+
**kwargs)
24602478
levels.append(lev)
24612479
names.append(name)
24622480

24632481
label_key = '%s_label%d' % (key, i)
2464-
lab = self.read_array(label_key)
2482+
lab = self.read_array(label_key, **kwargs)
24652483
labels.append(lab)
24662484

24672485
return MultiIndex(levels=levels, labels=labels, names=names,
24682486
verify_integrity=True)
24692487

2470-
def read_index_node(self, node):
2471-
data = node[:]
2488+
def read_index_node(self, node, start=None, stop=None):
2489+
data = node[start:stop]
24722490
# If the index was an empty array write_array_empty() will
24732491
# have written a sentinel. Here we replace it with the original.
24742492
if ('shape' in node._v_attrs and
@@ -2607,17 +2625,17 @@ def write_array(self, key, value, items=None):
26072625

26082626
class LegacyFixed(GenericFixed):
26092627

2610-
def read_index_legacy(self, key):
2628+
def read_index_legacy(self, key, start=None, stop=None):
26112629
node = getattr(self.group, key)
2612-
data = node[:]
2630+
data = node[start:stop]
26132631
kind = node._v_attrs.kind
26142632
return _unconvert_index_legacy(data, kind, encoding=self.encoding)
26152633

26162634

26172635
class LegacySeriesFixed(LegacyFixed):
26182636

26192637
def read(self, **kwargs):
2620-
self.validate_read(kwargs)
2638+
kwargs = self.validate_read(kwargs)
26212639
index = self.read_index_legacy('index')
26222640
values = self.read_array('values')
26232641
return Series(values, index=index)
@@ -2626,7 +2644,7 @@ def read(self, **kwargs):
26262644
class LegacyFrameFixed(LegacyFixed):
26272645

26282646
def read(self, **kwargs):
2629-
self.validate_read(kwargs)
2647+
kwargs = self.validate_read(kwargs)
26302648
index = self.read_index_legacy('index')
26312649
columns = self.read_index_legacy('columns')
26322650
values = self.read_array('values')
@@ -2645,9 +2663,9 @@ def shape(self):
26452663
return None
26462664

26472665
def read(self, **kwargs):
2648-
self.validate_read(kwargs)
2649-
index = self.read_index('index')
2650-
values = self.read_array('values')
2666+
kwargs = self.validate_read(kwargs)
2667+
index = self.read_index('index', **kwargs)
2668+
values = self.read_array('values', **kwargs)
26512669
return Series(values, index=index, name=self.name)
26522670

26532671
def write(self, obj, **kwargs):
@@ -2657,12 +2675,25 @@ def write(self, obj, **kwargs):
26572675
self.attrs.name = obj.name
26582676

26592677

2660-
class SparseSeriesFixed(GenericFixed):
2678+
class SparseFixed(GenericFixed):
2679+
2680+
def validate_read(self, kwargs):
2681+
"""
2682+
we don't support start, stop kwds in Sparse
2683+
"""
2684+
kwargs = super(SparseFixed, self).validate_read(kwargs)
2685+
if 'start' in kwargs or 'stop' in kwargs:
2686+
raise NotImplementedError("start and/or stop are not supported "
2687+
"in fixed Sparse reading")
2688+
return kwargs
2689+
2690+
2691+
class SparseSeriesFixed(SparseFixed):
26612692
pandas_kind = u('sparse_series')
26622693
attributes = ['name', 'fill_value', 'kind']
26632694

26642695
def read(self, **kwargs):
2665-
self.validate_read(kwargs)
2696+
kwargs = self.validate_read(kwargs)
26662697
index = self.read_index('index')
26672698
sp_values = self.read_array('sp_values')
26682699
sp_index = self.read_index('sp_index')
@@ -2681,12 +2712,12 @@ def write(self, obj, **kwargs):
26812712
self.attrs.kind = obj.kind
26822713

26832714

2684-
class SparseFrameFixed(GenericFixed):
2715+
class SparseFrameFixed(SparseFixed):
26852716
pandas_kind = u('sparse_frame')
26862717
attributes = ['default_kind', 'default_fill_value']
26872718

26882719
def read(self, **kwargs):
2689-
self.validate_read(kwargs)
2720+
kwargs = self.validate_read(kwargs)
26902721
columns = self.read_index('columns')
26912722
sdict = {}
26922723
for c in columns:
@@ -2714,12 +2745,12 @@ def write(self, obj, **kwargs):
27142745
self.write_index('columns', obj.columns)
27152746

27162747

2717-
class SparsePanelFixed(GenericFixed):
2748+
class SparsePanelFixed(SparseFixed):
27182749
pandas_kind = u('sparse_panel')
27192750
attributes = ['default_kind', 'default_fill_value']
27202751

27212752
def read(self, **kwargs):
2722-
self.validate_read(kwargs)
2753+
kwargs = self.validate_read(kwargs)
27232754
items = self.read_index('items')
27242755

27252756
sdict = {}
@@ -2782,19 +2813,26 @@ def shape(self):
27822813
except:
27832814
return None
27842815

2785-
def read(self, **kwargs):
2786-
self.validate_read(kwargs)
2816+
def read(self, start=None, stop=None, **kwargs):
2817+
# start, stop applied to rows, so 0th axis only
2818+
2819+
kwargs = self.validate_read(kwargs)
2820+
select_axis = self.obj_type()._get_block_manager_axis(0)
27872821

27882822
axes = []
27892823
for i in range(self.ndim):
2790-
ax = self.read_index('axis%d' % i)
2824+
2825+
_start, _stop = (start, stop) if i == select_axis else (None, None)
2826+
ax = self.read_index('axis%d' % i, start=_start, stop=_stop)
27912827
axes.append(ax)
27922828

27932829
items = axes[0]
27942830
blocks = []
27952831
for i in range(self.nblocks):
2832+
27962833
blk_items = self.read_index('block%d_items' % i)
2797-
values = self.read_array('block%d_values' % i)
2834+
values = self.read_array('block%d_values' % i,
2835+
start=_start, stop=_stop)
27982836
blk = make_block(values,
27992837
placement=items.get_indexer(blk_items))
28002838
blocks.append(blk)

pandas/io/tests/test_pytables.py

+51-3
Original file line numberDiff line numberDiff line change
@@ -4128,10 +4128,11 @@ def test_nan_selection_bug_4858(self):
41284128
result = store.select('df', where='values>2.0')
41294129
assert_frame_equal(result, expected)
41304130

4131-
def test_start_stop(self):
4131+
def test_start_stop_table(self):
41324132

41334133
with ensure_clean_store(self.path) as store:
41344134

4135+
# table
41354136
df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
41364137
store.append('df', df)
41374138

@@ -4143,8 +4144,55 @@ def test_start_stop(self):
41434144
# out of range
41444145
result = store.select(
41454146
'df', [Term("columns=['A']")], start=30, stop=40)
4146-
assert(len(result) == 0)
4147-
assert(type(result) == DataFrame)
4147+
self.assertTrue(len(result) == 0)
4148+
expected = df.ix[30:40, ['A']]
4149+
tm.assert_frame_equal(result, expected)
4150+
4151+
def test_start_stop_fixed(self):
4152+
4153+
with ensure_clean_store(self.path) as store:
4154+
4155+
# fixed, GH 8287
4156+
df = DataFrame(dict(A=np.random.rand(20),
4157+
B=np.random.rand(20)),
4158+
index=pd.date_range('20130101', periods=20))
4159+
store.put('df', df)
4160+
4161+
result = store.select(
4162+
'df', start=0, stop=5)
4163+
expected = df.iloc[0:5, :]
4164+
tm.assert_frame_equal(result, expected)
4165+
4166+
result = store.select(
4167+
'df', start=5, stop=10)
4168+
expected = df.iloc[5:10, :]
4169+
tm.assert_frame_equal(result, expected)
4170+
4171+
# out of range
4172+
result = store.select(
4173+
'df', start=30, stop=40)
4174+
expected = df.iloc[30:40, :]
4175+
tm.assert_frame_equal(result, expected)
4176+
4177+
# series
4178+
s = df.A
4179+
store.put('s', s)
4180+
result = store.select('s', start=0, stop=5)
4181+
expected = s.iloc[0:5]
4182+
tm.assert_series_equal(result, expected)
4183+
4184+
result = store.select('s', start=5, stop=10)
4185+
expected = s.iloc[5:10]
4186+
tm.assert_series_equal(result, expected)
4187+
4188+
# sparse; not implemented
4189+
df = tm.makeDataFrame()
4190+
df.ix[3:5, 1:3] = np.nan
4191+
df.ix[8:10, -2] = np.nan
4192+
dfs = df.to_sparse()
4193+
store.put('dfs', dfs)
4194+
with self.assertRaises(NotImplementedError):
4195+
store.select('dfs', start=0, stop=5)
41484196

41494197
def test_select_filter_corner(self):
41504198

0 commit comments

Comments
 (0)