Skip to content

Commit a6f27ff

Browse files
jreback, Brendan Boerner
authored and
Brendan Boerner
committed
BUG: fix HDFStore iterator to handle a where properly (GH8014)
1 parent 126580d commit a6f27ff

File tree

3 files changed

+100
-19
lines changed

3 files changed

+100
-19
lines changed

doc/source/v0.15.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,7 @@ Bug Fixes
454454
- Bug in pickle deserialization that failed for pre-0.14.1 containers with dup items trying to avoid ambiguity
455455
when matching block and manager items, when there's only one block there's no ambiguity (:issue:`7794`)
456456

457-
457+
- Bug in HDFStore iteration when passing a ``where`` clause (:issue:`8014`)
458458

459459
- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)
460460

pandas/io/pytables.py

+33-18
Original file line numberDiff line numberDiff line change
@@ -662,15 +662,22 @@ def select(self, key, where=None, start=None, stop=None, columns=None,
662662
s = self._create_storer(group)
663663
s.infer_axes()
664664

665-
# what we are actually going to do for a chunk
666665
def func(_start, _stop):
667-
return s.read(where=where, start=_start, stop=_stop,
666+
return s.read(start=_start, stop=_stop,
667+
where=where,
668668
columns=columns, **kwargs)
669669

670670
if iterator or chunksize is not None:
671671
if not s.is_table:
672672
raise TypeError(
673673
"can only use an iterator or chunksize on a table")
674+
675+
# read the coordinates & iterate
676+
if where is not None:
677+
c = s.read_coordinates(where=where, **kwargs)
678+
def func(_start, _stop):
679+
return s.read(where=c[_start:_stop], columns=columns, **kwargs)
680+
674681
return TableIterator(self, func, nrows=s.nrows, start=start,
675682
stop=stop, chunksize=chunksize,
676683
auto_close=auto_close)
@@ -779,18 +786,26 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
779786
# axis is the concatenation axis
780787
axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
781788

782-
def func(_start, _stop):
783-
if where is not None:
784-
c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs)
785-
else:
786-
c = None
789+
# for a not-none where, select the coordinates and chunk on those
790+
if where is not None:
791+
c = s.read_coordinates(where=where, **kwargs)
787792

788-
objs = [t.read(where=c, start=_start, stop=_stop,
789-
columns=columns, **kwargs) for t in tbls]
793+
def func(_start, _stop):
794+
objs = [t.read(where=c[_start:_stop], columns=columns, **kwargs) for t in tbls]
790795

791-
# concat and return
792-
return concat(objs, axis=axis,
793-
verify_integrity=False).consolidate()
796+
# concat and return
797+
return concat(objs, axis=axis,
798+
verify_integrity=False).consolidate()
799+
800+
else:
801+
802+
def func(_start, _stop):
803+
objs = [t.read(start=_start, stop=_stop,
804+
columns=columns, **kwargs) for t in tbls]
805+
806+
# concat and return
807+
return concat(objs, axis=axis,
808+
verify_integrity=False).consolidate()
794809

795810
if iterator or chunksize is not None:
796811
return TableIterator(self, func, nrows=nrows, start=start,
@@ -1317,20 +1332,20 @@ def __init__(self, store, func, nrows, start=None, stop=None,
13171332
if chunksize is None:
13181333
chunksize = 100000
13191334

1320-
self.chunksize = chunksize
1335+
self.chunksize = int(chunksize)
13211336
self.auto_close = auto_close
13221337

13231338
def __iter__(self):
13241339
current = self.start
13251340
while current < self.stop:
1326-
stop = current + self.chunksize
1327-
v = self.func(current, stop)
1328-
current = stop
13291341

1330-
if v is None:
1342+
stop = min(current + self.chunksize, self.stop)
1343+
value = self.func(current, stop)
1344+
if value is None:
13311345
continue
1346+
current = current + min(self.chunksize,len(value))
13321347

1333-
yield v
1348+
yield value
13341349

13351350
self.close()
13361351

pandas/io/tests/test_pytables.py

+66
Original file line numberDiff line numberDiff line change
@@ -3376,6 +3376,72 @@ def test_select_iterator(self):
33763376
#result = concat(results)
33773377
#tm.assert_frame_equal(expected, result)
33783378

3379+
def test_select_iterator_8014(self):
3380+
3381+
# single table
3382+
with ensure_clean_store(self.path) as store:
3383+
3384+
chunksize=1e4
3385+
expected = tm.makeTimeDataFrame(100064, 'S')
3386+
_maybe_remove(store, 'df')
3387+
store.append('df',expected)
3388+
3389+
beg_dt = expected.index[0]
3390+
end_dt = expected.index[-1]
3391+
3392+
#
3393+
# w/o iterator
3394+
#
3395+
3396+
# select w/o iteration and no where clause works
3397+
result = store.select('df')
3398+
tm.assert_frame_equal(expected, result)
3399+
3400+
# select w/o iterator and where clause, single term, begin
3401+
# of range, works
3402+
where = "index >= '%s'" % beg_dt
3403+
result = store.select('df',where=where)
3404+
tm.assert_frame_equal(expected, result)
3405+
3406+
# select w/o iterator and where clause, single term, end
3407+
# of range, works
3408+
where = "index <= '%s'" % end_dt
3409+
result = store.select('df',where=where)
3410+
tm.assert_frame_equal(expected, result)
3411+
3412+
# select w/o iterator and where clause, inclusive range,
3413+
# works
3414+
where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
3415+
result = store.select('df',where=where)
3416+
tm.assert_frame_equal(expected, result)
3417+
3418+
#
3419+
# with iterator
3420+
#
3421+
3422+
# select w/iterator and no where clause works
3423+
results = [ s for s in store.select('df',chunksize=chunksize) ]
3424+
result = concat(results)
3425+
tm.assert_frame_equal(expected, result)
3426+
3427+
# select w/iterator and where clause, single term, begin of range
3428+
where = "index >= '%s'" % beg_dt
3429+
results = [ s for s in store.select('df',where=where,chunksize=chunksize) ]
3430+
result = concat(results)
3431+
tm.assert_frame_equal(expected, result)
3432+
3433+
# select w/iterator and where clause, single term, end of range
3434+
where = "index <= '%s'" % end_dt
3435+
results = [ s for s in store.select('df',where=where,chunksize=chunksize) ]
3436+
result = concat(results)
3437+
tm.assert_frame_equal(expected, result)
3438+
3439+
# select w/iterator and where clause, inclusive range
3440+
where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
3441+
results = [ s for s in store.select('df',where=where,chunksize=chunksize) ]
3442+
result = concat(results)
3443+
tm.assert_frame_equal(expected, result)
3444+
33793445
def test_retain_index_attributes(self):
33803446

33813447
# GH 3499, losing frequency info on index recreation

0 commit comments

Comments
 (0)