Skip to content

Commit 0ecb4cb

Browse files
committed
Merge pull request pandas-dev#8029 from jreback/hdf_iterator
BUG: fix HDFStore iterator to handle a where properly (GH8014)
2 parents 045577b + 8c44e7f commit 0ecb4cb

File tree

3 files changed

+250
-59
lines changed

3 files changed

+250
-59
lines changed

doc/source/v0.15.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,7 @@ Bug Fixes
464464
- Bug in pickle deserialization that failed for pre-0.14.1 containers with dup items trying to avoid ambiguity
465465
when matching block and manager items, when there's only one block there's no ambiguity (:issue:`7794`)
466466

467-
467+
- Bug in HDFStore iteration when passing a where (:issue:`8014`)
468468

469469
- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)
470470

pandas/io/pytables.py

+62-41
Original file line numberDiff line numberDiff line change
@@ -662,21 +662,18 @@ def select(self, key, where=None, start=None, stop=None, columns=None,
662662
s = self._create_storer(group)
663663
s.infer_axes()
664664

665-
# what we are actually going to do for a chunk
666-
def func(_start, _stop):
667-
return s.read(where=where, start=_start, stop=_stop,
665+
# function to call on iteration
666+
def func(_start, _stop, _where):
667+
return s.read(start=_start, stop=_stop,
668+
where=_where,
668669
columns=columns, **kwargs)
669670

670-
if iterator or chunksize is not None:
671-
if not s.is_table:
672-
raise TypeError(
673-
"can only use an iterator or chunksize on a table")
674-
return TableIterator(self, func, nrows=s.nrows, start=start,
675-
stop=stop, chunksize=chunksize,
676-
auto_close=auto_close)
671+
# create the iterator
672+
it = TableIterator(self, s, func, where=where, nrows=s.nrows, start=start,
673+
stop=stop, iterator=iterator, chunksize=chunksize,
674+
auto_close=auto_close)
677675

678-
return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop,
679-
auto_close=auto_close).get_values()
676+
return it.get_result()
680677

681678
def select_as_coordinates(
682679
self, key, where=None, start=None, stop=None, **kwargs):
@@ -779,26 +776,22 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
779776
# axis is the concatenation axis
780777
axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
781778

782-
def func(_start, _stop):
783-
if where is not None:
784-
c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs)
785-
else:
786-
c = None
779+
def func(_start, _stop, _where):
787780

788-
objs = [t.read(where=c, start=_start, stop=_stop,
789-
columns=columns, **kwargs) for t in tbls]
781+
# retrieve the objs, _where is always passed as a set of coordinates here
782+
objs = [t.read(where=_where, columns=columns, **kwargs) for t in tbls]
790783

791784
# concat and return
792785
return concat(objs, axis=axis,
793786
verify_integrity=False).consolidate()
794787

795-
if iterator or chunksize is not None:
796-
return TableIterator(self, func, nrows=nrows, start=start,
797-
stop=stop, chunksize=chunksize,
798-
auto_close=auto_close)
788+
# create the iterator
789+
it = TableIterator(self, s, func, where=where, nrows=nrows, start=start,
790+
stop=stop, iterator=iterator, chunksize=chunksize,
791+
auto_close=auto_close)
792+
793+
return it.get_result(coordinates=True)
799794

800-
return TableIterator(self, func, nrows=nrows, start=start, stop=stop,
801-
auto_close=auto_close).get_values()
802795

803796
def put(self, key, value, format=None, append=False, **kwargs):
804797
"""
@@ -1293,57 +1286,85 @@ class TableIterator(object):
12931286
----------
12941287
12951288
store : the reference store
1296-
func : the function to get results
1289+
s : the referred storer
1290+
func : the function to execute the query
1291+
where : the where of the query
12971292
nrows : the rows to iterate on
12981293
start : the passed start value (default is None)
12991294
stop : the passed stop value (default is None)
1300-
chunksize : the passed chunking valeu (default is 50000)
1295+
iterator : boolean, whether to use the default iterator
1296+
chunksize : the passed chunking value (default is 50000)
13011297
auto_close : boolean, automatically close the store at the end of
13021298
iteration, default is False
13031299
kwargs : the passed kwargs
13041300
"""
13051301

1306-
def __init__(self, store, func, nrows, start=None, stop=None,
1307-
chunksize=None, auto_close=False):
1302+
def __init__(self, store, s, func, where, nrows, start=None, stop=None,
1303+
iterator=False, chunksize=None, auto_close=False):
13081304
self.store = store
1309-
self.func = func
1305+
self.s = s
1306+
self.func = func
1307+
self.where = where
13101308
self.nrows = nrows or 0
13111309
self.start = start or 0
13121310

13131311
if stop is None:
13141312
stop = self.nrows
13151313
self.stop = min(self.nrows, stop)
13161314

1317-
if chunksize is None:
1318-
chunksize = 100000
1315+
self.coordinates = None
1316+
if iterator or chunksize is not None:
1317+
if chunksize is None:
1318+
chunksize = 100000
1319+
self.chunksize = int(chunksize)
1320+
else:
1321+
self.chunksize = None
13191322

1320-
self.chunksize = chunksize
13211323
self.auto_close = auto_close
13221324

13231325
def __iter__(self):
1326+
1327+
# iterate
13241328
current = self.start
13251329
while current < self.stop:
1326-
stop = current + self.chunksize
1327-
v = self.func(current, stop)
1328-
current = stop
13291330

1330-
if v is None:
1331+
stop = min(current + self.chunksize, self.stop)
1332+
value = self.func(None, None, self.coordinates[current:stop])
1333+
current = stop
1334+
if value is None or not len(value):
13311335
continue
13321336

1333-
yield v
1337+
yield value
13341338

13351339
self.close()
13361340

13371341
def close(self):
13381342
if self.auto_close:
13391343
self.store.close()
13401344

1341-
def get_values(self):
1342-
results = self.func(self.start, self.stop)
1345+
def get_result(self, coordinates=False):
1346+
1347+
# return the actual iterator
1348+
if self.chunksize is not None:
1349+
if not self.s.is_table:
1350+
raise TypeError(
1351+
"can only use an iterator or chunksize on a table")
1352+
1353+
self.coordinates = self.s.read_coordinates(where=self.where)
1354+
1355+
return self
1356+
1357+
# if specified read via coordinates (necessary for multiple selections)
1358+
if coordinates:
1359+
where = self.s.read_coordinates(where=self.where)
1360+
else:
1361+
where = self.where
1362+
1363+
# directly return the result
1364+
results = self.func(self.start, self.stop, where)
13431365
self.close()
13441366
return results
13451367

1346-
13471368
class IndexCol(StringMixin):
13481369

13491370
""" an index column description class

0 commit comments

Comments
 (0)