Skip to content

Commit a632362

Browse files
committed
Merge pull request #3078 from jreback/pytables_iter
ENH: support iteration on returned results in select and select_as_multiple in HDFStore
2 parents 4a114d2 + 78a3edb commit a632362

File tree

5 files changed

+192
-21
lines changed

5 files changed

+192
-21
lines changed

RELEASE.rst

+8-3
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,12 @@ pandas 0.11.0
6363
- Add ``axes`` property to ``Series`` for compatibility
6464
- Add ``xs`` function to ``Series`` for compatibility
6565
- Allow setitem in a frame where only mixed numerics are present (e.g. int and float), (GH3037_)
66+
- ``HDFStore``
67+
68+
- Provide dotted attribute access to ``get`` from stores
69+
(e.g. ``store.df == store['df']``)
70+
- New keywords ``iterator=boolean`` and ``chunksize=number_in_a_chunk`` are
71+
provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_)
6672

6773
- In ``HDFStore``, provide dotted attribute access to ``get`` from stores
6874
(e.g. ``store.df == store['df']``)
@@ -140,8 +146,6 @@ pandas 0.11.0
140146
- Fix weird PyTables error when using too many selectors in a where
141147
also correctly filter on any number of values in a Term expression
142148
(so not using numexpr filtering, but isin filtering)
143-
- Provide dotted attribute access to ``get`` from stores
144-
(e.g. store.df == store['df'])
145149
- Internally, change all variables to be private-like (now have leading
146150
underscore)
147151
- fixes for query parsing to correctly interpret boolean and != (GH2849_, GH2973_)
@@ -218,6 +222,7 @@ pandas 0.11.0
218222
.. _GH2819: https://github.com/pydata/pandas/issues/2819
219223
.. _GH2845: https://github.com/pydata/pandas/issues/2845
220224
.. _GH2867: https://github.com/pydata/pandas/issues/2867
225+
.. _GH2803: https://github.com/pydata/pandas/issues/2803
221226
.. _GH2807: https://github.com/pydata/pandas/issues/2807
222227
.. _GH2849: https://github.com/pydata/pandas/issues/2849
223228
.. _GH2850: https://github.com/pydata/pandas/issues/2850
@@ -238,7 +243,7 @@ pandas 0.11.0
238243
.. _GH3037: https://github.com/pydata/pandas/issues/3037
239244
.. _GH3041: https://github.com/pydata/pandas/issues/3041
240245
.. _GH3053: https://github.com/pydata/pandas/issues/3053
241-
.. _GH2803: https://github.com/pydata/pandas/issues/2803
246+
.. _GH3076: https://github.com/pydata/pandas/issues/3076
242247

243248

244249
pandas 0.10.1

doc/source/io.rst

+17
Original file line numberDiff line numberDiff line change
@@ -1307,6 +1307,23 @@ you cannot change data columns (nor indexables) after the first
13071307
append/put operation (Of course you can simply read in the data and
13081308
create a new table!)
13091309

1310+
Iterator
1311+
~~~~~~~~
1312+
1313+
Starting in 0.11, you can pass ``iterator=True`` or ``chunksize=number_in_a_chunk``
1314+
to ``select`` and ``select_as_multiple`` to return an iterator on the results.
1315+
The default ``chunksize`` is 50,000 rows per chunk.
1316+
1317+
.. ipython:: python
1318+
1319+
for df in store.select('df', chunksize=3):
1320+
print df
1321+
1322+
Note that the ``chunksize`` keyword applies to the **returned** rows. So if you
1323+
are doing a query, then that set will be subdivided and returned in the
1324+
iterator. Keep in mind that if you do not pass a ``where`` selection criteria
1325+
then the ``nrows`` of the table are considered.
1326+
13101327
Advanced Queries
13111328
~~~~~~~~~~~~~~~~
13121329

doc/source/v0.11.0.txt

+6-2
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,9 @@ Enhancements
238238
- In ``HDFStore``, provide dotted attribute access to ``get`` from stores
239239
(e.g. ``store.df == store['df']``)
240240

241+
- In ``HDFStore``, new keywords ``iterator=boolean`` and ``chunksize=number_in_a_chunk`` are
242+
provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_)
243+
241244
- ``Squeeze`` to possibly remove length 1 dimensions from an object.
242245

243246
.. ipython:: python
@@ -300,6 +303,7 @@ on GitHub for a complete list.
300303
.. _GH2806: https://github.com/pydata/pandas/issues/2806
301304
.. _GH2807: https://github.com/pydata/pandas/issues/2807
302305
.. _GH2918: https://github.com/pydata/pandas/issues/2918
303-
.. _GH3011: https://github.com/pydata/pandas/issues/3011
304-
.. _GH2979: https://github.com/pydata/pandas/issues/2979
305306
.. _GH2758: https://github.com/pydata/pandas/issues/2758
307+
.. _GH2979: https://github.com/pydata/pandas/issues/2979
308+
.. _GH3011: https://github.com/pydata/pandas/issues/3011
309+
.. _GH3076: https://github.com/pydata/pandas/issues/3076

pandas/io/pytables.py

+96-16
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ def get(self, key):
347347
raise KeyError('No object named %s in the file' % key)
348348
return self._read_group(group)
349349

350-
def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs):
350+
def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, **kwargs):
351351
"""
352352
Retrieve pandas object stored in file, optionally based on where
353353
criteria
@@ -362,16 +362,30 @@ def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs)
362362
start : integer (defaults to None), row number to start selection
363363
stop : integer (defaults to None), row number to stop selection
364364
columns : a list of columns that if not None, will limit the return columns
365+
iterator : boolean, return an iterator, default False
366+
chunksize : nrows to include in iteration, return an iterator
365367
366368
"""
367369
group = self.get_node(key)
368370
if group is None:
369371
raise KeyError('No object named %s in the file' % key)
370-
return self._read_group(group, where=where, start=start, stop=stop, columns=columns, **kwargs)
371372

372-
def select_as_coordinates(self, key, where=None, **kwargs):
373+
# create the storer and axes
374+
s = self._create_storer(group)
375+
s.infer_axes()
376+
377+
# what we are actually going to do for a chunk
378+
def func(_start, _stop):
379+
return s.read(where=where, start=_start, stop=_stop, columns=columns, **kwargs)
380+
381+
if iterator or chunksize is not None:
382+
return TableIterator(func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize)
383+
384+
return TableIterator(func, nrows=s.nrows, start=start, stop=stop).get_values()
385+
386+
def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs):
373387
"""
374-
return the selection as a Coordinates. Note that start/stop/columns parematers are inapplicable here.
388+
return the selection as a Coordinates.
375389
376390
Parameters
377391
----------
@@ -380,8 +394,10 @@ def select_as_coordinates(self, key, where=None, **kwargs):
380394
Optional Parameters
381395
-------------------
382396
where : list of Term (or convertable) objects, optional
397+
start : integer (defaults to None), row number to start selection
398+
stop : integer (defaults to None), row number to stop selection
383399
"""
384-
return self.get_storer(key).read_coordinates(where = where, **kwargs)
400+
return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs)
385401

386402
def unique(self, key, column, **kwargs):
387403
"""
@@ -400,14 +416,18 @@ def unique(self, key, column, **kwargs):
400416
"""
401417
return self.get_storer(key).read_column(column = column, **kwargs)
402418

403-
def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kwargs):
419+
def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, **kwargs):
404420
""" Retrieve pandas objects from multiple tables
405421
406422
Parameters
407423
----------
408424
keys : a list of the tables
409425
selector : the table to apply the where criteria (defaults to keys[0] if not supplied)
410426
columns : the columns I want back
427+
start : integer (defaults to None), row number to start selection
428+
stop : integer (defaults to None), row number to stop selection
429+
iterator : boolean, return an iterator, default False
430+
chunksize : nrows to include in iteration, return an iterator
411431
412432
Exceptions
413433
----------
@@ -418,7 +438,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw
418438
if isinstance(keys, (list, tuple)) and len(keys) == 1:
419439
keys = keys[0]
420440
if isinstance(keys, basestring):
421-
return self.select(key=keys, where=where, columns=columns, **kwargs)
441+
return self.select(key=keys, where=where, columns=columns, start=start, stop=stop, iterator=iterator, chunksize=chunksize, **kwargs)
422442

423443
if not isinstance(keys, (list, tuple)):
424444
raise Exception("keys must be a list/tuple")
@@ -433,6 +453,8 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw
433453
tbls = [ self.get_storer(k) for k in keys ]
434454

435455
# validate rows
456+
if tbls[0] is None:
457+
raise Exception("no valid tables to select as multiple")
436458
nrows = tbls[0].nrows
437459
for t in tbls:
438460
if t.nrows != nrows:
@@ -441,16 +463,25 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw
441463
raise Exception("object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname)
442464

443465
# select coordinates from the selector table
444-
c = self.select_as_coordinates(selector, where)
466+
c = self.select_as_coordinates(selector, where, start=start, stop=stop)
467+
nrows = len(c)
468+
469+
def func(_start, _stop):
470+
471+
# collect the returns objs
472+
objs = [t.read(where=c[_start:_stop], columns=columns) for t in tbls]
473+
474+
# axis is the concentation axes
475+
axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
445476

446-
# collect the returns objs
447-
objs = [t.read(where=c, columns=columns) for t in tbls]
477+
# concat and return
478+
return concat(objs, axis=axis, verify_integrity=True)
448479

449-
# axis is the concentation axes
450-
axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
480+
if iterator or chunksize is not None:
481+
return TableIterator(func, nrows=nrows, start=start, stop=stop, chunksize=chunksize)
482+
483+
return TableIterator(func, nrows=nrows, start=start, stop=stop).get_values()
451484

452-
# concat and return
453-
return concat(objs, axis=axis, verify_integrity=True)
454485

455486
def put(self, key, value, table=None, append=False, **kwargs):
456487
"""
@@ -807,6 +838,49 @@ def _read_group(self, group, **kwargs):
807838
s.infer_axes()
808839
return s.read(**kwargs)
809840

841+
class TableIterator(object):
    """ define the iteration interface on a table

        Parameters
        ----------

        func  : callable taking ``(start, stop)`` row bounds and returning the
                result for that half-open range; a return value of None
                causes that chunk to be skipped during iteration
        nrows : the number of rows available to iterate on
        start : the passed start value (default is None, treated as 0)
        stop  : the passed stop value (default is None, treated as nrows);
                values larger than nrows are clipped to nrows
        chunksize : the passed chunking value (default is 50000)

        Raises
        ------
        ValueError : if chunksize is given and is smaller than 1
        """

    def __init__(self, func, nrows, start=None, stop=None, chunksize=None):
        self.func = func
        self.nrows = nrows
        self.start = start or 0

        if stop is None:
            stop = self.nrows
        self.stop = min(self.nrows, stop)

        if chunksize is None:
            chunksize = 50000
        else:
            # keep the row bounds handed to func integral
            chunksize = int(chunksize)
            if chunksize < 1:
                # a zero/negative step would never advance past self.stop
                raise ValueError("chunksize must be a positive integer, "
                                 "got [%s]" % chunksize)

        self.chunksize = chunksize

    def __iter__(self):
        """ yield the result of func for each successive chunk of rows """
        current = self.start
        while current < self.stop:
            # clamp the last chunk so we never read past the requested stop
            stop = min(current + self.chunksize, self.stop)
            v = self.func(current, stop)
            current = stop

            if v is None:
                continue

            yield v

    def get_values(self):
        """ return the result of func over the entire [start, stop) range """
        return self.func(self.start, self.stop)
883+
810884

811885
class IndexCol(object):
812886
""" an index column description class
@@ -2351,7 +2425,7 @@ def create_description(self, complib=None, complevel=None, fletcher32=False, exp
23512425

23522426
return d
23532427

2354-
def read_coordinates(self, where=None, **kwargs):
2428+
def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
23552429
""" select coordinates (row numbers) from a table; return the coordinates object """
23562430

23572431
# validate the version
@@ -2362,7 +2436,7 @@ def read_coordinates(self, where=None, **kwargs):
23622436
return False
23632437

23642438
# create the selection
2365-
self.selection = Selection(self, where=where, **kwargs)
2439+
self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs)
23662440
return Coordinates(self.selection.select_coords(), group=self.group, where=where)
23672441

23682442
def read_column(self, column, **kwargs):
@@ -3132,6 +3206,12 @@ def __init__(self, values, group, where, **kwargs):
31323206
self.group = group
31333207
self.where = where
31343208

3209+
def __len__(self):
3210+
return len(self.values)
3211+
3212+
def __getitem__(self, key):
3213+
""" return a new coordinates object, sliced by the key """
3214+
return Coordinates(self.values[key], self.group, self.where)
31353215

31363216
class Selection(object):
31373217
"""

pandas/io/tests/test_pytables.py

+65
Original file line numberDiff line numberDiff line change
@@ -1829,6 +1829,66 @@ def test_select_with_many_inputs(self):
18291829
tm.assert_frame_equal(expected, result)
18301830
self.assert_(len(result) == 100)
18311831

1832+
def test_select_iterator(self):
1833+
1834+
# single table
1835+
with ensure_clean(self.path) as store:
1836+
1837+
df = tm.makeTimeDataFrame(500)
1838+
store.remove('df')
1839+
store.append('df', df)
1840+
1841+
expected = store.select('df')
1842+
1843+
results = []
1844+
for s in store.select('df',iterator=True):
1845+
results.append(s)
1846+
result = concat(results)
1847+
tm.assert_frame_equal(expected, result)
1848+
results = []
1849+
for s in store.select('df',chunksize=100):
1850+
results.append(s)
1851+
result = concat(results)
1852+
tm.assert_frame_equal(expected, result)
1853+
1854+
results = []
1855+
for s in store.select('df',chunksize=150):
1856+
results.append(s)
1857+
result = concat(results)
1858+
tm.assert_frame_equal(expected, result)
1859+
1860+
# multiple
1861+
1862+
with ensure_clean(self.path) as store:
1863+
1864+
df1 = tm.makeTimeDataFrame(500)
1865+
store.append('df1',df1,data_columns=True)
1866+
df2 = tm.makeTimeDataFrame(500).rename(columns=lambda x: "%s_2" % x)
1867+
df2['foo'] = 'bar'
1868+
store.append('df2',df2)
1869+
1870+
df = concat([df1, df2], axis=1)
1871+
1872+
# full selection
1873+
expected = store.select_as_multiple(
1874+
['df1', 'df2'], selector='df1')
1875+
results = []
1876+
for s in store.select_as_multiple(
1877+
['df1', 'df2'], selector='df1', chunksize=150):
1878+
results.append(s)
1879+
result = concat(results)
1880+
tm.assert_frame_equal(expected, result)
1881+
1882+
# where selection
1883+
expected = store.select_as_multiple(
1884+
['df1', 'df2'], where= Term('A>0'), selector='df1')
1885+
results = []
1886+
for s in store.select_as_multiple(
1887+
['df1', 'df2'], where= Term('A>0'), selector='df1', chunksize=25):
1888+
results.append(s)
1889+
result = concat(results)
1890+
tm.assert_frame_equal(expected, result)
1891+
18321892
def test_panel_select(self):
18331893

18341894
wp = tm.makePanel()
@@ -2042,6 +2102,11 @@ def test_select_as_multiple(self):
20422102
df2['foo'] = 'bar'
20432103

20442104
with ensure_clean(self.path) as store:
2105+
2106+
# no tables stored
2107+
self.assertRaises(Exception, store.select_as_multiple,
2108+
None, where=['A>0', 'B>0'], selector='df1')
2109+
20452110
store.append('df1', df1, data_columns=['A', 'B'])
20462111
store.append('df2', df2)
20472112

0 commit comments

Comments
 (0)