Commit 57f103a

Merge pull request #3949 from jreback/hdf_iterator
ENH: enable support for iterator with read_hdf in HDFStore (GH3937)
2 parents 95ca455 + 5758cc8 commit 57f103a

File tree: 5 files changed (+140 −51 lines)

RELEASE.rst

+1

@@ -101,6 +101,7 @@ pandas 0.11.1
     to select with a Storer; these are invalid parameters at this time
   - can now specify an ``encoding`` option to ``append/put``
     to enable alternate encodings (GH3750_)
+  - enable support for ``iterator/chunksize`` with ``read_hdf``
   - The repr() for (Multi)Index now obeys display.max_seq_items rather
    than numpy threshold print options. (GH3426_, GH3466_)
   - Added mangle_dupe_cols option to read_table/csv, allowing users

doc/source/io.rst

+12

@@ -1925,6 +1925,18 @@ The default is 50,000 rows returned in a chunk.
     for df in store.select('df', chunksize=3):
         print df
 
+.. note::
+
+   .. versionadded:: 0.11.1
+
+   You can also use the iterator with ``read_hdf``, which will open, then
+   automatically close the store when finished iterating.
+
+   .. code-block:: python
+
+      for df in read_hdf('store.h5', 'df', chunksize=3):
+          print df
+
 Note that the chunksize keyword applies to the **returned** rows. So if you
 are doing a query, then that set will be subdivided and returned in the
 iterator. Keep in mind that if you do not pass a ``where`` selection criteria
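
To make the documented behavior concrete, here is a minimal, self-contained sketch of the new usage. The file name 'store.h5', the key 'df', and the example data are illustrative; it uses the 0.11.1-era ``table=True`` keyword shown elsewhere in this commit, since the iterator only works on table-format stores.

# A minimal sketch of the usage documented above; names and data are
# illustrative, not part of the commit.
import numpy as np
from pandas import DataFrame, read_hdf

df = DataFrame(np.random.randn(10, 2), columns=['A', 'B'])
# write in the queryable *table* format that iteration requires
df.to_hdf('store.h5', 'df', table=True)

# read_hdf opens the store, yields 3-row chunks, and closes the store
# automatically once iteration completes
for chunk in read_hdf('store.h5', 'df', chunksize=3):
    print(chunk)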

doc/source/v0.11.1.txt

+58 −34

@@ -6,6 +6,11 @@ v0.11.1 (June ??, 2013)
 This is a minor release from 0.11.0 and includes several new features and
 enhancements along with a large number of bug fixes.
 
+Highlights include a consistent I/O API naming scheme, routines to read html,
+write multi-indexes to csv files, read & write STATA data files, read & write JSON format
+files, Python 3 support for ``HDFStore``, filtering of groupby expressions via ``filter``, and a
+revamped ``replace`` routine that accepts regular expressions.
+
 API changes
 ~~~~~~~~~~~
 
@@ -148,8 +153,8 @@ API changes
   ``bs4`` + ``html5lib`` when lxml fails to parse. a list of parsers to try
   until success is also valid
 
-Enhancements
-~~~~~~~~~~~~
+I/O Enhancements
+~~~~~~~~~~~~~~~~
 
 - ``pd.read_html()`` can now parse HTML strings, files or urls and return
   DataFrames, courtesy of @cpcloud. (GH3477_, GH3605_, GH3606_, GH3616_).
@@ -184,28 +189,6 @@ Enhancements
   accessible via ``read_json`` top-level function for reading,
   and ``to_json`` DataFrame method for writing, :ref:`See the docs<io.json>`
 
-- ``DataFrame.replace()`` now allows regular expressions on contained
-  ``Series`` with object dtype. See the examples section in the regular docs
-  :ref:`Replacing via String Expression <missing_data.replace_expression>`
-
-  For example you can do
-
-  .. ipython :: python
-
-     df = DataFrame({'a': list('ab..'), 'b': [1, 2, 3, 4]})
-     df.replace(regex=r'\s*\.\s*', value=np.nan)
-
-  to replace all occurrences of the string ``'.'`` with zero or more
-  instances of surrounding whitespace with ``NaN``.
-
-  Regular string replacement still works as expected. For example, you can do
-
-  .. ipython :: python
-
-     df.replace('.', np.nan)
-
-  to replace all occurrences of the string ``'.'`` with ``NaN``.
-
 - Multi-index column support for reading and writing csv format files
 
 - The ``header`` option in ``read_csv`` now accepts a
@@ -225,19 +208,62 @@ Enhancements
   with ``df.to_csv(..., index=False``), then any ``names`` on the columns index will
   be *lost*.
 
+  .. ipython:: python
+
+     from pandas.util.testing import makeCustomDataframe as mkdf
+     df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
+     df.to_csv('mi.csv',tupleize_cols=False)
+     print open('mi.csv').read()
+     pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1],tupleize_cols=False)
+
+  .. ipython:: python
+     :suppress:
+
+     import os
+     os.remove('mi.csv')
+
+- Support for ``HDFStore`` (via ``PyTables 3.0.0``) on Python3
+
+- Iterator support via ``read_hdf`` that automatically opens and closes the
+  store when iteration is finished. This is only for *tables*.
+
  .. ipython:: python
 
-     from pandas.util.testing import makeCustomDataframe as mkdf
-     df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
-     df.to_csv('mi.csv',tupleize_cols=False)
-     print open('mi.csv').read()
-     pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1],tupleize_cols=False)
+     path = 'store_iterator.h5'
+     DataFrame(randn(10,2)).to_hdf(path,'df',table=True)
+     for df in read_hdf(path,'df', chunksize=3):
+         print df
 
  .. ipython:: python
-      :suppress:
+     :suppress:
 
-      import os
-      os.remove('mi.csv')
+     import os
+     os.remove(path)
+
+Other Enhancements
+~~~~~~~~~~~~~~~~~~
+
+- ``DataFrame.replace()`` now allows regular expressions on contained
+  ``Series`` with object dtype. See the examples section in the regular docs
+  :ref:`Replacing via String Expression <missing_data.replace_expression>`
+
+  For example you can do
+
+  .. ipython :: python
+
+     df = DataFrame({'a': list('ab..'), 'b': [1, 2, 3, 4]})
+     df.replace(regex=r'\s*\.\s*', value=np.nan)
+
+  to replace all occurrences of the string ``'.'`` with zero or more
+  instances of surrounding whitespace with ``NaN``.
+
+  Regular string replacement still works as expected. For example, you can do
+
+  .. ipython :: python
+
+     df.replace('.', np.nan)
+
+  to replace all occurrences of the string ``'.'`` with ``NaN``.
 
 - ``pd.melt()`` now accepts the optional parameters ``var_name`` and ``value_name``
   to specify custom column names of the returned DataFrame.
@@ -261,8 +287,6 @@ Enhancements
      pd.get_option('a.b')
      pd.get_option('b.c')
 
-- Support for ``HDFStore`` (via ``PyTables 3.0.0``) on Python3
-
 - The ``filter`` method for group objects returns a subset of the original
   object. Suppose we want to take only elements that belong to groups with a
   group sum greater than 2.
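
The release note above implies two calling modes, which the ``read_hdf`` change in the next file makes explicit: given a path string, ``read_hdf`` opens the store itself and passes ``auto_close=True`` so the iterator closes it when exhausted; given an already-open ``HDFStore``, the caller keeps control of open/close. A sketch under those assumptions (file and key names illustrative):

# Sketch of the two calling modes; names are illustrative.
import numpy as np
from pandas import DataFrame, HDFStore, read_hdf

path = 'store_iterator.h5'
DataFrame(np.random.randn(10, 2)).to_hdf(path, 'df', table=True)

# 1) path string: the store is opened for us and auto-closed
#    once the iterator is exhausted
for chunk in read_hdf(path, 'df', chunksize=3):
    print(chunk)

# 2) open store: iterate via select(); auto_close defaults to
#    False, so the caller closes the store explicitly
store = HDFStore(path)
for chunk in store.select('df', chunksize=3):
    print(chunk)
store.close()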

pandas/io/pytables.py

+46 −16

@@ -196,12 +196,27 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, app
 
 def read_hdf(path_or_buf, key, **kwargs):
     """ read from the store, close it if we opened it """
-    f = lambda store: store.select(key, **kwargs)
+    f = lambda store, auto_close: store.select(key, auto_close=auto_close, **kwargs)
 
     if isinstance(path_or_buf, basestring):
-        with get_store(path_or_buf) as store:
-            return f(store)
-    f(path_or_buf)
+
+        # can't auto open/close if we are using an iterator
+        # so delegate to the iterator
+        store = HDFStore(path_or_buf)
+        try:
+            return f(store, True)
+        except:
+
+            # if there is an error, close the store
+            try:
+                store.close()
+            except:
+                pass
+
+            raise
+
+    # a passed store; user controls open/close
+    f(path_or_buf, False)
 
 class HDFStore(object):
     """
@@ -405,7 +420,7 @@ def get(self, key):
             raise KeyError('No object named %s in the file' % key)
         return self._read_group(group)
 
-    def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, **kwargs):
+    def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, auto_close=False, **kwargs):
         """
         Retrieve pandas object stored in file, optionally based on where
         criteria
@@ -419,6 +434,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, iterator=
         columns : a list of columns that if not None, will limit the return columns
         iterator : boolean, return an iterator, default False
         chunksize : nrows to include in iteration, return an iterator
+        auto_close : boolean, should automatically close the store when finished, default is False
 
         """
         group = self.get_node(key)
@@ -434,9 +450,11 @@ def func(_start, _stop):
             return s.read(where=where, start=_start, stop=_stop, columns=columns, **kwargs)
 
         if iterator or chunksize is not None:
-            return TableIterator(func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize)
+            if not s.is_table:
+                raise TypeError("can only use an iterator or chunksize on a table")
+            return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close)
 
-        return TableIterator(func, nrows=s.nrows, start=start, stop=stop).get_values()
+        return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, auto_close=auto_close).get_values()
 
     def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs):
         """
@@ -473,7 +491,7 @@ def select_column(self, key, column, **kwargs):
         """
         return self.get_storer(key).read_column(column = column, **kwargs)
 
-    def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, **kwargs):
+    def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs):
         """ Retrieve pandas objects from multiple tables
 
         Parameters
@@ -541,9 +559,9 @@ def func(_start, _stop):
             return concat(objs, axis=axis, verify_integrity=True)
 
         if iterator or chunksize is not None:
-            return TableIterator(func, nrows=nrows, start=start, stop=stop, chunksize=chunksize)
+            return TableIterator(self, func, nrows=nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close)
 
-        return TableIterator(func, nrows=nrows, start=start, stop=stop).get_values()
+        return TableIterator(self, func, nrows=nrows, start=start, stop=stop, auto_close=auto_close).get_values()
 
 
     def put(self, key, value, table=None, append=False, **kwargs):
@@ -916,16 +934,20 @@ class TableIterator(object):
     Parameters
     ----------
 
-    func  : the function to get results
+    store : the reference store
+    func  : the function to get results
     nrows : the rows to iterate on
     start : the passed start value (default is None)
-    stop  : the passed stop value (default is None)
+    stop  : the passed stop value (default is None)
     chunksize : the passed chunking value (default is 50000)
+    auto_close : boolean, automatically close the store at the end of iteration,
+        default is False
    kwargs : the passed kwargs
    """
 
-    def __init__(self, func, nrows, start=None, stop=None, chunksize=None):
-        self.func = func
+    def __init__(self, store, func, nrows, start=None, stop=None, chunksize=None, auto_close=False):
+        self.store = store
+        self.func = func
         self.nrows = nrows or 0
         self.start = start or 0
 
@@ -937,6 +959,7 @@ def __init__(self, func, nrows, start=None, stop=None, chunksize=None):
             chunksize = 100000
 
         self.chunksize = chunksize
+        self.auto_close = auto_close
 
     def __iter__(self):
         current = self.start
@@ -950,9 +973,16 @@ def __iter__(self):
 
             yield v
 
+        self.close()
+
+    def close(self):
+        if self.auto_close:
+            self.store.close()
+
     def get_values(self):
-        return self.func(self.start, self.stop)
-
+        results = self.func(self.start, self.stop)
+        self.close()
+        return results
 
 class IndexCol(object):
     """ an index column description class

pandas/io/tests/test_pytables.py

+23 −1

@@ -2078,14 +2078,36 @@ def test_select_iterator(self):
             results = []
             for s in store.select('df',chunksize=100):
                 results.append(s)
+            self.assert_(len(results) == 5)
             result = concat(results)
             tm.assert_frame_equal(expected, result)
 
             results = []
             for s in store.select('df',chunksize=150):
                 results.append(s)
             result = concat(results)
-            tm.assert_frame_equal(expected, result)
+            tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean(self.path) as path:
+
+            df = tm.makeTimeDataFrame(500)
+            df.to_hdf(path,'df_non_table')
+            self.assertRaises(TypeError, read_hdf, path,'df_non_table',chunksize=100)
+            self.assertRaises(TypeError, read_hdf, path,'df_non_table',iterator=True)
+
+        with tm.ensure_clean(self.path) as path:
+
+            df = tm.makeTimeDataFrame(500)
+            df.to_hdf(path,'df',table=True)
+
+            results = []
+            for x in read_hdf(path,'df',chunksize=100):
+                results.append(x)
+
+            self.assert_(len(results) == 5)
+            result = concat(results)
+            tm.assert_frame_equal(result, df)
+            tm.assert_frame_equal(result, read_hdf(path,'df'))
 
         # multiple
