Commit af43f71

ENH/DOC: updated docs for compression

Added parameter chunksize to append; writing now occurs in chunks, significantly reducing memory usage.

1 parent 97bdb5c commit af43f71

File tree

3 files changed: +60 −28 lines

doc/source/io.rst

Lines changed: 14 additions & 2 deletions
@@ -1221,6 +1221,20 @@ It should be clear that a delete operation on the ``major_axis`` will be fairly
     store.remove('wp', 'major_axis>20000102' )
     store.select('wp')
+
+Compression
+~~~~~~~~~~~
+
+``PyTables`` allows the stored data to be compressed (this applies to all kinds of stores, not just tables). You can pass ``complevel=int`` for a compression level (1-9, with 0 being no compression, and the default), and ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for whichever compression library you prefer. ``blosc`` offers very fast compression (its level defaults to 9), and is the one I use most.
+
+``PyTables`` offers better write performance when the data is compressed after writing, as opposed to turning on compression at the very beginning. You can use the supplied ``PyTables`` utility ``ptrepack`` for this; ``ptrepack`` can also change compression levels after the fact.
+
+- ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5``
+
+Or use on-the-fly compression:
+
+- ``store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc')``
+
 Notes & Caveats
 ~~~~~~~~~~~~~~~

@@ -1258,8 +1272,6 @@ Performance
 - ``AppendableTable`` which is a similiar table to past versions (this is the default).
 - ``WORMTable`` (pending implementation) - is available to faciliate very fast writing of tables that are also queryable (but CANNOT support appends)
 
-- ``Tables`` offer better performance when compressed after writing them (as opposed to turning on compression at the very beginning)
-  use the pytables utilities ``ptrepack`` to rewrite the file (and also can change compression methods)
 - Duplicate rows can be written, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs)
 
 Experimental
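To make the documented options concrete, here is a minimal sketch of on-the-fly compression (the file name and sample frame are illustrative, not from the commit):

    import numpy as np
    from pandas import DataFrame, HDFStore

    df = DataFrame(np.random.randn(100000, 5))

    # every table written to this store is blosc-compressed at level 9
    store = HDFStore('store_compressed.h5', complevel=9, complib='blosc')
    store.append('df', df)
    store.close()

For a file that was already written uncompressed, the ``ptrepack`` invocation shown above rewrites it compressed, which per the docs is the higher-throughput path.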

pandas/io/pytables.py

Lines changed: 33 additions & 11 deletions
@@ -420,6 +420,14 @@ def append(self, key, value, **kwargs):
         key : object
         value : {Series, DataFrame, Panel}
+
+        Optional Parameters
+        -------------------
+        columns : list of columns to create as data columns
+        min_itemsize : dict of columns that specify minimum string sizes
+        nan_rep : string to use as a string nan representation
+        chunksize : size to chunk the writing
+
         Notes
         -----
         Does *not* check if data being appended overlaps with existing

@@ -1820,7 +1828,7 @@ class AppendableTable(LegacyTable):
     table_type = 'appendable'
 
     def write(self, axes, obj, append=False, compression=None,
-              complevel=None, min_itemsize=None, **kwargs):
+              complevel=None, min_itemsize=None, chunksize=50000, **kwargs):
 
         # create the table if it doesn't exist (or get it if it does)
         if not append:

@@ -1849,10 +1857,9 @@ def write(self, axes, obj, append=False, compression=None,
             a.validate_and_set(table, append)
 
         # add the rows
-        self.write_data()
-        self.handle.flush()
+        self.write_data(chunksize)
 
-    def write_data(self):
+    def write_data(self, chunksize):
         """ fast writing of data: requires specific cython routines for each axis shape """
 
         # create the masks & values

@@ -1869,24 +1876,39 @@ def write_data(self):
             m = mask & m
 
         # the arguments
-        args = [ a.cvalues for a in self.index_axes ]
-        search = np.array([ a.is_searchable for a in self.values_axes ]).astype('u1')
-        values = [ a.data for a in self.values_axes ]
+        indexes = [ a.cvalues for a in self.index_axes ]
+        search = np.array([ a.is_searchable for a in self.values_axes ]).astype('u1')
+        values = [ a.take_data() for a in self.values_axes ]
+
+        # write the rows in chunks (bounds peak memory usage)
+        rows = np.prod([ i.shape[0] for i in indexes ])
+        chunks = int(rows / chunksize) + 1
+        for i in xrange(chunks):
+            start_i = i * chunksize
+            end_i = min((i + 1) * chunksize, rows)
+
+            self.write_data_chunk(indexes=[ a[start_i:end_i] for a in indexes ],
+                                  mask=mask[start_i:end_i],
+                                  search=search,
+                                  values=[ v[:, start_i:end_i] for v in values ])
+
+    def write_data_chunk(self, indexes, mask, search, values):
 
         # get our function
         try:
             func = getattr(lib, "create_hdf_rows_%sd" % self.ndim)
-            args.append(mask)
-            args.append(search)
-            args.append(values)
+            args = list(indexes)
+            args.extend([ mask, search, values ])
             rows = func(*args)
         except (Exception), detail:
             raise Exception("cannot create row-data -> %s" % str(detail))
 
         try:
             if len(rows):
                 self.table.append(rows)
+                self.table.flush()
         except (Exception), detail:
             raise Exception("tables cannot write this data -> %s" % str(detail))
 
     def delete(self, where=None):

@@ -1934,7 +1956,7 @@ def delete(self, where=None):
             table.removeRows(start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1)
             pg = g
 
-        self.handle.flush()
+        self.table.flush()
 
         # return the number of rows removed
         return ln
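The chunk arithmetic above is easy to get wrong at the boundaries, so here is a standalone sketch of the same partitioning that ``write_data`` performs (the function name is illustrative, not part of pandas):

    def iter_chunks(nrows, chunksize=50000):
        # int(nrows / chunksize) + 1 chunks, as in write_data; the last
        # slice is clipped to nrows, and an exact multiple of chunksize
        # yields one empty trailing chunk, which is skipped here
        chunks = int(nrows / chunksize) + 1
        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            if start_i < end_i:
                yield start_i, end_i

    print(list(iter_chunks(120000)))
    # [(0, 50000), (50000, 100000), (100000, 120000)]

Only ``chunksize`` rows are converted and appended at a time, which is where the reduced memory usage mentioned in the commit message comes from; the table is flushed after each chunk's append.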

pandas/io/tests/test_pytables.py

Lines changed: 13 additions & 15 deletions
@@ -576,28 +576,26 @@ def test_big_table2_frame(self):
         for x in range(20):
             df['String%03d' % x] = 'string%03d' % x
 
-        print "\nbig_table2 frame (creation of df) -> %5.2f" % (time.time() - start_time)
-        start_time = time.time()
-
-        from arb.common import profile
+        print "\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time)
         fn = 'big_table2.h5'
 
         try:
 
-            @profile.profile_func()
-            def f():
+            def f(chunksize):
                 store = HDFStore(fn, mode='w')
-                store.append('df', df)
+                store.append('df', df, chunksize=chunksize)
+                r = store.root.df.table.nrows
                 store.close()
-
-            f()
-            rows = store.root.df.table.nrows
-            #recons = store.select('df')
-        finally:
-            pass
-            #os.remove(fn)
+                return r
 
-            print "\nbig_table2 frame [%s] -> %5.2f" % (rows, time.time() - start_time)
+            for c in [10000, 50000, 100000, 250000]:
+                start_time = time.time()
+                print "big_table2 frame [chunk->%s]" % c
+                rows = f(c)
+                print "big_table2 frame [rows->%s,chunk->%s] -> %5.2f" % (rows, c, time.time() - start_time)
+
+        finally:
+            os.remove(fn)
 
     def test_big_table_panel(self):
         raise nose.SkipTest('no big table panel')
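To exercise the new parameter together with the other optional arguments listed in the ``append`` docstring above, a hedged usage sketch (the frame, sizes, and file name are made up):

    import numpy as np
    from pandas import DataFrame, HDFStore

    df = DataFrame({'A': np.random.randn(1000),
                    'B': ['foo'] * 1000})

    store = HDFStore('example.h5', mode='w')
    store.append('df', df,
                 columns=['B'],           # create 'B' as a data column
                 min_itemsize={'B': 30},  # reserve space for strings up to 30 chars
                 nan_rep='nan',           # string representation for nan
                 chunksize=100000)        # rows written per chunk (new in this commit)
    store.close()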
