Commit af43f71

ENH/DOC: updated docs for compression

Added parameter chunksize to append; writing now occurs in chunks, significantly reducing memory usage.

1 parent 97bdb5c commit af43f71

File tree

3 files changed: +60 −28 lines

doc/source/io.rst

Lines changed: 14 additions & 2 deletions
@@ -1221,6 +1221,20 @@ It should be clear that a delete operation on the ``major_axis`` will be fairly
     store.remove('wp', 'major_axis>20000102' )
     store.select('wp')
+
+Compression
+~~~~~~~~~~~
+
+``PyTables`` allows the stored data to be compressed (this applies to all kinds of stores, not just tables). You can pass ``complevel=int`` for a compression level (1-9, with 0 being no compression, and the default), and ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for whichever compression library you prefer. ``blosc`` offers very fast compression (its level defaults to 9), and is the one I use most.
+
+``PyTables`` offers better write performance when the data is compressed after writing, as opposed to turning on compression at the very beginning. You can use the supplied ``PyTables`` utility ``ptrepack`` for this; ``ptrepack`` can also change compression levels after the fact.
+
+- ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5``
+
+Or use on-the-fly compression:
+
+- ``store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc')``
+
 Notes & Caveats
 ~~~~~~~~~~~~~~~

@@ -1258,8 +1272,6 @@ Performance
 - ``AppendableTable`` which is a similiar table to past versions (this is the default).
 - ``WORMTable`` (pending implementation) - is available to faciliate very fast writing of tables that are also queryable (but CANNOT support appends)
 
-- ``Tables`` offer better performance when compressed after writing them (as opposed to turning on compression at the very beginning)
-  use the pytables utilities ``ptrepack`` to rewrite the file (and also can change compression methods)
 - Duplicate rows can be written, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs)
 
 Experimental
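To make the documented options concrete, here is a minimal sketch of on-the-fly compression (the file name and sample frame are illustrative, not from the commit):

    import numpy as np
    from pandas import DataFrame, HDFStore

    df = DataFrame(np.random.randn(100000, 5))

    # every table written to this store is blosc-compressed at level 9
    store = HDFStore('store_compressed.h5', complevel=9, complib='blosc')
    store.append('df', df)
    store.close()

For a file that was already written uncompressed, the ``ptrepack`` invocation shown above rewrites it compressed, which per the docs is the higher-throughput path.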

pandas/io/pytables.py

Lines changed: 33 additions & 11 deletions
@@ -420,6 +420,14 @@ def append(self, key, value, **kwargs):
         key : object
         value : {Series, DataFrame, Panel}
+
+        Optional Parameters
+        -------------------
+        columns : list of columns to create as data columns
+        min_itemsize : dict of columns that specify minimum string sizes
+        nan_rep : string to use as a string nan representation
+        chunksize : size to chunk the writing
+
         Notes
         -----
         Does *not* check if data being appended overlaps with existing

@@ -1820,7 +1828,7 @@ class AppendableTable(LegacyTable):
     table_type = 'appendable'
 
     def write(self, axes, obj, append=False, compression=None,
-              complevel=None, min_itemsize=None, **kwargs):
+              complevel=None, min_itemsize=None, chunksize=50000, **kwargs):
 
         # create the table if it doesn't exist (or get it if it does)
         if not append:

@@ -1849,10 +1857,9 @@ def write(self, axes, obj, append=False, compression=None,
             a.validate_and_set(table, append)
 
         # add the rows
-        self.write_data()
-        self.handle.flush()
+        self.write_data(chunksize)
 
-    def write_data(self):
+    def write_data(self, chunksize):
         """ fast writing of data: requires specific cython routines for each axis shape """
 
         # create the masks & values

@@ -1869,24 +1876,39 @@ def write_data(self):
             m = mask & m
 
         # the arguments
-        args = [ a.cvalues for a in self.index_axes ]
-        search = np.array([ a.is_searchable for a in self.values_axes ]).astype('u1')
-        values = [ a.data for a in self.values_axes ]
+        indexes = [ a.cvalues for a in self.index_axes ]
+        search = np.array([ a.is_searchable for a in self.values_axes ]).astype('u1')
+        values = [ a.take_data() for a in self.values_axes ]
+
+        # write the rows in chunks (bounds peak memory usage)
+        rows = np.prod([ i.shape[0] for i in indexes ])
+        chunks = int(rows / chunksize) + 1
+        for i in xrange(chunks):
+            start_i = i * chunksize
+            end_i = min((i + 1) * chunksize, rows)
+
+            self.write_data_chunk(indexes=[ a[start_i:end_i] for a in indexes ],
+                                  mask=mask[start_i:end_i],
+                                  search=search,
+                                  values=[ v[:, start_i:end_i] for v in values ])
+
+    def write_data_chunk(self, indexes, mask, search, values):
 
         # get our function
         try:
             func = getattr(lib, "create_hdf_rows_%sd" % self.ndim)
-            args.append(mask)
-            args.append(search)
-            args.append(values)
+            args = list(indexes)
+            args.extend([ mask, search, values ])
             rows = func(*args)
         except (Exception), detail:
             raise Exception("cannot create row-data -> %s" % str(detail))
 
         try:
             if len(rows):
                 self.table.append(rows)
+                self.table.flush()
         except (Exception), detail:
             raise Exception("tables cannot write this data -> %s" % str(detail))
 
     def delete(self, where=None):

@@ -1934,7 +1956,7 @@ def delete(self, where=None):
             table.removeRows(start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1)
             pg = g
 
-        self.handle.flush()
+        self.table.flush()
 
         # return the number of rows removed
         return ln
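The chunk arithmetic above is easy to get wrong at the boundaries, so here is a standalone sketch of the same partitioning that ``write_data`` performs (the function name is illustrative, not part of pandas):

    def iter_chunks(nrows, chunksize=50000):
        # int(nrows / chunksize) + 1 chunks, as in write_data; the last
        # slice is clipped to nrows, and an exact multiple of chunksize
        # yields one empty trailing chunk, which is skipped here
        chunks = int(nrows / chunksize) + 1
        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            if start_i < end_i:
                yield start_i, end_i

    print(list(iter_chunks(120000)))
    # [(0, 50000), (50000, 100000), (100000, 120000)]

Only ``chunksize`` rows are converted and appended at a time, which is where the reduced memory usage mentioned in the commit message comes from; the table is flushed after each chunk's append.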

pandas/io/tests/test_pytables.py

Lines changed: 13 additions & 15 deletions
@@ -576,28 +576,26 @@ def test_big_table2_frame(self):
         for x in range(20):
             df['String%03d' % x] = 'string%03d' % x
 
-        print "\nbig_table2 frame (creation of df) -> %5.2f" % (time.time() - start_time)
-        start_time = time.time()
-
-        from arb.common import profile
+        print "\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time)
         fn = 'big_table2.h5'
 
         try:
 
-            @profile.profile_func()
-            def f():
+            def f(chunksize):
                 store = HDFStore(fn, mode='w')
-                store.append('df', df)
+                store.append('df', df, chunksize=chunksize)
+                r = store.root.df.table.nrows
                 store.close()
-
-            f()
-            rows = store.root.df.table.nrows
-            #recons = store.select('df')
-        finally:
-            pass
-            #os.remove(fn)
+                return r
 
-            print "\nbig_table2 frame [%s] -> %5.2f" % (rows, time.time() - start_time)
+            for c in [10000, 50000, 100000, 250000]:
+                start_time = time.time()
+                print "big_table2 frame [chunk->%s]" % c
+                rows = f(c)
+                print "big_table2 frame [rows->%s,chunk->%s] -> %5.2f" % (rows, c, time.time() - start_time)
+
+        finally:
+            os.remove(fn)
 
     def test_big_table_panel(self):
         raise nose.SkipTest('no big table panel')
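To exercise the new parameter together with the other optional arguments listed in the ``append`` docstring above, a hedged usage sketch (the frame, sizes, and file name are made up):

    import numpy as np
    from pandas import DataFrame, HDFStore

    df = DataFrame({'A': np.random.randn(1000),
                    'B': ['foo'] * 1000})

    store = HDFStore('example.h5', mode='w')
    store.append('df', df,
                 columns=['B'],           # create 'B' as a data column
                 min_itemsize={'B': 30},  # reserve space for strings up to 30 chars
                 nan_rep='nan',           # string representation for nan
                 chunksize=100000)        # rows written per chunk (new in this commit)
    store.close()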
