Skip to content

Commit ce6a7a9

Browse files
committed
ENH: export of get_store context manager in __init__ for pandas
Add an ``expectedrows`` keyword to ``append`` to give PyTables an estimate of the total rows in a new table. Add ``start``/``stop`` keywords as selection criteria to limit searches to these rows. Added multi-index support for DataFrames. Docs/tests for the above.
1 parent af43f71 commit ce6a7a9

File tree

4 files changed

+128
-38
lines changed

4 files changed

+128
-38
lines changed

doc/source/io.rst

+14-1
Original file line numberDiff line numberDiff line change
@@ -1030,6 +1030,17 @@ Deletion of the object specified by the key
10301030
del store['wp']
10311031
10321032
store
1033+
Closing a Store
1034+
1035+
.. ipython:: python
1036+
1037+
1038+
# closing a store
1039+
store.close()
1040+
1041+
# Working with, and automatically closing the store with the context manager.
1042+
with get_store('store.h5') as store:
1043+
store.keys()
10331044
10341045
.. ipython:: python
10351046
:suppress:
@@ -1267,7 +1278,9 @@ Performance
12671278

12681279
- ``Tables`` come with a writing performance penalty as compared to regular stores. The benefit is the ability to append/delete and query (potentially very large amounts of data).
12691280
Write times are generally longer as compared with regular stores. Query times can be quite fast, especially on an indexed axis.
1270-
- ``Tables`` can (as of 0.10.0) be expressed as different types.
1281+
- You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will significantly lower your memory usage on writing.
1282+
- You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of rows that ``PyTables`` will expect. This will optimize read/write performance.
1283+
- ``Tables`` can be expressed as different types.
12711284

12721285
- ``AppendableTable`` which is a similar table to past versions (this is the default).
12731286
- ``WORMTable`` (pending implementation) - is available to facilitate very fast writing of tables that are also queryable (but CANNOT support appends)

pandas/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
from pandas.io.parsers import (read_csv, read_table, read_clipboard,
3333
read_fwf, to_clipboard, ExcelFile,
3434
ExcelWriter)
35-
from pandas.io.pytables import HDFStore, Term
35+
from pandas.io.pytables import HDFStore, Term, get_store
3636
from pandas.util.testing import debug
3737

3838
from pandas.tools.describe import value_range

pandas/io/pytables.py

+74-30
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ def get(self, key):
336336
raise KeyError('No object named %s in the file' % key)
337337
return self._read_group(group)
338338

339-
def select(self, key, where=None, **kwargs):
339+
def select(self, key, where=None, start=None, stop=None, **kwargs):
340340
"""
341341
Retrieve pandas object stored in file, optionally based on where
342342
criteria
@@ -350,7 +350,7 @@ def select(self, key, where=None, **kwargs):
350350
group = self.get_node(key)
351351
if group is None:
352352
raise KeyError('No object named %s in the file' % key)
353-
return self._read_group(group, where, **kwargs)
353+
return self._read_group(group, where=where, start=start, stop=stop, **kwargs)
354354

355355
def put(self, key, value, table=False, append=False,
356356
compression=None, **kwargs):
@@ -376,7 +376,7 @@ def put(self, key, value, table=False, append=False,
376376
self._write_to_group(key, value, table=table, append=append,
377377
comp=compression, **kwargs)
378378

379-
def remove(self, key, where=None):
379+
def remove(self, key, where=None, start=None, stop=None):
380380
"""
381381
Remove pandas object partially by specifying the where condition
382382
@@ -406,7 +406,7 @@ def remove(self, key, where=None):
406406
if not _is_table_type(group):
407407
raise Exception('can only remove with where on objects written as tables')
408408
t = create_table(self, group)
409-
return t.delete(where)
409+
return t.delete(where = where, start=start, stop=stop)
410410

411411
return None
412412

@@ -426,7 +426,7 @@ def append(self, key, value, **kwargs):
426426
min_itemsize : dict of columns that specify minimum string sizes
427427
nan_rep : string to use as string nan representation
428428
chunksize : size to chunk the writing
429-
429+
expectedrows : expected TOTAL row size of this table
430430
431431
Notes
432432
-----
@@ -472,6 +472,15 @@ def get_node(self, key):
472472
except:
473473
return None
474474

475+
def get_table(self, key):
476+
""" return the table object for a key, raise if not in the file or a non-table """
477+
group = self.get_node(key)
478+
if group is None:
479+
raise KeyError('No object named %s in the file' % key)
480+
if not _is_table_type(group):
481+
raise Exception("cannot return a table object for a non-table")
482+
return create_table(self, group)
483+
475484
###### private methods ######
476485

477486
def _get_handler(self, op, kind):
@@ -596,7 +605,7 @@ def _read_sparse_panel(self, group, where=None):
596605
def _write_frame(self, group, df):
597606
self._write_block_manager(group, df._data)
598607

599-
def _read_frame(self, group, where=None):
608+
def _read_frame(self, group, where=None, **kwargs):
600609
return DataFrame(self._read_block_manager(group))
601610

602611
def _write_block_manager(self, group, data):
@@ -638,7 +647,7 @@ def _write_wide(self, group, panel):
638647
panel._consolidate_inplace()
639648
self._write_block_manager(group, panel._data)
640649

641-
def _read_wide(self, group, where=None):
650+
def _read_wide(self, group, where=None, **kwargs):
642651
return Panel(self._read_block_manager(group))
643652

644653
def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, index=True, **kwargs):
@@ -652,12 +661,13 @@ def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, inde
652661

653662
def _read_ndim_table(self, group, where=None, **kwargs):
654663
t = create_table(self, group, **kwargs)
655-
return t.read(where)
664+
return t.read(where, **kwargs)
656665

657666
def _write_frame_table(self, group, df, append=False, comp=None, axes=None, index=True, **kwargs):
658667
if axes is None:
659668
axes = [0]
660-
t = create_table(self, group, typ = 'appendable_frame')
669+
670+
t = create_table(self, group, typ = 'appendable_frame' if df.index.nlevels == 1 else 'appendable_multiframe')
661671
t.write(axes=axes, obj=df, append=append, compression=comp, **kwargs)
662672
if index:
663673
t.create_index()
@@ -860,9 +870,9 @@ def _read_group(self, group, where=None, **kwargs):
860870
kind = group._v_attrs.pandas_type
861871
kind = _LEGACY_MAP.get(kind, kind)
862872
handler = self._get_handler(op='read', kind=kind)
863-
return handler(group, where, **kwargs)
873+
return handler(group, where=where, **kwargs)
864874

865-
def _read_series(self, group, where=None):
875+
def _read_series(self, group, where=None, **kwargs):
866876
index = self._read_index(group, 'index')
867877
if len(index) > 0:
868878
values = _read_array(group, 'values')
@@ -872,12 +882,12 @@ def _read_series(self, group, where=None):
872882
name = getattr(group._v_attrs, 'name', None)
873883
return Series(values, index=index, name=name)
874884

875-
def _read_legacy_series(self, group, where=None):
885+
def _read_legacy_series(self, group, where=None, **kwargs):
876886
index = self._read_index_legacy(group, 'index')
877887
values = _read_array(group, 'values')
878888
return Series(values, index=index)
879889

880-
def _read_legacy_frame(self, group, where=None):
890+
def _read_legacy_frame(self, group, where=None, **kwargs):
881891
index = self._read_index_legacy(group, 'index')
882892
columns = self._read_index_legacy(group, 'columns')
883893
values = _read_array(group, 'values')
@@ -1253,11 +1263,13 @@ class Table(object):
12531263
values_axes : a list of the columns which comprise the data of this table
12541264
data_columns : a list of columns that we are allowing indexing (these become single columns in values_axes)
12551265
nan_rep : the string to use for nan representations for string objects
1266+
levels : the names of levels
12561267
12571268
"""
12581269
table_type = None
12591270
obj_type = None
12601271
ndim = None
1272+
levels = 1
12611273

12621274
def __init__(self, parent, group, **kwargs):
12631275
self.parent = parent
@@ -1384,6 +1396,7 @@ def set_attrs(self):
13841396
self.attrs.non_index_axes = self.non_index_axes
13851397
self.attrs.data_columns = self.data_columns
13861398
self.attrs.nan_rep = self.nan_rep
1399+
self.attrs.levels = self.levels
13871400

13881401
def validate_version(self, where = None):
13891402
""" are we trying to operate on an old version? """
@@ -1472,7 +1485,7 @@ def create_index(self, columns = None, optlevel = None, kind = None):
14721485
if not v.is_indexed:
14731486
v.createIndex(**kw)
14741487

1475-
def read_axes(self, where):
1488+
def read_axes(self, where, **kwargs):
14761489
""" create and return the axes sniffed from the table: return boolean for success """
14771490

14781491
# validate the version
@@ -1482,7 +1495,7 @@ def read_axes(self, where):
14821495
if not self.infer_axes(): return False
14831496

14841497
# create the selection
1485-
self.selection = Selection(self, where)
1498+
self.selection = Selection(self, where = where, **kwargs)
14861499
values = self.selection.select()
14871500

14881501
# convert the data
@@ -1502,6 +1515,7 @@ def infer_axes(self):
15021515
self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or []
15031516
self.data_columns = getattr(self.attrs,'data_columns',None) or []
15041517
self.nan_rep = getattr(self.attrs,'nan_rep',None)
1518+
self.levels = getattr(self.attrs,'levels',None) or []
15051519
self.index_axes = [ a.infer(self.table) for a in self.indexables if a.is_an_indexable ]
15061520
self.values_axes = [ a.infer(self.table) for a in self.indexables if not a.is_an_indexable ]
15071521
return True
@@ -1659,10 +1673,11 @@ def reindex(obj, axis, filt, ordered):
16591673

16601674
return obj
16611675

1662-
def create_description(self, compression = None, complevel = None):
1676+
def create_description(self, compression = None, complevel = None, expectedrows = None):
16631677
""" create the description of the table from the axes & values """
16641678

1665-
d = { 'name' : 'table' }
1679+
d = dict( name = 'table',
1680+
expectedrows = expectedrows )
16661681

16671682
# description from the axes & values
16681683
d['description'] = dict([ (a.cname,a.typ) for a in self.axes ])
@@ -1728,11 +1743,11 @@ class LegacyTable(Table):
17281743
def write(self, **kwargs):
17291744
raise Exception("write operations are not allowed on legacy tables!")
17301745

1731-
def read(self, where=None):
1746+
def read(self, where=None, **kwargs):
17321747
""" we have n indexable columns, with an arbitrary number of data axes """
17331748

17341749

1735-
if not self.read_axes(where): return None
1750+
if not self.read_axes(where=where, **kwargs): return None
17361751

17371752
factors = [ Categorical.from_array(a.values) for a in self.index_axes ]
17381753
levels = [ f.levels for f in factors ]
@@ -1828,7 +1843,8 @@ class AppendableTable(LegacyTable):
18281843
table_type = 'appendable'
18291844

18301845
def write(self, axes, obj, append=False, compression=None,
1831-
complevel=None, min_itemsize = None, chunksize = 50000, **kwargs):
1846+
complevel=None, min_itemsize = None, chunksize = 50000,
1847+
expectedrows = None, **kwargs):
18321848

18331849
# create the table if it doesn't exist (or get it if it does)
18341850
if not append:
@@ -1841,7 +1857,7 @@ def write(self, axes, obj, append=False, compression=None,
18411857
if 'table' not in self.group:
18421858

18431859
# create the table
1844-
options = self.create_description(compression = compression, complevel = complevel)
1860+
options = self.create_description(compression = compression, complevel = complevel, expectedrows = expectedrows)
18451861

18461862
# set the table attributes
18471863
self.set_attrs()
@@ -1911,7 +1927,7 @@ def write_data_chunk(self, indexes, mask, search, values):
19111927
import pdb; pdb.set_trace()
19121928
raise Exception("tables cannot write this data -> %s" % str(detail))
19131929

1914-
def delete(self, where = None):
1930+
def delete(self, where = None, **kwargs):
19151931

19161932
# delete all rows (and return the nrows)
19171933
if where is None or not len(where):
@@ -1924,7 +1940,7 @@ def delete(self, where = None):
19241940

19251941
# create the selection
19261942
table = self.table
1927-
self.selection = Selection(self, where)
1943+
self.selection = Selection(self, where, **kwargs)
19281944
values = self.selection.select_coords()
19291945

19301946
# delete the rows in reverse order
@@ -1977,9 +1993,9 @@ def get_object(self, obj):
19771993
obj = obj.T
19781994
return obj
19791995

1980-
def read(self, where=None):
1996+
def read(self, where=None, **kwargs):
19811997

1982-
if not self.read_axes(where): return None
1998+
if not self.read_axes(where=where, **kwargs): return None
19831999

19842000
index = self.index_axes[0].values
19852001
frames = []
@@ -2014,6 +2030,30 @@ def read(self, where=None):
20142030

20152031
return df
20162032

2033+
class AppendableMultiFrameTable(AppendableFrameTable):
2034+
""" a frame with a multi-index """
2035+
table_type = 'appendable_multiframe'
2036+
obj_type = DataFrame
2037+
ndim = 2
2038+
2039+
@property
2040+
def table_type_short(self):
2041+
return 'appendable_multi'
2042+
2043+
def write(self, obj, columns = None, **kwargs):
2044+
if columns is None:
2045+
columns = []
2046+
for n in obj.index.names:
2047+
if n not in columns:
2048+
columns.insert(0,n)
2049+
self.levels = obj.index.names
2050+
return super(AppendableMultiFrameTable, self).write(obj = obj.reset_index(), columns = columns, **kwargs)
2051+
2052+
def read(self, where=None, **kwargs):
2053+
df = super(AppendableMultiFrameTable, self).read(where = where, **kwargs)
2054+
df.set_index(self.levels, inplace=True)
2055+
return df
2056+
20172057
class AppendablePanelTable(AppendableTable):
20182058
""" support the new appendable table formats """
20192059
table_type = 'appendable_panel'
@@ -2038,7 +2078,8 @@ class AppendableNDimTable(AppendablePanelTable):
20382078

20392079
# table maps
20402080
_TABLE_MAP = {
2041-
'appendable_frame' : AppendableFrameTable,
2081+
'appendable_frame' : AppendableFrameTable,
2082+
'appendable_multiframe' : AppendableMultiFrameTable,
20422083
'appendable_panel' : AppendablePanelTable,
20432084
'appendable_ndim' : AppendableNDimTable,
20442085
'worm' : WORMTable,
@@ -2410,11 +2451,14 @@ class Selection(object):
24102451
----------
24112452
table : a Table object
24122453
where : list of Terms (or convertable to)
2454+
start, stop : indices to start and/or stop selection
24132455
24142456
"""
2415-
def __init__(self, table, where=None):
2457+
def __init__(self, table, where=None, start=None, stop=None, **kwargs):
24162458
self.table = table
24172459
self.where = where
2460+
self.start = start
2461+
self.stop = stop
24182462
self.condition = None
24192463
self.filter = None
24202464
self.terms = self.generate(where)
@@ -2448,15 +2492,15 @@ def select(self):
24482492
generate the selection
24492493
"""
24502494
if self.condition is not None:
2451-
return self.table.table.readWhere(self.condition)
2495+
return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop)
24522496
else:
2453-
return self.table.table.read()
2497+
return self.table.table.read(start=self.start,stop=self.stop)
24542498

24552499
def select_coords(self):
24562500
"""
24572501
generate the selection
24582502
"""
2459-
return self.table.table.getWhereList(self.condition, sort = True)
2503+
return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort = True)
24602504

24612505

24622506
def _get_index_factory(klass):

0 commit comments

Comments
 (0)