From 27e8dea3507c5d601e1feb10beb48e34090db8f6 Mon Sep 17 00:00:00 2001
From: jreback
Date: Fri, 25 Jan 2013 14:31:25 -0500
Subject: [PATCH 1/7] BUG: issue in PyTables with too many selectors in a where

---
 RELEASE.rst           | 2 ++
 pandas/io/pytables.py | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index 981fa5bed257d..ae955e204f036 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -22,6 +22,8 @@ Where to get it
 * Binary installers on PyPI: http://pypi.python.org/pypi/pandas
 * Documentation: http://pandas.pydata.org

+  - Fix obscure PyTables error when using too many selectors in a where clause
+
 pandas 0.10.1
 =============

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 78bd204f26993..767565abf9c8c 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2894,6 +2894,7 @@ class Term(object):
     _ops = ['<=', '<', '>=', '>', '!=', '==', '=']
     _search = re.compile("^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" % '|'.join(_ops))
+    _max_selectors = 31

     def __init__(self, field, op=None, value=None, queryables=None):
         self.field = None
@@ -3006,7 +3007,7 @@ def eval(self):
         if self.is_in_table:

             # too many values to create the expression?
-            if len(values) <= 61:
+            if len(values) <= self._max_selectors:
                 self.condition = "(%s)" % ' | '.join(
                     ["(%s == %s)" % (self.field, v[0]) for v in values])


From 1dbe01dff53719310a9ade4ba41799f3bc281878 Mon Sep 17 00:00:00 2001
From: jreback
Date: Tue, 29 Jan 2013 13:21:22 -0500
Subject: [PATCH 2/7] TST: use ensure_clean contextmanager to avoid spewing
 temporary files if testing is interrupted

---
 pandas/io/pytables.py            |   17 +
 pandas/io/tests/test_pytables.py | 2681 +++++++++++++++---------------
 2 files changed, 1395 insertions(+), 1303 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 767565abf9c8c..c18db458ecdf3 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2552,6 +2552,11 @@ def write_data(self, chunksize):

     def write_data_chunk(self, indexes, mask, search, values):

+        # nothing to write if any of the value blocks is zero-length
+        for v in values:
+            if not np.prod(v.shape):
+                return
+
         # get our function
         try:
             func = getattr(lib, "create_hdf_rows_%sd" % self.ndim)
@@ -3139,3 +3144,15 @@ def select_coords(self):
         return self.table.table.getWhereList(self.condition, start=self.start,
                                              stop=self.stop, sort=True)
+
+### utilities ###
+
+def timeit(key, df, fn=None, remove=True, **kwargs):
+    if fn is None:
+        fn = 'timeit.h5'
+    store = HDFStore(fn, mode='w')
+    store.append(key, df, **kwargs)
+    store.close()
+
+    if remove:
+        import os
+        os.remove(fn)

diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 934e088ddc1d3..ff4d4fa788e6b 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -27,236 +27,265 @@

 _multiprocess_can_split_ = False

+# helpers and a context manager to ensure file cleanup
+def safe_remove(path):
+    if path is not None:
+        import os
+        try:
+            os.remove(path)
+        except:
+            pass
+
+def safe_close(store):
+    try:
+        if store is not None:
+            store.close()
+    except:
+        pass
+
+from contextlib import contextmanager
+
+@contextmanager
+def ensure_clean(path, mode='a', complevel=None, complib=None,
+                 fletcher32=False):
+    store = HDFStore(path, mode=mode, complevel=complevel,
+                     complib=complib, fletcher32=fletcher32)
+    try:
+        yield store
+    finally:
+        safe_close(store)
+        if mode == 'w' or mode == 'a':
+            safe_remove(path)
+
+# set these parameters so we don't have file sharing issues between threads
+tables.parameters.MAX_NUMEXPR_THREADS = 1
+tables.parameters.MAX_BLOSC_THREADS = 1
+tables.parameters.MAX_THREADS = 1
+
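The ensure_clean helper above is the heart of this commit: every test now opens its
store through a context manager, so the scratch file is closed and removed even when
an assertion fails mid-test. As a minimal sketch of the same pattern, assuming nothing
beyond the standard library (the cleaned_up name and the plain file handle are
illustrative stand-ins for HDFStore, not part of the patch):

    import os
    from contextlib import contextmanager

    @contextmanager
    def cleaned_up(path, mode='w'):
        # yield an open handle and guarantee close + unlink on exit
        handle = open(path, mode)
        try:
            yield handle
        finally:
            # close before removing: an open file cannot be unlinked
            # on some platforms (e.g. Windows)
            try:
                handle.close()
            except Exception:
                pass
            try:
                os.remove(path)
            except OSError:
                pass

    # the scratch file is gone afterwards even if the body raises
    with cleaned_up('__scratch__.tmp') as fh:
        fh.write('some test data')

Swallowing exceptions in the cleanup path is deliberate: a failure during teardown
should never mask the original test failure.
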
class TestHDFStore(unittest.TestCase): - scratchpath = '__scratch__.h5' def setUp(self): warnings.filterwarnings(action='ignore', category=FutureWarning) self.path = '__%s__.h5' % tm.rands(10) - self.store = HDFStore(self.path) def tearDown(self): - self.store.close() - try: - os.remove(self.path) - except os.error: - pass + pass def test_factory_fun(self): try: - with get_store(self.scratchpath) as tbl: + with get_store(self.path) as tbl: raise ValueError('blah') except ValueError: pass + finally: + safe_remove(self.path) - with get_store(self.scratchpath) as tbl: - tbl['a'] = tm.makeDataFrame() - - with get_store(self.scratchpath) as tbl: - self.assertEquals(len(tbl), 1) - self.assertEquals(type(tbl['a']), DataFrame) - - os.remove(self.scratchpath) + try: + with get_store(self.path) as tbl: + tbl['a'] = tm.makeDataFrame() + + with get_store(self.path) as tbl: + self.assertEquals(len(tbl), 1) + self.assertEquals(type(tbl['a']), DataFrame) + finally: + safe_remove(self.path) def test_keys(self): - self.store['a'] = tm.makeTimeSeries() - self.store['b'] = tm.makeStringSeries() - self.store['c'] = tm.makeDataFrame() - self.store['d'] = tm.makePanel() - self.store['foo/bar'] = tm.makePanel() - self.assertEquals(len(self.store), 5) - self.assert_(set( - self.store.keys()) == set(['/a', '/b', '/c', '/d', '/foo/bar'])) + + with ensure_clean(self.path) as store: + store['a'] = tm.makeTimeSeries() + store['b'] = tm.makeStringSeries() + store['c'] = tm.makeDataFrame() + store['d'] = tm.makePanel() + store['foo/bar'] = tm.makePanel() + self.assertEquals(len(store), 5) + self.assert_(set( + store.keys()) == set(['/a', '/b', '/c', '/d', '/foo/bar'])) def test_repr(self): - repr(self.store) - self.store['a'] = tm.makeTimeSeries() - self.store['b'] = tm.makeStringSeries() - self.store['c'] = tm.makeDataFrame() - self.store['d'] = tm.makePanel() - self.store['foo/bar'] = tm.makePanel() - self.store.append('e', tm.makePanel()) - df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['bool1'] = df['A'] > 0 - df['bool2'] = df['B'] > 0 - df['bool3'] = True - df['int1'] = 1 - df['int2'] = 2 - df['timestamp1'] = Timestamp('20010102') - df['timestamp2'] = Timestamp('20010103') - df['datetime1'] = datetime.datetime(2001,1,2,0,0) - df['datetime2'] = datetime.datetime(2001,1,3,0,0) - df.ix[3:6,['obj1']] = np.nan - df = df.consolidate().convert_objects() - self.store['df'] = df + with ensure_clean(self.path) as store: + repr(store) + store['a'] = tm.makeTimeSeries() + store['b'] = tm.makeStringSeries() + store['c'] = tm.makeDataFrame() + store['d'] = tm.makePanel() + store['foo/bar'] = tm.makePanel() + store.append('e', tm.makePanel()) + + df = tm.makeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['bool1'] = df['A'] > 0 + df['bool2'] = df['B'] > 0 + df['bool3'] = True + df['int1'] = 1 + df['int2'] = 2 + df['timestamp1'] = Timestamp('20010102') + df['timestamp2'] = Timestamp('20010103') + df['datetime1'] = datetime.datetime(2001,1,2,0,0) + df['datetime2'] = datetime.datetime(2001,1,3,0,0) + df.ix[3:6,['obj1']] = np.nan + df = df.consolidate().convert_objects() + store['df'] = df - # make a random group in hdf space - self.store.handle.createGroup(self.store.handle.root,'bah') + # make a random group in hdf space + store.handle.createGroup(store.handle.root,'bah') - repr(self.store) - str(self.store) + repr(store) + str(store) def test_contains(self): - self.store['a'] = tm.makeTimeSeries() - self.store['b'] = tm.makeDataFrame() - self.store['foo/bar'] = tm.makeDataFrame() - 
self.assert_('a' in self.store) - self.assert_('b' in self.store) - self.assert_('c' not in self.store) - self.assert_('foo/bar' in self.store) - self.assert_('/foo/bar' in self.store) - self.assert_('/foo/b' not in self.store) - self.assert_('bar' not in self.store) - - # GH 2694 - warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) - self.store['node())'] = tm.makeDataFrame() - self.assert_('node())' in self.store) - warnings.filterwarnings('always', category=tables.NaturalNameWarning) + + with ensure_clean(self.path) as store: + store['a'] = tm.makeTimeSeries() + store['b'] = tm.makeDataFrame() + store['foo/bar'] = tm.makeDataFrame() + self.assert_('a' in store) + self.assert_('b' in store) + self.assert_('c' not in store) + self.assert_('foo/bar' in store) + self.assert_('/foo/bar' in store) + self.assert_('/foo/b' not in store) + self.assert_('bar' not in store) + + # GH 2694 + warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) + store['node())'] = tm.makeDataFrame() + self.assert_('node())' in store) + warnings.filterwarnings('always', category=tables.NaturalNameWarning) def test_versioning(self): - self.store['a'] = tm.makeTimeSeries() - self.store['b'] = tm.makeDataFrame() - df = tm.makeTimeDataFrame() - self.store.remove('df1') - self.store.append('df1', df[:10]) - self.store.append('df1', df[10:]) - self.assert_(self.store.root.a._v_attrs.pandas_version == '0.10.1') - self.assert_(self.store.root.b._v_attrs.pandas_version == '0.10.1') - self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10.1') - - # write a file and wipe its versioning - self.store.remove('df2') - self.store.append('df2', df) - - # this is an error because its table_type is appendable, but no version - # info - self.store.get_node('df2')._v_attrs.pandas_version = None - self.assertRaises(Exception, self.store.select, 'df2') - - def test_meta(self): - raise nose.SkipTest('no meta') - - meta = {'foo': ['I love pandas ']} - s = tm.makeTimeSeries() - s.meta = meta - self.store['a'] = s - self.assert_(self.store['a'].meta == meta) - df = tm.makeDataFrame() - df.meta = meta - self.store['b'] = df - self.assert_(self.store['b'].meta == meta) - - # this should work, but because slicing doesn't propgate meta it doesn - self.store.remove('df1') - self.store.append('df1', df[:10]) - self.store.append('df1', df[10:]) - results = self.store['df1'] - # self.assert_(getattr(results,'meta',None) == meta) - - # no meta - df = tm.makeDataFrame() - self.store['b'] = df - self.assert_(hasattr(self.store['b'], 'meta') is False) + with ensure_clean(self.path) as store: + store['a'] = tm.makeTimeSeries() + store['b'] = tm.makeDataFrame() + df = tm.makeTimeDataFrame() + store.remove('df1') + store.append('df1', df[:10]) + store.append('df1', df[10:]) + self.assert_(store.root.a._v_attrs.pandas_version == '0.10.1') + self.assert_(store.root.b._v_attrs.pandas_version == '0.10.1') + self.assert_(store.root.df1._v_attrs.pandas_version == '0.10.1') + + # write a file and wipe its versioning + store.remove('df2') + store.append('df2', df) + + # this is an error because its table_type is appendable, but no version + # info + store.get_node('df2')._v_attrs.pandas_version = None + self.assertRaises(Exception, store.select, 'df2') def test_reopen_handle(self): - self.store['a'] = tm.makeTimeSeries() - self.store.open('w', warn=False) - self.assert_(self.store.handle.isopen) - self.assertEquals(len(self.store), 0) + with ensure_clean(self.path) as store: + store['a'] = tm.makeTimeSeries() + 
store.open('w', warn=False) + self.assert_(store.handle.isopen) + self.assertEquals(len(store), 0) + def test_flush(self): - self.store['a'] = tm.makeTimeSeries() - self.store.flush() - def test_get(self): - self.store['a'] = tm.makeTimeSeries() - left = self.store.get('a') - right = self.store['a'] - tm.assert_series_equal(left, right) + with ensure_clean(self.path) as store: + store['a'] = tm.makeTimeSeries() + store.flush() - left = self.store.get('/a') - right = self.store['/a'] - tm.assert_series_equal(left, right) + def test_get(self): - self.assertRaises(KeyError, self.store.get, 'b') + with ensure_clean(self.path) as store: + store['a'] = tm.makeTimeSeries() + left = store.get('a') + right = store['a'] + tm.assert_series_equal(left, right) + + left = store.get('/a') + right = store['/a'] + tm.assert_series_equal(left, right) + + self.assertRaises(KeyError, store.get, 'b') def test_put(self): - ts = tm.makeTimeSeries() - df = tm.makeTimeDataFrame() - self.store['a'] = ts - self.store['b'] = df[:10] - self.store['foo/bar/bah'] = df[:10] - self.store['foo'] = df[:10] - self.store['/foo'] = df[:10] - self.store.put('c', df[:10], table=True) - - # not OK, not a table - self.assertRaises( - ValueError, self.store.put, 'b', df[10:], append=True) - - # node does not currently exist, test _is_table_type returns False in - # this case - #self.store.remove('f') - #self.assertRaises(ValueError, self.store.put, 'f', df[10:], append=True) - # can't put to a table (use append instead) - self.assertRaises(ValueError, self.store.put, 'c', df[10:], append=True) - - # overwrite table - self.store.put('c', df[:10], table=True, append=False) - tm.assert_frame_equal(df[:10], self.store['c']) + with ensure_clean(self.path) as store: + + ts = tm.makeTimeSeries() + df = tm.makeTimeDataFrame() + store['a'] = ts + store['b'] = df[:10] + store['foo/bar/bah'] = df[:10] + store['foo'] = df[:10] + store['/foo'] = df[:10] + store.put('c', df[:10], table=True) + + # not OK, not a table + self.assertRaises( + ValueError, store.put, 'b', df[10:], append=True) + + # node does not currently exist, test _is_table_type returns False in + # this case + # store.remove('f') + # self.assertRaises(ValueError, store.put, 'f', df[10:], append=True) + + # can't put to a table (use append instead) + self.assertRaises(ValueError, store.put, 'c', df[10:], append=True) + + # overwrite table + store.put('c', df[:10], table=True, append=False) + tm.assert_frame_equal(df[:10], store['c']) def test_put_string_index(self): - index = Index( - ["I am a very long string index: %s" % i for i in range(20)]) - s = Series(np.arange(20), index=index) - df = DataFrame({'A': s, 'B': s}) - - self.store['a'] = s - tm.assert_series_equal(self.store['a'], s) - - self.store['b'] = df - tm.assert_frame_equal(self.store['b'], df) - - # mixed length - index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] + ["I am a very long string index: %s" % i for i in range(20)]) - s = Series(np.arange(21), index=index) - df = DataFrame({'A': s, 'B': s}) - self.store['a'] = s - tm.assert_series_equal(self.store['a'], s) - - self.store['b'] = df - tm.assert_frame_equal(self.store['b'], df) + with ensure_clean(self.path) as store: + + index = Index( + ["I am a very long string index: %s" % i for i in range(20)]) + s = Series(np.arange(20), index=index) + df = DataFrame({'A': s, 'B': s}) + + store['a'] = s + tm.assert_series_equal(store['a'], s) + + store['b'] = df + tm.assert_frame_equal(store['b'], df) + + # mixed length + index = 
Index(['abcdefghijklmnopqrstuvwxyz1234567890'] + ["I am a very long string index: %s" % i for i in range(20)]) + s = Series(np.arange(21), index=index) + df = DataFrame({'A': s, 'B': s}) + store['a'] = s + tm.assert_series_equal(store['a'], s) + + store['b'] = df + tm.assert_frame_equal(store['b'], df) def test_put_compression(self): - df = tm.makeTimeDataFrame() - self.store.put('c', df, table=True, complib='zlib') - tm.assert_frame_equal(self.store['c'], df) + with ensure_clean(self.path) as store: + df = tm.makeTimeDataFrame() - # can't compress if table=False - self.assertRaises(ValueError, self.store.put, 'b', df, - table=False, complib='zlib') + store.put('c', df, table=True, complib='zlib') + tm.assert_frame_equal(store['c'], df) + + # can't compress if table=False + self.assertRaises(ValueError, store.put, 'b', df, + table=False, complib='zlib') def test_put_compression_blosc(self): tm.skip_if_no_package('tables', '2.2', app='blosc support') df = tm.makeTimeDataFrame() - # can't compress if table=False - self.assertRaises(ValueError, self.store.put, 'b', df, - table=False, complib='blosc') - - self.store.put('c', df, table=True, complib='blosc') - tm.assert_frame_equal(self.store['c'], df) + with ensure_clean(self.path) as store: + # can't compress if table=False + self.assertRaises(ValueError, store.put, 'b', df, + table=False, complib='blosc') + + store.put('c', df, table=True, complib='blosc') + tm.assert_frame_equal(store['c'], df) + def test_put_integer(self): # non-date, non-string index df = DataFrame(np.random.randn(50, 100)) @@ -277,432 +306,447 @@ def test_put_mixed_type(self): df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) df.ix[3:6, ['obj1']] = np.nan df = df.consolidate().convert_objects() - self.store.remove('df') - warnings.filterwarnings('ignore', category=PerformanceWarning) - self.store.put('df',df) - expected = self.store.get('df') - tm.assert_frame_equal(expected,df) - warnings.filterwarnings('always', category=PerformanceWarning) + with ensure_clean(self.path) as store: + store.remove('df') + warnings.filterwarnings('ignore', category=PerformanceWarning) + store.put('df',df) + expected = store.get('df') + tm.assert_frame_equal(expected,df) + warnings.filterwarnings('always', category=PerformanceWarning) + def test_append(self): - df = tm.makeTimeDataFrame() - self.store.remove('df1') - self.store.append('df1', df[:10]) - self.store.append('df1', df[10:]) - tm.assert_frame_equal(self.store['df1'], df) - - self.store.remove('df2') - self.store.put('df2', df[:10], table=True) - self.store.append('df2', df[10:]) - tm.assert_frame_equal(self.store['df2'], df) - - self.store.remove('df3') - self.store.append('/df3', df[:10]) - self.store.append('/df3', df[10:]) - tm.assert_frame_equal(self.store['df3'], df) - - # this is allowed by almost always don't want to do it - warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) - self.store.remove('/df3 foo') - self.store.append('/df3 foo', df[:10]) - self.store.append('/df3 foo', df[10:]) - tm.assert_frame_equal(self.store['df3 foo'], df) - warnings.filterwarnings('always', category=tables.NaturalNameWarning) - - # panel - wp = tm.makePanel() - self.store.remove('wp1') - self.store.append('wp1', wp.ix[:, :10, :]) - self.store.append('wp1', wp.ix[:, 10:, :]) - tm.assert_panel_equal(self.store['wp1'], wp) - - # ndim - p4d = tm.makePanel4D() - self.store.remove('p4d') - self.store.append('p4d', p4d.ix[:, :, :10, :]) - self.store.append('p4d', p4d.ix[:, :, 10:, :]) - 
tm.assert_panel4d_equal(self.store['p4d'], p4d) - - # test using axis labels - self.store.remove('p4d') - self.store.append('p4d', p4d.ix[:, :, :10, :], axes=[ - 'items', 'major_axis', 'minor_axis']) - self.store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ - 'items', 'major_axis', 'minor_axis']) - tm.assert_panel4d_equal(self.store['p4d'], p4d) - - # test using differnt number of items on each axis - p4d2 = p4d.copy() - p4d2['l4'] = p4d['l1'] - p4d2['l5'] = p4d['l1'] - self.store.remove('p4d2') - self.store.append( - 'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis']) - tm.assert_panel4d_equal(self.store['p4d2'], p4d2) - - # test using differt order of items on the non-index axes - self.store.remove('wp1') - wp_append1 = wp.ix[:, :10, :] - self.store.append('wp1', wp_append1) - wp_append2 = wp.ix[:, 10:, :].reindex(items=wp.items[::-1]) - self.store.append('wp1', wp_append2) - tm.assert_panel_equal(self.store['wp1'], wp) - - # dtype issues - mizxed type in a single object column - df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) - df['mixed_column'] = 'testing' - df.ix[2, 'mixed_column'] = np.nan - self.store.remove('df') - self.store.append('df', df) - tm.assert_frame_equal(self.store['df'], df) + with ensure_clean(self.path) as store: + df = tm.makeTimeDataFrame() + store.remove('df1') + store.append('df1', df[:10]) + store.append('df1', df[10:]) + tm.assert_frame_equal(store['df1'], df) + + store.remove('df2') + store.put('df2', df[:10], table=True) + store.append('df2', df[10:]) + tm.assert_frame_equal(store['df2'], df) + + store.remove('df3') + store.append('/df3', df[:10]) + store.append('/df3', df[10:]) + tm.assert_frame_equal(store['df3'], df) + + # this is allowed by almost always don't want to do it + warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) + store.remove('/df3 foo') + store.append('/df3 foo', df[:10]) + store.append('/df3 foo', df[10:]) + tm.assert_frame_equal(store['df3 foo'], df) + warnings.filterwarnings('always', category=tables.NaturalNameWarning) + + # panel + wp = tm.makePanel() + store.remove('wp1') + store.append('wp1', wp.ix[:, :10, :]) + store.append('wp1', wp.ix[:, 10:, :]) + tm.assert_panel_equal(store['wp1'], wp) + + # ndim + p4d = tm.makePanel4D() + store.remove('p4d') + store.append('p4d', p4d.ix[:, :, :10, :]) + store.append('p4d', p4d.ix[:, :, 10:, :]) + tm.assert_panel4d_equal(store['p4d'], p4d) + + # test using axis labels + store.remove('p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=[ + 'items', 'major_axis', 'minor_axis']) + store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ + 'items', 'major_axis', 'minor_axis']) + tm.assert_panel4d_equal(store['p4d'], p4d) + + # test using differnt number of items on each axis + p4d2 = p4d.copy() + p4d2['l4'] = p4d['l1'] + p4d2['l5'] = p4d['l1'] + store.remove('p4d2') + store.append( + 'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis']) + tm.assert_panel4d_equal(store['p4d2'], p4d2) + + # test using differt order of items on the non-index axes + store.remove('wp1') + wp_append1 = wp.ix[:, :10, :] + store.append('wp1', wp_append1) + wp_append2 = wp.ix[:, 10:, :].reindex(items=wp.items[::-1]) + store.append('wp1', wp_append2) + tm.assert_panel_equal(store['wp1'], wp) + + # dtype issues - mizxed type in a single object column + df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) + df['mixed_column'] = 'testing' + df.ix[2, 'mixed_column'] = np.nan + store.remove('df') + store.append('df', df) + tm.assert_frame_equal(store['df'], df) def 
test_append_frame_column_oriented(self): - # column oriented - df = tm.makeTimeDataFrame() - self.store.remove('df1') - self.store.append('df1', df.ix[:, :2], axes=['columns']) - self.store.append('df1', df.ix[:, 2:]) - tm.assert_frame_equal(self.store['df1'], df) - - result = self.store.select('df1', 'columns=A') - expected = df.reindex(columns=['A']) - tm.assert_frame_equal(expected, result) - - # this isn't supported - self.assertRaises(Exception, self.store.select, 'df1', ( - 'columns=A', Term('index', '>', df.index[4]))) - - # selection on the non-indexable - result = self.store.select( - 'df1', ('columns=A', Term('index', '=', df.index[0:4]))) - expected = df.reindex(columns=['A'], index=df.index[0:4]) - tm.assert_frame_equal(expected, result) + with ensure_clean(self.path) as store: + # column oriented + df = tm.makeTimeDataFrame() + store.remove('df1') + store.append('df1', df.ix[:, :2], axes=['columns']) + store.append('df1', df.ix[:, 2:]) + tm.assert_frame_equal(store['df1'], df) + + result = store.select('df1', 'columns=A') + expected = df.reindex(columns=['A']) + tm.assert_frame_equal(expected, result) + + # this isn't supported + self.assertRaises(Exception, store.select, 'df1', ( + 'columns=A', Term('index', '>', df.index[4]))) + + # selection on the non-indexable + result = store.select( + 'df1', ('columns=A', Term('index', '=', df.index[0:4]))) + expected = df.reindex(columns=['A'], index=df.index[0:4]) + tm.assert_frame_equal(expected, result) def test_ndim_indexables(self): """ test using ndim tables in new ways""" - p4d = tm.makePanel4D() - - def check_indexers(key, indexers): - for i, idx in enumerate(indexers): - self.assert_(getattr(getattr( - self.store.root, key).table.description, idx)._v_pos == i) - - # append then change (will take existing schema) - indexers = ['items', 'major_axis', 'minor_axis'] - - self.store.remove('p4d') - self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - self.store.append('p4d', p4d.ix[:, :, 10:, :]) - tm.assert_panel4d_equal(self.store.select('p4d'), p4d) - check_indexers('p4d', indexers) - - # same as above, but try to append with differnt axes - self.store.remove('p4d') - self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - self.store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ - 'labels', 'items', 'major_axis']) - tm.assert_panel4d_equal(self.store.select('p4d'), p4d) - check_indexers('p4d', indexers) - - # pass incorrect number of axes - self.store.remove('p4d') - self.assertRaises(Exception, self.store.append, 'p4d', p4d.ix[ - :, :, :10, :], axes=['major_axis', 'minor_axis']) - - # different than default indexables #1 - indexers = ['labels', 'major_axis', 'minor_axis'] - self.store.remove('p4d') - self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - self.store.append('p4d', p4d.ix[:, :, 10:, :]) - tm.assert_panel4d_equal(self.store['p4d'], p4d) - check_indexers('p4d', indexers) - - # different than default indexables #2 - indexers = ['major_axis', 'labels', 'minor_axis'] - self.store.remove('p4d') - self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - self.store.append('p4d', p4d.ix[:, :, 10:, :]) - tm.assert_panel4d_equal(self.store['p4d'], p4d) - check_indexers('p4d', indexers) - - # partial selection - result = self.store.select('p4d', ['labels=l1']) - expected = p4d.reindex(labels=['l1']) - tm.assert_panel4d_equal(result, expected) - - # partial selection2 - result = self.store.select('p4d', [Term( - 'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) - expected = p4d.reindex( - 
labels=['l1'], items=['ItemA'], minor_axis=['B']) - tm.assert_panel4d_equal(result, expected) - - # non-existant partial selection - result = self.store.select('p4d', [Term( - 'labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) - expected = p4d.reindex(labels=['l1'], items=[], minor_axis=['B']) - tm.assert_panel4d_equal(result, expected) + with ensure_clean(self.path) as store: + + p4d = tm.makePanel4D() + + def check_indexers(key, indexers): + for i, idx in enumerate(indexers): + self.assert_(getattr(getattr( + store.root, key).table.description, idx)._v_pos == i) + + # append then change (will take existing schema) + indexers = ['items', 'major_axis', 'minor_axis'] + + store.remove('p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + tm.assert_panel4d_equal(store.select('p4d'), p4d) + check_indexers('p4d', indexers) + + # same as above, but try to append with differnt axes + store.remove('p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ + 'labels', 'items', 'major_axis']) + tm.assert_panel4d_equal(store.select('p4d'), p4d) + check_indexers('p4d', indexers) + + # pass incorrect number of axes + store.remove('p4d') + self.assertRaises(Exception, store.append, 'p4d', p4d.ix[ + :, :, :10, :], axes=['major_axis', 'minor_axis']) + + # different than default indexables #1 + indexers = ['labels', 'major_axis', 'minor_axis'] + store.remove('p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + tm.assert_panel4d_equal(store['p4d'], p4d) + check_indexers('p4d', indexers) + + # different than default indexables #2 + indexers = ['major_axis', 'labels', 'minor_axis'] + store.remove('p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + tm.assert_panel4d_equal(store['p4d'], p4d) + check_indexers('p4d', indexers) + + # partial selection + result = store.select('p4d', ['labels=l1']) + expected = p4d.reindex(labels=['l1']) + tm.assert_panel4d_equal(result, expected) + + # partial selection2 + result = store.select('p4d', [Term( + 'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) + expected = p4d.reindex( + labels=['l1'], items=['ItemA'], minor_axis=['B']) + tm.assert_panel4d_equal(result, expected) + + # non-existant partial selection + result = store.select('p4d', [Term( + 'labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) + expected = p4d.reindex(labels=['l1'], items=[], minor_axis=['B']) + tm.assert_panel4d_equal(result, expected) def test_append_with_strings(self): - wp = tm.makePanel() - wp2 = wp.rename_axis( - dict([(x, "%s_extra" % x) for x in wp.minor_axis]), axis=2) - - def check_col(key,name,size): - self.assert_(getattr(self.store.get_storer(key).table.description,name).itemsize == size) - - self.store.append('s1', wp, min_itemsize=20) - self.store.append('s1', wp2) - expected = concat([wp, wp2], axis=2) - expected = expected.reindex(minor_axis=sorted(expected.minor_axis)) - tm.assert_panel_equal(self.store['s1'], expected) - check_col('s1', 'minor_axis', 20) - - # test dict format - self.store.append('s2', wp, min_itemsize={'minor_axis': 20}) - self.store.append('s2', wp2) - expected = concat([wp, wp2], axis=2) - expected = expected.reindex(minor_axis=sorted(expected.minor_axis)) - tm.assert_panel_equal(self.store['s2'], expected) - check_col('s2', 'minor_axis', 20) - - # apply the wrong field (similar to #1) - self.store.append('s3', 
wp, min_itemsize={'major_axis': 20}) - self.assertRaises(Exception, self.store.append, 's3') - - # test truncation of bigger strings - self.store.append('s4', wp) - self.assertRaises(Exception, self.store.append, 's4', wp2) - - # avoid truncation on elements - df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) - self.store.append('df_big', df) - tm.assert_frame_equal(self.store.select('df_big'), df) - check_col('df_big', 'values_block_1', 15) - - # appending smaller string ok - df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']]) - self.store.append('df_big', df2) - expected = concat([df, df2]) - tm.assert_frame_equal(self.store.select('df_big'), expected) - check_col('df_big', 'values_block_1', 15) - - # avoid truncation on elements - df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) - self.store.append('df_big2', df, min_itemsize={'values': 50}) - tm.assert_frame_equal(self.store.select('df_big2'), df) - check_col('df_big2', 'values_block_1', 50) - - # bigger string on next append - self.store.append('df_new', df) - df_new = DataFrame( - [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) - self.assertRaises(Exception, self.store.append, 'df_new', df_new) - - # with nans - self.store.remove('df') - df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df.ix[1:4, 'string'] = np.nan - df['string2'] = 'bar' - df.ix[4:8, 'string2'] = np.nan - df['string3'] = 'bah' - df.ix[1:, 'string3'] = np.nan - self.store.append('df', df) - result = self.store.select('df') - tm.assert_frame_equal(result, df) - def test_append_with_data_columns(self): + with ensure_clean(self.path) as store: + wp = tm.makePanel() + wp2 = wp.rename_axis( + dict([(x, "%s_extra" % x) for x in wp.minor_axis]), axis=2) + + def check_col(key,name,size): + self.assert_(getattr(store.get_storer(key).table.description,name).itemsize == size) + + store.append('s1', wp, min_itemsize=20) + store.append('s1', wp2) + expected = concat([wp, wp2], axis=2) + expected = expected.reindex(minor_axis=sorted(expected.minor_axis)) + tm.assert_panel_equal(store['s1'], expected) + check_col('s1', 'minor_axis', 20) + + # test dict format + store.append('s2', wp, min_itemsize={'minor_axis': 20}) + store.append('s2', wp2) + expected = concat([wp, wp2], axis=2) + expected = expected.reindex(minor_axis=sorted(expected.minor_axis)) + tm.assert_panel_equal(store['s2'], expected) + check_col('s2', 'minor_axis', 20) + + # apply the wrong field (similar to #1) + store.append('s3', wp, min_itemsize={'major_axis': 20}) + self.assertRaises(Exception, store.append, 's3') + + # test truncation of bigger strings + store.append('s4', wp) + self.assertRaises(Exception, store.append, 's4', wp2) + + # avoid truncation on elements + df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) + store.append('df_big', df) + tm.assert_frame_equal(store.select('df_big'), df) + check_col('df_big', 'values_block_1', 15) + + # appending smaller string ok + df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']]) + store.append('df_big', df2) + expected = concat([df, df2]) + tm.assert_frame_equal(store.select('df_big'), expected) + check_col('df_big', 'values_block_1', 15) + + # avoid truncation on elements + df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) + store.append('df_big2', df, min_itemsize={'values': 50}) + tm.assert_frame_equal(store.select('df_big2'), df) + check_col('df_big2', 'values_block_1', 50) + + # bigger string on next append + store.append('df_new', df) + df_new = DataFrame( + [[124, 'abcdefqhij'], [346, 
'abcdefghijklmnopqrtsuvwxyz']]) + self.assertRaises(Exception, store.append, 'df_new', df_new) + + # with nans + store.remove('df') + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.ix[1:4, 'string'] = np.nan + df['string2'] = 'bar' + df.ix[4:8, 'string2'] = np.nan + df['string3'] = 'bah' + df.ix[1:, 'string3'] = np.nan + store.append('df', df) + result = store.select('df') + tm.assert_frame_equal(result, df) - df = tm.makeTimeDataFrame() - self.store.remove('df') - self.store.append('df', df[:2], data_columns=['B']) - self.store.append('df', df[2:]) - tm.assert_frame_equal(self.store['df'], df) - - # check that we have indicies created - assert(self.store.handle.root.df.table.cols.index.is_indexed is True) - assert(self.store.handle.root.df.table.cols.B.is_indexed is True) - - # data column searching - result = self.store.select('df', [Term('B>0')]) - expected = df[df.B > 0] - tm.assert_frame_equal(result, expected) - - # data column searching (with an indexable and a data_columns) - result = self.store.select( - 'df', [Term('B>0'), Term('index', '>', df.index[3])]) - df_new = df.reindex(index=df.index[4:]) - expected = df_new[df_new.B > 0] - tm.assert_frame_equal(result, expected) - - # data column selection with a string data_column - df_new = df.copy() - df_new['string'] = 'foo' - df_new['string'][1:4] = np.nan - df_new['string'][5:6] = 'bar' - self.store.remove('df') - self.store.append('df', df_new, data_columns=['string']) - result = self.store.select('df', [Term('string', '=', 'foo')]) - expected = df_new[df_new.string == 'foo'] - tm.assert_frame_equal(result, expected) - - # using min_itemsize and a data column - def check_col(key,name,size): - self.assert_(getattr(self.store.get_storer(key).table.description,name).itemsize == size) - - self.store.remove('df') - self.store.append('df', df_new, data_columns=['string'], - min_itemsize={'string': 30}) - check_col('df', 'string', 30) - self.store.remove('df') - self.store.append( - 'df', df_new, data_columns=['string'], min_itemsize=30) - check_col('df', 'string', 30) - self.store.remove('df') - self.store.append('df', df_new, data_columns=['string'], - min_itemsize={'values': 30}) - check_col('df', 'string', 30) - - df_new['string2'] = 'foobarbah' - df_new['string_block1'] = 'foobarbah1' - df_new['string_block2'] = 'foobarbah2' - self.store.remove('df') - self.store.append('df', df_new, data_columns=['string', 'string2'], min_itemsize={'string': 30, 'string2': 40, 'values': 50}) - check_col('df', 'string', 30) - check_col('df', 'string2', 40) - check_col('df', 'values_block_1', 50) - - # multiple data columns - df_new = df.copy() - df_new['string'] = 'foo' - df_new['string'][1:4] = np.nan - df_new['string'][5:6] = 'bar' - df_new['string2'] = 'foo' - df_new['string2'][2:5] = np.nan - df_new['string2'][7:8] = 'bar' - self.store.remove('df') - self.store.append( - 'df', df_new, data_columns=['A', 'B', 'string', 'string2']) - result = self.store.select('df', [Term('string', '=', 'foo'), Term( - 'string2=foo'), Term('A>0'), Term('B<0')]) - expected = df_new[(df_new.string == 'foo') & ( - df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] - tm.assert_frame_equal(result, expected) - - # yield an empty frame - result = self.store.select('df', [Term('string', '=', 'foo'), Term( - 'string2=bar'), Term('A>0'), Term('B<0')]) - expected = df_new[(df_new.string == 'foo') & ( - df_new.string2 == 'bar') & (df_new.A > 0) & (df_new.B < 0)] - tm.assert_frame_equal(result, expected) - - # doc example - df_dc = df.copy() - 
df_dc['string'] = 'foo' - df_dc.ix[4:6, 'string'] = np.nan - df_dc.ix[7:9, 'string'] = 'bar' - df_dc['string2'] = 'cool' - df_dc['datetime'] = Timestamp('20010102') - df_dc = df_dc.convert_objects() - df_dc.ix[3:5, ['A', 'B', 'datetime']] = np.nan - - self.store.remove('df_dc') - self.store.append('df_dc', df_dc, data_columns=['B', 'C', - 'string', 'string2', 'datetime']) - result = self.store.select('df_dc', [Term('B>0')]) - - expected = df_dc[df_dc.B > 0] - tm.assert_frame_equal(result, expected) - - result = self.store.select( - 'df_dc', ['B > 0', 'C > 0', 'string == foo']) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & ( - df_dc.string == 'foo')] - tm.assert_frame_equal(result, expected) + def test_append_with_data_columns(self): + with ensure_clean(self.path) as store: + df = tm.makeTimeDataFrame() + store.remove('df') + store.append('df', df[:2], data_columns=['B']) + store.append('df', df[2:]) + tm.assert_frame_equal(store['df'], df) + + # check that we have indicies created + assert(store.handle.root.df.table.cols.index.is_indexed is True) + assert(store.handle.root.df.table.cols.B.is_indexed is True) + + # data column searching + result = store.select('df', [Term('B>0')]) + expected = df[df.B > 0] + tm.assert_frame_equal(result, expected) + + # data column searching (with an indexable and a data_columns) + result = store.select( + 'df', [Term('B>0'), Term('index', '>', df.index[3])]) + df_new = df.reindex(index=df.index[4:]) + expected = df_new[df_new.B > 0] + tm.assert_frame_equal(result, expected) + + # data column selection with a string data_column + df_new = df.copy() + df_new['string'] = 'foo' + df_new['string'][1:4] = np.nan + df_new['string'][5:6] = 'bar' + store.remove('df') + store.append('df', df_new, data_columns=['string']) + result = store.select('df', [Term('string', '=', 'foo')]) + expected = df_new[df_new.string == 'foo'] + tm.assert_frame_equal(result, expected) + + # using min_itemsize and a data column + def check_col(key,name,size): + self.assert_(getattr(store.get_storer(key).table.description,name).itemsize == size) + + with ensure_clean(self.path) as store: + store.remove('df') + store.append('df', df_new, data_columns=['string'], + min_itemsize={'string': 30}) + check_col('df', 'string', 30) + store.remove('df') + store.append( + 'df', df_new, data_columns=['string'], min_itemsize=30) + check_col('df', 'string', 30) + store.remove('df') + store.append('df', df_new, data_columns=['string'], + min_itemsize={'values': 30}) + check_col('df', 'string', 30) + + with ensure_clean(self.path) as store: + df_new['string2'] = 'foobarbah' + df_new['string_block1'] = 'foobarbah1' + df_new['string_block2'] = 'foobarbah2' + store.remove('df') + store.append('df', df_new, data_columns=['string', 'string2'], min_itemsize={'string': 30, 'string2': 40, 'values': 50}) + check_col('df', 'string', 30) + check_col('df', 'string2', 40) + check_col('df', 'values_block_1', 50) + + with ensure_clean(self.path) as store: + # multiple data columns + df_new = df.copy() + df_new['string'] = 'foo' + df_new['string'][1:4] = np.nan + df_new['string'][5:6] = 'bar' + df_new['string2'] = 'foo' + df_new['string2'][2:5] = np.nan + df_new['string2'][7:8] = 'bar' + store.remove('df') + store.append( + 'df', df_new, data_columns=['A', 'B', 'string', 'string2']) + result = store.select('df', [Term('string', '=', 'foo'), Term( + 'string2=foo'), Term('A>0'), Term('B<0')]) + expected = df_new[(df_new.string == 'foo') & ( + df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] + 
tm.assert_frame_equal(result, expected) + + # yield an empty frame + result = store.select('df', [Term('string', '=', 'foo'), Term( + 'string2=bar'), Term('A>0'), Term('B<0')]) + expected = df_new[(df_new.string == 'foo') & ( + df_new.string2 == 'bar') & (df_new.A > 0) & (df_new.B < 0)] + tm.assert_frame_equal(result, expected) + + with ensure_clean(self.path) as store: + # doc example + df_dc = df.copy() + df_dc['string'] = 'foo' + df_dc.ix[4:6, 'string'] = np.nan + df_dc.ix[7:9, 'string'] = 'bar' + df_dc['string2'] = 'cool' + df_dc['datetime'] = Timestamp('20010102') + df_dc = df_dc.convert_objects() + df_dc.ix[3:5, ['A', 'B', 'datetime']] = np.nan + + store.remove('df_dc') + store.append('df_dc', df_dc, data_columns=['B', 'C', + 'string', 'string2', 'datetime']) + result = store.select('df_dc', [Term('B>0')]) + + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected) + + result = store.select( + 'df_dc', ['B > 0', 'C > 0', 'string == foo']) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & ( + df_dc.string == 'foo')] + tm.assert_frame_equal(result, expected) + def test_create_table_index(self): - - def col(t,column): - return getattr(self.store.get_storer(t).table.cols,column) - - # index=False - wp = tm.makePanel() - self.store.append('p5', wp, index=False) - self.store.create_table_index('p5', columns=['major_axis']) - assert(col('p5', 'major_axis').is_indexed is True) - assert(col('p5', 'minor_axis').is_indexed is False) - - # index=True - self.store.append('p5i', wp, index=True) - assert(col('p5i', 'major_axis').is_indexed is True) - assert(col('p5i', 'minor_axis').is_indexed is True) - - # default optlevels - self.store.get_storer('p5').create_index() - assert(col('p5', 'major_axis').index.optlevel == 6) - assert(col('p5', 'minor_axis').index.kind == 'medium') - - # let's change the indexing scheme - self.store.create_table_index('p5') - assert(col('p5', 'major_axis').index.optlevel == 6) - assert(col('p5', 'minor_axis').index.kind == 'medium') - self.store.create_table_index('p5', optlevel=9) - assert(col('p5', 'major_axis').index.optlevel == 9) - assert(col('p5', 'minor_axis').index.kind == 'medium') - self.store.create_table_index('p5', kind='full') - assert(col('p5', 'major_axis').index.optlevel == 9) - assert(col('p5', 'minor_axis').index.kind == 'full') - self.store.create_table_index('p5', optlevel=1, kind='light') - assert(col('p5', 'major_axis').index.optlevel == 1) - assert(col('p5', 'minor_axis').index.kind == 'light') - - # data columns - df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df['string2'] = 'bar' - self.store.append('f', df, data_columns=['string', 'string2']) - assert(col('f', 'index').is_indexed is True) - assert(col('f', 'string').is_indexed is True) - assert(col('f', 'string2').is_indexed is True) - - # specify index=columns - self.store.append( - 'f2', df, index=['string'], data_columns=['string', 'string2']) - assert(col('f2', 'index').is_indexed is False) - assert(col('f2', 'string').is_indexed is True) - assert(col('f2', 'string2').is_indexed is False) - - # try to index a non-table - self.store.remove('f2') - self.store.put('f2', df) - self.assertRaises(Exception, self.store.create_table_index, 'f2') - - # try to change the version supports flag - from pandas.io import pytables - pytables._table_supports_index = False - self.assertRaises(Exception, self.store.create_table_index, 'f') - - # test out some versions - original = tables.__version__ - - for v in ['2.2', '2.2b']: - pytables._table_mod = None - 
pytables._table_supports_index = False - tables.__version__ = v - self.assertRaises(Exception, self.store.create_table_index, 'f') - - for v in ['2.3.1', '2.3.1b', '2.4dev', '2.4', original]: - pytables._table_mod = None + + with ensure_clean(self.path) as store: + + def col(t,column): + return getattr(store.get_storer(t).table.cols,column) + + # index=False + wp = tm.makePanel() + store.append('p5', wp, index=False) + store.create_table_index('p5', columns=['major_axis']) + assert(col('p5', 'major_axis').is_indexed is True) + assert(col('p5', 'minor_axis').is_indexed is False) + + # index=True + store.append('p5i', wp, index=True) + assert(col('p5i', 'major_axis').is_indexed is True) + assert(col('p5i', 'minor_axis').is_indexed is True) + + # default optlevels + store.get_storer('p5').create_index() + assert(col('p5', 'major_axis').index.optlevel == 6) + assert(col('p5', 'minor_axis').index.kind == 'medium') + + # let's change the indexing scheme + store.create_table_index('p5') + assert(col('p5', 'major_axis').index.optlevel == 6) + assert(col('p5', 'minor_axis').index.kind == 'medium') + store.create_table_index('p5', optlevel=9) + assert(col('p5', 'major_axis').index.optlevel == 9) + assert(col('p5', 'minor_axis').index.kind == 'medium') + store.create_table_index('p5', kind='full') + assert(col('p5', 'major_axis').index.optlevel == 9) + assert(col('p5', 'minor_axis').index.kind == 'full') + store.create_table_index('p5', optlevel=1, kind='light') + assert(col('p5', 'major_axis').index.optlevel == 1) + assert(col('p5', 'minor_axis').index.kind == 'light') + + # data columns + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df['string2'] = 'bar' + store.append('f', df, data_columns=['string', 'string2']) + assert(col('f', 'index').is_indexed is True) + assert(col('f', 'string').is_indexed is True) + assert(col('f', 'string2').is_indexed is True) + + # specify index=columns + store.append( + 'f2', df, index=['string'], data_columns=['string', 'string2']) + assert(col('f2', 'index').is_indexed is False) + assert(col('f2', 'string').is_indexed is True) + assert(col('f2', 'string2').is_indexed is False) + + # try to index a non-table + store.remove('f2') + store.put('f2', df) + self.assertRaises(Exception, store.create_table_index, 'f2') + + # try to change the version supports flag + from pandas.io import pytables pytables._table_supports_index = False - tables.__version__ = v - self.store.create_table_index('f') - pytables._table_mod = None - pytables._table_supports_index = False - tables.__version__ = original + self.assertRaises(Exception, store.create_table_index, 'f') + + # test out some versions + original = tables.__version__ + + for v in ['2.2', '2.2b']: + pytables._table_mod = None + pytables._table_supports_index = False + tables.__version__ = v + self.assertRaises(Exception, store.create_table_index, 'f') + + for v in ['2.3.1', '2.3.1b', '2.4dev', '2.4', original]: + pytables._table_mod = None + pytables._table_supports_index = False + tables.__version__ = v + store.create_table_index('f') + pytables._table_mod = None + pytables._table_supports_index = False + tables.__version__ = original def test_big_table_frame(self): raise nose.SkipTest('no big table frame') @@ -715,14 +759,10 @@ def test_big_table_frame(self): import time x = time.time() - try: - store = HDFStore(self.scratchpath) + with ensure_clean(self.path,mode='w') as store: store.append('df', df) rows = store.root.df.table.nrows recons = store.select('df') - finally: - store.close() - os.remove(self.scratchpath) 
print "\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x) @@ -743,25 +783,18 @@ def test_big_table2_frame(self): df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0) print "\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time) - fn = 'big_table2.h5' - - try: - def f(chunksize): - store = HDFStore(fn, mode='w') + def f(chunksize): + with ensure_clean(self.path,mode='w') as store: store.append('df', df, chunksize=chunksize) r = store.root.df.table.nrows - store.close() return r - for c in [10000, 50000, 250000]: - start_time = time.time() - print "big_table2 frame [chunk->%s]" % c - rows = f(c) - print "big_table2 frame [rows->%s,chunk->%s] -> %5.2f" % (rows, c, time.time() - start_time) - - finally: - os.remove(fn) + for c in [10000, 50000, 250000]: + start_time = time.time() + print "big_table2 frame [chunk->%s]" % c + rows = f(c) + print "big_table2 frame [rows->%s,chunk->%s] -> %5.2f" % (rows, c, time.time() - start_time) def test_big_put_frame(self): raise nose.SkipTest('no big put frame') @@ -777,21 +810,15 @@ def test_big_put_frame(self): df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0) print "\nbig_put frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time) - fn = 'big_put.h5' - - try: + with ensure_clean(self.path, mode='w') as store: start_time = time.time() store = HDFStore(fn, mode='w') store.put('df', df) - store.close() print df.get_dtype_counts() print "big_put frame [shape->%s] -> %5.2f" % (df.shape, time.time() - start_time) - finally: - os.remove(fn) - def test_big_table_panel(self): raise nose.SkipTest('no big table panel') @@ -807,27 +834,25 @@ def test_big_table_panel(self): import time x = time.time() - try: - store = HDFStore(self.scratchpath) + + + with ensure_clean(self.path, mode='w') as store: store.append('wp', wp) rows = store.root.wp.table.nrows recons = store.select('wp') - finally: - store.close() - os.remove(self.scratchpath) print "\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x) def test_append_diff_item_order(self): - raise nose.SkipTest('append diff item order') wp = tm.makePanel() wp1 = wp.ix[:, :10, :] wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :] - - self.store.put('panel', wp1, table=True) - self.assertRaises(Exception, self.store.put, 'panel', wp2, - append=True) + + with ensure_clean(self.path) as store: + store.put('panel', wp1, table=True) + self.assertRaises(Exception, store.put, 'panel', wp2, + append=True) def test_append_hierarchical(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], @@ -838,75 +863,81 @@ def test_append_hierarchical(self): df = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) - self.store.append('mi', df) - result = self.store.select('mi') - tm.assert_frame_equal(result, df) + with ensure_clean(self.path) as store: + store.append('mi', df) + result = store.select('mi') + tm.assert_frame_equal(result, df) def test_append_misc(self): - # unsuported data types for non-tables - p4d = tm.makePanel4D() - self.assertRaises(Exception, self.store.put,'p4d',p4d) + with ensure_clean(self.path) as store: - # unsupported data type for table - s = tm.makeStringSeries() - self.assertRaises(Exception, self.store.append,'s',s) + # unsuported data types for non-tables + p4d = tm.makePanel4D() + self.assertRaises(Exception, store.put,'p4d',p4d) - # unsuported data types - self.assertRaises(Exception, self.store.put,'abc',None) - self.assertRaises(Exception, self.store.put,'abc','123') - 
self.assertRaises(Exception, self.store.put,'abc',123) - self.assertRaises(Exception, self.store.put,'abc',np.arange(5)) + # unsupported data type for table + s = tm.makeStringSeries() + self.assertRaises(Exception, store.append,'s',s) - df = tm.makeDataFrame() - self.store.append('df', df, chunksize=1) - result = self.store.select('df') - tm.assert_frame_equal(result, df) + # unsuported data types + self.assertRaises(Exception, store.put,'abc',None) + self.assertRaises(Exception, store.put,'abc','123') + self.assertRaises(Exception, store.put,'abc',123) + self.assertRaises(Exception, store.put,'abc',np.arange(5)) + + df = tm.makeDataFrame() + store.append('df', df, chunksize=1) + result = store.select('df') + tm.assert_frame_equal(result, df) - self.store.append('df1', df, expectedrows=10) - result = self.store.select('df1') - tm.assert_frame_equal(result, df) + store.append('df1', df, expectedrows=10) + result = store.select('df1') + tm.assert_frame_equal(result, df) def test_table_index_incompatible_dtypes(self): df1 = DataFrame({'a': [1, 2, 3]}) df2 = DataFrame({'a': [4, 5, 6]}, index=date_range('1/1/2000', periods=3)) - self.store.put('frame', df1, table=True) - self.assertRaises(Exception, self.store.put, 'frame', df2, - table=True, append=True) + with ensure_clean(self.path) as store: + store.put('frame', df1, table=True) + self.assertRaises(Exception, store.put, 'frame', df2, + table=True, append=True) def test_table_values_dtypes_roundtrip(self): - df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8') - self.store.append('df_f8', df1) - assert df1.dtypes == self.store['df_f8'].dtypes - - df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8') - self.store.append('df_i8', df2) - assert df2.dtypes == self.store['df_i8'].dtypes - - # incompatible dtype - self.assertRaises(Exception, self.store.append, 'df_i8', df1) - - # check creation/storage/retrieval of float32 (a bit hacky to actually create them thought) - df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['A']) - self.store.append('df_f4', df1) - assert df1.dtypes == self.store['df_f4'].dtypes - assert df1.dtypes[0] == 'float32' - - # check with mixed dtypes (but not multi float types) - df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['float32']) - df1['string'] = 'foo' - self.store.append('df_mixed_dtypes1', df1) - assert (df1.dtypes == self.store['df_mixed_dtypes1'].dtypes).all() == True - assert df1.dtypes[0] == 'float32' - assert df1.dtypes[1] == 'object' - - ### this is not supported, e.g. 
mixed float32/float64 blocks ### - #df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['float32']) - #df1['float64'] = 1.0 - #self.store.append('df_mixed_dtypes2', df1) - #assert df1.dtypes == self.store['df_mixed_dtypes2'].dtypes).all() == True + + with ensure_clean(self.path) as store: + df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8') + store.append('df_f8', df1) + assert df1.dtypes == store['df_f8'].dtypes + + df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8') + store.append('df_i8', df2) + assert df2.dtypes == store['df_i8'].dtypes + + # incompatible dtype + self.assertRaises(Exception, store.append, 'df_i8', df1) + + # check creation/storage/retrieval of float32 (a bit hacky to actually create them thought) + df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['A']) + store.append('df_f4', df1) + assert df1.dtypes == store['df_f4'].dtypes + assert df1.dtypes[0] == 'float32' + + # check with mixed dtypes (but not multi float types) + df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['float32']) + df1['string'] = 'foo' + store.append('df_mixed_dtypes1', df1) + assert (df1.dtypes == store['df_mixed_dtypes1'].dtypes).all() == True + assert df1.dtypes[0] == 'float32' + assert df1.dtypes[1] == 'object' + + ### this is not supported, e.g. mixed float32/float64 blocks ### + #df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['float32']) + #df1['float64'] = 1.0 + #store.append('df_mixed_dtypes2', df1) + #assert df1.dtypes == store['df_mixed_dtypes2'].dtypes).all() == True def test_table_mixed_dtypes(self): @@ -926,8 +957,9 @@ def test_table_mixed_dtypes(self): df.ix[3:6, ['obj1']] = np.nan df = df.consolidate().convert_objects() - self.store.append('df1_mixed', df) - tm.assert_frame_equal(self.store.select('df1_mixed'), df) + with ensure_clean(self.path) as store: + store.append('df1_mixed', df) + tm.assert_frame_equal(store.select('df1_mixed'), df) # panel wp = tm.makePanel() @@ -939,8 +971,9 @@ def test_table_mixed_dtypes(self): wp['int2'] = 2 wp = wp.consolidate() - self.store.append('p1_mixed', wp) - tm.assert_panel_equal(self.store.select('p1_mixed'), wp) + with ensure_clean(self.path) as store: + store.append('p1_mixed', wp) + tm.assert_panel_equal(store.select('p1_mixed'), wp) # ndim wp = tm.makePanel4D() @@ -952,16 +985,20 @@ def test_table_mixed_dtypes(self): wp['int2'] = 2 wp = wp.consolidate() - self.store.append('p4d_mixed', wp) - tm.assert_panel4d_equal(self.store.select('p4d_mixed'), wp) + with ensure_clean(self.path) as store: + store.append('p4d_mixed', wp) + tm.assert_panel4d_equal(store.select('p4d_mixed'), wp) def test_unimplemented_dtypes_table_columns(self): - #### currently not supported dtypes #### - for n, f in [('unicode', u'\u03c3'), ('date', datetime.date(2001, 1, 2))]: - df = tm.makeDataFrame() - df[n] = f - self.assertRaises( - NotImplementedError, self.store.append, 'df1_%s' % n, df) + + with ensure_clean(self.path) as store: + + ### currently not supported dtypes #### + for n, f in [('unicode', u'\u03c3'), ('date', datetime.date(2001, 1, 2))]: + df = tm.makeDataFrame() + df[n] = f + self.assertRaises( + NotImplementedError, store.append, 'df1_%s' % n, df) # frame df = tm.makeDataFrame() @@ -970,271 +1007,288 @@ def test_unimplemented_dtypes_table_columns(self): df['datetime1'] = datetime.date(2001, 1, 2) df = df.consolidate().convert_objects() - # this fails because we have a date in the object block...... 
- self.assertRaises(Exception, self.store.append, 'df_unimplemented', df) + with ensure_clean(self.path) as store: + # this fails because we have a date in the object block...... + self.assertRaises(Exception, store.append, 'df_unimplemented', df) def test_remove(self): - ts = tm.makeTimeSeries() - df = tm.makeDataFrame() - self.store['a'] = ts - self.store['b'] = df - self.store.remove('a') - self.assertEquals(len(self.store), 1) - tm.assert_frame_equal(df, self.store['b']) - - self.store.remove('b') - self.assertEquals(len(self.store), 0) - - # pathing - self.store['a'] = ts - self.store['b/foo'] = df - self.store.remove('foo') - self.store.remove('b/foo') - self.assertEquals(len(self.store), 1) - - self.store['a'] = ts - self.store['b/foo'] = df - self.store.remove('b') - self.assertEquals(len(self.store), 1) - - # __delitem__ - self.store['a'] = ts - self.store['b'] = df - del self.store['a'] - del self.store['b'] - self.assertEquals(len(self.store), 0) - def test_remove_where(self): + with ensure_clean(self.path) as store: - # non-existance - crit1 = Term('index', '>', 'foo') - self.store.remove('a', where=[crit1]) + ts = tm.makeTimeSeries() + df = tm.makeDataFrame() + store['a'] = ts + store['b'] = df + store.remove('a') + self.assertEquals(len(store), 1) + tm.assert_frame_equal(df, store['b']) + + store.remove('b') + self.assertEquals(len(store), 0) + + # pathing + store['a'] = ts + store['b/foo'] = df + store.remove('foo') + store.remove('b/foo') + self.assertEquals(len(store), 1) + + store['a'] = ts + store['b/foo'] = df + store.remove('b') + self.assertEquals(len(store), 1) + + # __delitem__ + store['a'] = ts + store['b'] = df + del store['a'] + del store['b'] + self.assertEquals(len(store), 0) - # try to remove non-table (with crit) - # non-table ok (where = None) - wp = tm.makePanel() - self.store.put('wp', wp, table=True) - self.store.remove('wp', [('minor_axis', ['A', 'D'])]) - rs = self.store.select('wp') - expected = wp.reindex(minor_axis=['B', 'C']) - tm.assert_panel_equal(rs, expected) - - # empty where - self.store.remove('wp') - self.store.put('wp', wp, table=True) - - # deleted number (entire table) - n = self.store.remove('wp', []) - assert(n == 120) - - # non - empty where - self.store.remove('wp') - self.store.put('wp', wp, table=True) - self.assertRaises(Exception, self.store.remove, - 'wp', ['foo']) - - # selectin non-table with a where - # self.store.put('wp2', wp, table=False) - # self.assertRaises(Exception, self.store.remove, - # 'wp2', [('column', ['A', 'D'])]) + def test_remove_where(self): + + with ensure_clean(self.path) as store: + + # non-existance + crit1 = Term('index', '>', 'foo') + store.remove('a', where=[crit1]) + + # try to remove non-table (with crit) + # non-table ok (where = None) + wp = tm.makePanel() + store.put('wp', wp, table=True) + store.remove('wp', [('minor_axis', ['A', 'D'])]) + rs = store.select('wp') + expected = wp.reindex(minor_axis=['B', 'C']) + tm.assert_panel_equal(rs, expected) + + # empty where + store.remove('wp') + store.put('wp', wp, table=True) + + # deleted number (entire table) + n = store.remove('wp', []) + assert(n == 120) + + # non - empty where + store.remove('wp') + store.put('wp', wp, table=True) + self.assertRaises(Exception, store.remove, + 'wp', ['foo']) + + # selectin non-table with a where + # store.put('wp2', wp, table=False) + # self.assertRaises(Exception, store.remove, + # 'wp2', [('column', ['A', 'D'])]) def test_remove_crit(self): - wp = tm.makePanel() - # group row removal - date4 = 
wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10]) - crit4 = Term('major_axis', date4) - self.store.put('wp3', wp, table=True) - n = self.store.remove('wp3', where=[crit4]) - assert(n == 36) - result = self.store.select('wp3') - expected = wp.reindex(major_axis=wp.major_axis - date4) - tm.assert_panel_equal(result, expected) - - # upper half - self.store.put('wp', wp, table=True) - date = wp.major_axis[len(wp.major_axis) // 2] - - crit1 = Term('major_axis', '>', date) - crit2 = Term('minor_axis', ['A', 'D']) - n = self.store.remove('wp', where=[crit1]) - - assert(n == 56) - - n = self.store.remove('wp', where=[crit2]) - assert(n == 32) - - result = self.store['wp'] - expected = wp.truncate(after=date).reindex(minor=['B', 'C']) - tm.assert_panel_equal(result, expected) - - # individual row elements - self.store.put('wp2', wp, table=True) - - date1 = wp.major_axis[1:3] - crit1 = Term('major_axis', date1) - self.store.remove('wp2', where=[crit1]) - result = self.store.select('wp2') - expected = wp.reindex(major_axis=wp.major_axis - date1) - tm.assert_panel_equal(result, expected) - - date2 = wp.major_axis[5] - crit2 = Term('major_axis', date2) - self.store.remove('wp2', where=[crit2]) - result = self.store['wp2'] - expected = wp.reindex( - major_axis=wp.major_axis - date1 - Index([date2])) - tm.assert_panel_equal(result, expected) - - date3 = [wp.major_axis[7], wp.major_axis[9]] - crit3 = Term('major_axis', date3) - self.store.remove('wp2', where=[crit3]) - result = self.store['wp2'] - expected = wp.reindex( - major_axis=wp.major_axis - date1 - Index([date2]) - Index(date3)) - tm.assert_panel_equal(result, expected) - - # corners - self.store.put('wp4', wp, table=True) - n = self.store.remove( - 'wp4', where=[Term('major_axis', '>', wp.major_axis[-1])]) - result = self.store.select('wp4') - tm.assert_panel_equal(result, wp) + with ensure_clean(self.path) as store: + + wp = tm.makePanel() + + # group row removal + date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10]) + crit4 = Term('major_axis', date4) + store.put('wp3', wp, table=True) + n = store.remove('wp3', where=[crit4]) + assert(n == 36) + result = store.select('wp3') + expected = wp.reindex(major_axis=wp.major_axis - date4) + tm.assert_panel_equal(result, expected) + + # upper half + store.put('wp', wp, table=True) + date = wp.major_axis[len(wp.major_axis) // 2] + + crit1 = Term('major_axis', '>', date) + crit2 = Term('minor_axis', ['A', 'D']) + n = store.remove('wp', where=[crit1]) + + assert(n == 56) + + n = store.remove('wp', where=[crit2]) + assert(n == 32) + + result = store['wp'] + expected = wp.truncate(after=date).reindex(minor=['B', 'C']) + tm.assert_panel_equal(result, expected) + + # individual row elements + store.put('wp2', wp, table=True) + + date1 = wp.major_axis[1:3] + crit1 = Term('major_axis', date1) + store.remove('wp2', where=[crit1]) + result = store.select('wp2') + expected = wp.reindex(major_axis=wp.major_axis - date1) + tm.assert_panel_equal(result, expected) + + date2 = wp.major_axis[5] + crit2 = Term('major_axis', date2) + store.remove('wp2', where=[crit2]) + result = store['wp2'] + expected = wp.reindex( + major_axis=wp.major_axis - date1 - Index([date2])) + tm.assert_panel_equal(result, expected) + + date3 = [wp.major_axis[7], wp.major_axis[9]] + crit3 = Term('major_axis', date3) + store.remove('wp2', where=[crit3]) + result = store['wp2'] + expected = wp.reindex( + major_axis=wp.major_axis - date1 - Index([date2]) - Index(date3)) + tm.assert_panel_equal(result, expected) + + # corners + store.put('wp4', 
wp, table=True) + n = store.remove( + 'wp4', where=[Term('major_axis', '>', wp.major_axis[-1])]) + result = store.select('wp4') + tm.assert_panel_equal(result, wp) def test_terms(self): - wp = tm.makePanel() - p4d = tm.makePanel4D() - self.store.put('wp', wp, table=True) - self.store.put('p4d', p4d, table=True) - - # some invalid terms - terms = [ - ['minor', ['A', 'B']], - ['index', ['20121114']], - ['index', ['20121114', '20121114']], - ] - for t in terms: - self.assertRaises(Exception, self.store.select, 'wp', t) - - self.assertRaises(Exception, Term.__init__) - self.assertRaises(Exception, Term.__init__, 'blah') - self.assertRaises(Exception, Term.__init__, 'index') - self.assertRaises(Exception, Term.__init__, 'index', '==') - self.assertRaises(Exception, Term.__init__, 'index', '>', 5) - - # panel - result = self.store.select('wp', [Term( - 'major_axis<20000108'), Term('minor_axis', '=', ['A', 'B'])]) - expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) - tm.assert_panel_equal(result, expected) - - # p4d - result = self.store.select('p4d', [Term('major_axis<20000108'), - Term('minor_axis', '=', ['A', 'B']), - Term('items', '=', ['ItemA', 'ItemB'])]) - expected = p4d.truncate(after='20000108').reindex( - minor=['A', 'B'], items=['ItemA', 'ItemB']) - tm.assert_panel4d_equal(result, expected) - - # valid terms - terms = [ - dict(field='major_axis', op='>', value='20121114'), - ('major_axis', '20121114'), - ('major_axis', '>', '20121114'), - (('major_axis', ['20121114', '20121114']),), - ('major_axis', datetime.datetime(2012, 11, 14)), - 'major_axis> 20121114', - 'major_axis >20121114', - 'major_axis > 20121114', - (('minor_axis', ['A', 'B']),), - (('minor_axis', ['A', 'B']),), - ((('minor_axis', ['A', 'B']),),), - (('items', ['ItemA', 'ItemB']),), - ('items=ItemA'), - ] - - for t in terms: - self.store.select('wp', t) - self.store.select('p4d', t) - - # valid for p4d only - terms = [ - (('labels', '=', ['l1', 'l2']),), - Term('labels', '=', ['l1', 'l2']), - ] - - for t in terms: - self.store.select('p4d', t) + with ensure_clean(self.path) as store: + + wp = tm.makePanel() + p4d = tm.makePanel4D() + store.put('wp', wp, table=True) + store.put('p4d', p4d, table=True) + + # some invalid terms + terms = [ + ['minor', ['A', 'B']], + ['index', ['20121114']], + ['index', ['20121114', '20121114']], + ] + for t in terms: + self.assertRaises(Exception, store.select, 'wp', t) + + self.assertRaises(Exception, Term.__init__) + self.assertRaises(Exception, Term.__init__, 'blah') + self.assertRaises(Exception, Term.__init__, 'index') + self.assertRaises(Exception, Term.__init__, 'index', '==') + self.assertRaises(Exception, Term.__init__, 'index', '>', 5) + + # panel + result = store.select('wp', [Term( + 'major_axis<20000108'), Term('minor_axis', '=', ['A', 'B'])]) + expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) + tm.assert_panel_equal(result, expected) + + # p4d + result = store.select('p4d', [Term('major_axis<20000108'), + Term('minor_axis', '=', ['A', 'B']), + Term('items', '=', ['ItemA', 'ItemB'])]) + expected = p4d.truncate(after='20000108').reindex( + minor=['A', 'B'], items=['ItemA', 'ItemB']) + tm.assert_panel4d_equal(result, expected) + + # valid terms + terms = [ + dict(field='major_axis', op='>', value='20121114'), + ('major_axis', '20121114'), + ('major_axis', '>', '20121114'), + (('major_axis', ['20121114', '20121114']),), + ('major_axis', datetime.datetime(2012, 11, 14)), + 'major_axis> 20121114', + 'major_axis >20121114', + 'major_axis > 20121114', 
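# Editor's sketch (not part of the patch): three equivalent spellings of
# one query, mirroring the "valid terms" list above -- an explicit Term,
# a (field, op, value) tuple, and a dict. Assumes the 0.10-era API; the
# file name is hypothetical.
from pandas import HDFStore
from pandas.io.pytables import Term
import pandas.util.testing as tm

demo = HDFStore('term_demo.h5', mode='w')
df = tm.makeTimeDataFrame()
demo.append('df', df)
date = df.index[len(df) // 2]
for t in [Term('index', '>', date),
          ('index', '>', date),
          dict(field='index', op='>', value=date)]:
    tm.assert_frame_equal(demo.select('df', t), df[df.index > date])
demo.close()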
+ (('minor_axis', ['A', 'B']),), + (('minor_axis', ['A', 'B']),), + ((('minor_axis', ['A', 'B']),),), + (('items', ['ItemA', 'ItemB']),), + ('items=ItemA'), + ] + + for t in terms: + store.select('wp', t) + store.select('p4d', t) + + # valid for p4d only + terms = [ + (('labels', '=', ['l1', 'l2']),), + Term('labels', '=', ['l1', 'l2']), + ] + + for t in terms: + store.select('p4d', t) def test_series(self): + s = tm.makeStringSeries() self._check_roundtrip(s, tm.assert_series_equal) - + ts = tm.makeTimeSeries() self._check_roundtrip(ts, tm.assert_series_equal) - + ts2 = Series(ts.index, Index(ts.index, dtype=object)) self._check_roundtrip(ts2, tm.assert_series_equal) - + ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) self._check_roundtrip(ts3, tm.assert_series_equal) - + def test_sparse_series(self): + s = tm.makeStringSeries() s[3:5] = np.nan ss = s.to_sparse() self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True) - + ss2 = s.to_sparse(kind='integer') self._check_roundtrip(ss2, tm.assert_series_equal, check_series_type=True) - + ss3 = s.to_sparse(fill_value=0) self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) def test_sparse_frame(self): + s = tm.makeDataFrame() s.ix[3:5, 1:3] = np.nan s.ix[8:10, -2] = np.nan ss = s.to_sparse() self._check_double_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True) - + ss2 = s.to_sparse(kind='integer') self._check_double_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True) - + ss3 = s.to_sparse(fill_value=0) self._check_double_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) - + def test_sparse_panel(self): + items = ['x', 'y', 'z'] p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) sp = p.to_sparse() - + self._check_double_roundtrip(sp, tm.assert_panel_equal, check_panel_type=True) - + sp2 = p.to_sparse(kind='integer') self._check_double_roundtrip(sp2, tm.assert_panel_equal, check_panel_type=True) - + sp3 = p.to_sparse(fill_value=0) self._check_double_roundtrip(sp3, tm.assert_panel_equal, check_panel_type=True) def test_float_index(self): + # GH #454 index = np.random.randn(10) s = Series(np.random.randn(10), index=index) self._check_roundtrip(s, tm.assert_series_equal) - + def test_tuple_index(self): + # GH #492 col = np.arange(10) idx = [(0., 1.), (2., 3.), (4., 5.)] @@ -1243,8 +1297,9 @@ def test_tuple_index(self): warnings.filterwarnings('ignore', category=PerformanceWarning) self._check_roundtrip(DF, tm.assert_frame_equal) warnings.filterwarnings('always', category=PerformanceWarning) - + def test_index_types(self): + values = np.random.randn(2) func = lambda l, r: tm.assert_series_equal(l, r, True, True, True) @@ -1253,45 +1308,47 @@ def test_index_types(self): ser = Series(values, [0, 'y']) self._check_roundtrip(ser, func) warnings.filterwarnings('always', category=PerformanceWarning) - + ser = Series(values, [datetime.datetime.today(), 0]) self._check_roundtrip(ser, func) - + ser = Series(values, ['y', 0]) self._check_roundtrip(ser, func) - + warnings.filterwarnings('ignore', category=PerformanceWarning) ser = Series(values, [datetime.date.today(), 'a']) self._check_roundtrip(ser, func) warnings.filterwarnings('always', category=PerformanceWarning) - + warnings.filterwarnings('ignore', category=PerformanceWarning) ser = Series(values, [1.23, 'b']) self._check_roundtrip(ser, func) warnings.filterwarnings('always', category=PerformanceWarning) - + ser = Series(values, [1, 1.53]) self._check_roundtrip(ser, func) - + ser = 
Series(values, [1, 5]) self._check_roundtrip(ser, func) - + ser = Series(values, [datetime.datetime( - 2012, 1, 1), datetime.datetime(2012, 1, 2)]) + 2012, 1, 1), datetime.datetime(2012, 1, 2)]) self._check_roundtrip(ser, func) - + def test_timeseries_preepoch(self): + if sys.version_info[0] == 2 and sys.version_info[1] < 7: raise nose.SkipTest - + dr = bdate_range('1/1/1940', '1/1/1960') ts = Series(np.random.randn(len(dr)), index=dr) try: self._check_roundtrip(ts, tm.assert_series_equal) except OverflowError: raise nose.SkipTest('known failer on some windows platforms') - + def test_frame(self): + df = tm.makeDataFrame() # put in some random NAs @@ -1300,23 +1357,24 @@ def test_frame(self): self._check_roundtrip_table(df, tm.assert_frame_equal) self._check_roundtrip(df, tm.assert_frame_equal) - + self._check_roundtrip_table(df, tm.assert_frame_equal, compression=True) self._check_roundtrip(df, tm.assert_frame_equal, compression=True) - + tdf = tm.makeTimeDataFrame() self._check_roundtrip(tdf, tm.assert_frame_equal) self._check_roundtrip(tdf, tm.assert_frame_equal, compression=True) - - # not consolidated - df['foo'] = np.random.randn(len(df)) - self.store['df'] = df - recons = self.store['df'] - self.assert_(recons._data.is_consolidated()) - + + with ensure_clean(self.path) as store: + # not consolidated + df['foo'] = np.random.randn(len(df)) + store['df'] = df + recons = store['df'] + self.assert_(recons._data.is_consolidated()) + # empty self._check_roundtrip(df[:0], tm.assert_frame_equal) @@ -1332,37 +1390,33 @@ def test_empty_series_frame(self): self._check_roundtrip(df0, tm.assert_frame_equal) self._check_roundtrip(df1, tm.assert_frame_equal) self._check_roundtrip(df2, tm.assert_frame_equal) - + def test_can_serialize_dates(self): + rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')] frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + self._check_roundtrip(frame, tm.assert_frame_equal) def test_timezones(self): rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - try: - store = HDFStore(self.scratchpath) + + with ensure_clean(self.path) as store: store['frame'] = frame recons = store['frame'] self.assert_(recons.index.equals(rng)) self.assertEquals(rng.tz, recons.index.tz) - finally: - store.close() - os.remove(self.scratchpath) def test_fixed_offset_tz(self): rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00') frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - try: - store = HDFStore(self.scratchpath) + + with ensure_clean(self.path) as store: store['frame'] = frame recons = store['frame'] self.assert_(recons.index.equals(rng)) self.assertEquals(rng.tz, recons.index.tz) - finally: - store.close() - os.remove(self.scratchpath) def test_store_hierarchical(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], @@ -1378,41 +1432,31 @@ def test_store_hierarchical(self): self._check_roundtrip(frame['A'], tm.assert_series_equal) # check that the names are stored - try: - store = HDFStore(self.scratchpath) + with ensure_clean(self.path) as store: store['frame'] = frame recons = store['frame'] assert(recons.index.names == ['foo', 'bar']) - finally: - store.close() - os.remove(self.scratchpath) def test_store_index_name(self): df = tm.makeDataFrame() df.index.name = 'foo' - try: - store = HDFStore(self.scratchpath) + + with ensure_clean(self.path) as store: store['frame'] = df recons = store['frame'] assert(recons.index.name == 'foo') - finally: - store.close() - 
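# Editor's sketch (not part of the patch): the timezone round-trip the
# two tests above assert -- a tz-aware DatetimeIndex keeps its tz when
# stored and re-read. The file name is hypothetical.
import numpy as np
from pandas import DataFrame, HDFStore, date_range

rng = date_range('1/1/2000', periods=5, tz='US/Eastern')
demo = HDFStore('tz_demo.h5', mode='w')
demo['frame'] = DataFrame(np.random.randn(len(rng), 2), index=rng)
assert demo['frame'].index.tz == rng.tz   # tz survives the round-trip
demo.close()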
os.remove(self.scratchpath) def test_store_series_name(self): df = tm.makeDataFrame() series = df['A'] - try: - store = HDFStore(self.scratchpath) + with ensure_clean(self.path) as store: store['series'] = series recons = store['series'] assert(recons.name == 'A') - finally: - store.close() - os.remove(self.scratchpath) def test_store_mixed(self): + def _make_one(): df = tm.makeDataFrame() df['obj1'] = 'foo' @@ -1429,16 +1473,17 @@ def _make_one(): self._check_roundtrip(df1, tm.assert_frame_equal) self._check_roundtrip(df2, tm.assert_frame_equal) - self.store['obj'] = df1 - tm.assert_frame_equal(self.store['obj'], df1) - self.store['obj'] = df2 - tm.assert_frame_equal(self.store['obj'], df2) - + with ensure_clean(self.path) as store: + store['obj'] = df1 + tm.assert_frame_equal(store['obj'], df1) + store['obj'] = df2 + tm.assert_frame_equal(store['obj'], df2) + # check that can store Series of all of these types self._check_roundtrip(df1['obj1'], tm.assert_series_equal) self._check_roundtrip(df1['bool1'], tm.assert_series_equal) self._check_roundtrip(df1['int1'], tm.assert_series_equal) - + # try with compression self._check_roundtrip(df1['obj1'], tm.assert_series_equal, compression=True) @@ -1450,25 +1495,23 @@ def _make_one(): compression=True) def test_wide(self): + wp = tm.makePanel() self._check_roundtrip(wp, tm.assert_panel_equal) def test_wide_table(self): + wp = tm.makePanel() self._check_roundtrip_table(wp, tm.assert_panel_equal) def test_wide_table_dups(self): wp = tm.makePanel() - try: - store = HDFStore(self.scratchpath) + with ensure_clean(self.path) as store: store._quiet = True store.put('panel', wp, table=True) store.put('panel', wp, table=True, append=True) recons = store['panel'] tm.assert_panel_equal(recons, wp) - finally: - store.close() - os.remove(self.scratchpath) def test_long(self): def _check(left, right): @@ -1484,220 +1527,234 @@ def test_longpanel(self): pass def test_overwrite_node(self): - self.store['a'] = tm.makeTimeDataFrame() - ts = tm.makeTimeSeries() - self.store['a'] = ts - tm.assert_series_equal(self.store['a'], ts) + with ensure_clean(self.path) as store: + store['a'] = tm.makeTimeDataFrame() + ts = tm.makeTimeSeries() + store['a'] = ts + + tm.assert_series_equal(store['a'], ts) def test_select(self): wp = tm.makePanel() - # put/select ok - self.store.remove('wp') - self.store.put('wp', wp, table=True) - self.store.select('wp') - - # non-table ok (where = None) - self.store.remove('wp') - self.store.put('wp2', wp, table=False) - self.store.select('wp2') - - # selection on the non-indexable with a large number of columns - wp = Panel( - np.random.randn(100, 100, 100), items=['Item%03d' % i for i in xrange(100)], - major_axis=date_range('1/1/2000', periods=100), minor_axis=['E%03d' % i for i in xrange(100)]) - - self.store.remove('wp') - self.store.append('wp', wp) - items = ['Item%03d' % i for i in xrange(80)] - result = self.store.select('wp', Term('items', items)) - expected = wp.reindex(items=items) - tm.assert_panel_equal(expected, result) - - # selectin non-table with a where - # self.assertRaises(Exception, self.store.select, - # 'wp2', ('column', ['A', 'D'])) - - # select with columns= - df = tm.makeTimeDataFrame() - self.store.remove('df') - self.store.append('df', df) - result = self.store.select('df', columns=['A', 'B']) - expected = df.reindex(columns=['A', 'B']) - tm.assert_frame_equal(expected, result) - - # equivalentsly - result = self.store.select('df', [('columns', ['A', 'B'])]) - expected = df.reindex(columns=['A', 'B']) - 
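# Editor's sketch (not part of the patch): the put/select split used in
# the tests above. table=True writes a queryable table; table=False
# writes the fixed format, which select() can only read back whole.
# The file name is hypothetical.
from pandas import HDFStore
import pandas.util.testing as tm

demo = HDFStore('put_demo.h5', mode='w')
df = tm.makeTimeDataFrame()
demo.put('df_table', df, table=True)      # queryable, appendable
demo.put('df_fixed', df, table=False)     # fixed format: whole reads only
tm.assert_frame_equal(demo.select('df_table'), df)
tm.assert_frame_equal(demo.select('df_fixed'), df)
demo.close()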
tm.assert_frame_equal(expected, result) - - # with a data column - self.store.remove('df') - self.store.append('df', df, data_columns=['A']) - result = self.store.select('df', ['A > 0'], columns=['A', 'B']) - expected = df[df.A > 0].reindex(columns=['A', 'B']) - tm.assert_frame_equal(expected, result) - - # all a data columns - self.store.remove('df') - self.store.append('df', df, data_columns=True) - result = self.store.select('df', ['A > 0'], columns=['A', 'B']) - expected = df[df.A > 0].reindex(columns=['A', 'B']) - tm.assert_frame_equal(expected, result) - - # with a data column, but different columns - self.store.remove('df') - self.store.append('df', df, data_columns=['A']) - result = self.store.select('df', ['A > 0'], columns=['C', 'D']) - expected = df[df.A > 0].reindex(columns=['C', 'D']) - tm.assert_frame_equal(expected, result) - - # with a Timestamp data column (GH #2637) - df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300))) - self.store.remove('df') - self.store.append('df', df, data_columns=['ts', 'A']) - result = self.store.select('df', [Term('ts', '>=', Timestamp('2012-02-01'))]) - expected = df[df.ts >= Timestamp('2012-02-01')] - tm.assert_frame_equal(expected, result) + with ensure_clean(self.path) as store: + + # put/select ok + store.remove('wp') + store.put('wp', wp, table=True) + store.select('wp') + + # non-table ok (where = None) + store.remove('wp') + store.put('wp2', wp, table=False) + store.select('wp2') + + # selection on the non-indexable with a large number of columns + wp = Panel( + np.random.randn(100, 100, 100), items=['Item%03d' % i for i in xrange(100)], + major_axis=date_range('1/1/2000', periods=100), minor_axis=['E%03d' % i for i in xrange(100)]) + + store.remove('wp') + store.append('wp', wp) + items = ['Item%03d' % i for i in xrange(80)] + result = store.select('wp', Term('items', items)) + expected = wp.reindex(items=items) + tm.assert_panel_equal(expected, result) + + # selectin non-table with a where + # self.assertRaises(Exception, store.select, + # 'wp2', ('column', ['A', 'D'])) + + # select with columns= + df = tm.makeTimeDataFrame() + store.remove('df') + store.append('df', df) + result = store.select('df', columns=['A', 'B']) + expected = df.reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) + + # equivalentsly + result = store.select('df', [('columns', ['A', 'B'])]) + expected = df.reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) + + # with a data column + store.remove('df') + store.append('df', df, data_columns=['A']) + result = store.select('df', ['A > 0'], columns=['A', 'B']) + expected = df[df.A > 0].reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) + + # all a data columns + store.remove('df') + store.append('df', df, data_columns=True) + result = store.select('df', ['A > 0'], columns=['A', 'B']) + expected = df[df.A > 0].reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) + + # with a data column, but different columns + store.remove('df') + store.append('df', df, data_columns=['A']) + result = store.select('df', ['A > 0'], columns=['C', 'D']) + expected = df[df.A > 0].reindex(columns=['C', 'D']) + tm.assert_frame_equal(expected, result) + + # with a Timestamp data column (GH #2637) + df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300))) + store.remove('df') + store.append('df', df, data_columns=['ts', 'A']) + result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01'))]) + 
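# Editor's sketch (not part of the patch): the data_columns pattern the
# selects above rely on. Only columns declared as data_columns can be
# referenced in where terms, and columns= restricts what is read back.
# The file name is hypothetical.
from pandas import HDFStore
import pandas.util.testing as tm

demo = HDFStore('dc_demo.h5', mode='w')
df = tm.makeTimeDataFrame()
demo.append('df', df, data_columns=['A'])   # 'A' becomes queryable
result = demo.select('df', ['A > 0'], columns=['A', 'B'])
tm.assert_frame_equal(result, df[df.A > 0].reindex(columns=['A', 'B']))
demo.close()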
expected = df[df.ts >= Timestamp('2012-02-01')] + tm.assert_frame_equal(expected, result) def test_panel_select(self): - wp = tm.makePanel() - self.store.put('wp', wp, table=True) - date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = ('major_axis', '>=', date) - crit2 = ('minor_axis', '=', ['A', 'D']) - - result = self.store.select('wp', [crit1, crit2]) - expected = wp.truncate(before=date).reindex(minor=['A', 'D']) - tm.assert_panel_equal(result, expected) - - result = self.store.select( - 'wp', ['major_axis>=20000124', ('minor_axis', '=', ['A', 'B'])]) - expected = wp.truncate(before='20000124').reindex(minor=['A', 'B']) - tm.assert_panel_equal(result, expected) + wp = tm.makePanel() + with ensure_clean(self.path) as store: + store.put('wp', wp, table=True) + date = wp.major_axis[len(wp.major_axis) // 2] + + crit1 = ('major_axis', '>=', date) + crit2 = ('minor_axis', '=', ['A', 'D']) + + result = store.select('wp', [crit1, crit2]) + expected = wp.truncate(before=date).reindex(minor=['A', 'D']) + tm.assert_panel_equal(result, expected) + + result = store.select( + 'wp', ['major_axis>=20000124', ('minor_axis', '=', ['A', 'B'])]) + expected = wp.truncate(before='20000124').reindex(minor=['A', 'B']) + tm.assert_panel_equal(result, expected) + def test_frame_select(self): - df = tm.makeTimeDataFrame() - self.store.put('frame', df, table=True) - date = df.index[len(df) // 2] - - crit1 = ('index', '>=', date) - crit2 = ('columns', ['A', 'D']) - crit3 = ('columns', 'A') - - result = self.store.select('frame', [crit1, crit2]) - expected = df.ix[date:, ['A', 'D']] - tm.assert_frame_equal(result, expected) - - result = self.store.select('frame', [crit3]) - expected = df.ix[:, ['A']] - tm.assert_frame_equal(result, expected) - - # other indicies for a frame - # integer - df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) - self.store.append('df_int', df) - self.store.select( - 'df_int', [Term("index<10"), Term("columns", "=", ["A"])]) - - df = DataFrame(dict(A=np.random.rand( - 20), B=np.random.rand(20), index=np.arange(20, dtype='f8'))) - self.store.append('df_float', df) - self.store.select( - 'df_float', [Term("index<10.0"), Term("columns", "=", ["A"])]) - - # invalid terms df = tm.makeTimeDataFrame() - self.store.append('df_time', df) - self.assertRaises( - Exception, self.store.select, 'df_time', [Term("index>0")]) - - # can't select if not written as table - # self.store['frame'] = df - # self.assertRaises(Exception, self.store.select, - # 'frame', [crit1, crit2]) + with ensure_clean(self.path) as store: + store.put('frame', df, table=True) + date = df.index[len(df) // 2] + + crit1 = ('index', '>=', date) + crit2 = ('columns', ['A', 'D']) + crit3 = ('columns', 'A') + + result = store.select('frame', [crit1, crit2]) + expected = df.ix[date:, ['A', 'D']] + tm.assert_frame_equal(result, expected) + + result = store.select('frame', [crit3]) + expected = df.ix[:, ['A']] + tm.assert_frame_equal(result, expected) + + # other indicies for a frame + + # integer + df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) + store.append('df_int', df) + store.select( + 'df_int', [Term("index<10"), Term("columns", "=", ["A"])]) + + df = DataFrame(dict(A=np.random.rand( + 20), B=np.random.rand(20), index=np.arange(20, dtype='f8'))) + store.append('df_float', df) + store.select( + 'df_float', [Term("index<10.0"), Term("columns", "=", ["A"])]) + + # invalid terms + df = tm.makeTimeDataFrame() + store.append('df_time', df) + self.assertRaises( + Exception, store.select, 'df_time', 
[Term("index>0")]) + + # can't select if not written as table + # store['frame'] = df + # self.assertRaises(Exception, store.select, + # 'frame', [crit1, crit2]) + def test_unique(self): + df = tm.makeTimeDataFrame() def check(x, y): self.assert_((np.unique(x) == np.unique(y)).all() == True) - self.store.remove('df') - self.store.append('df', df) - - # error - self.assertRaises(KeyError, self.store.unique, 'df', 'foo') - - # valid - result = self.store.unique('df', 'index') - check(result.values, df.index.values) - - # not a data indexable column - self.assertRaises( - ValueError, self.store.unique, 'df', 'values_block_0') - - # a data column - df2 = df.copy() - df2['string'] = 'foo' - self.store.append('df2', df2, data_columns=['string']) - result = self.store.unique('df2', 'string') - check(result.values, df2['string'].unique()) - - # a data column with NaNs, result excludes the NaNs - df3 = df.copy() - df3['string'] = 'foo' - df3.ix[4:6, 'string'] = np.nan - self.store.append('df3', df3, data_columns=['string']) - result = self.store.unique('df3', 'string') - check(result.values, df3['string'].valid().unique()) + with ensure_clean(self.path) as store: + store.remove('df') + store.append('df', df) + + # error + self.assertRaises(KeyError, store.unique, 'df', 'foo') + + # valid + result = store.unique('df', 'index') + check(result.values, df.index.values) + + # not a data indexable column + self.assertRaises( + ValueError, store.unique, 'df', 'values_block_0') + + # a data column + df2 = df.copy() + df2['string'] = 'foo' + store.append('df2', df2, data_columns=['string']) + result = store.unique('df2', 'string') + check(result.values, df2['string'].unique()) + + # a data column with NaNs, result excludes the NaNs + df3 = df.copy() + df3['string'] = 'foo' + df3.ix[4:6, 'string'] = np.nan + store.append('df3', df3, data_columns=['string']) + result = store.unique('df3', 'string') + check(result.values, df3['string'].valid().unique()) def test_coordinates(self): df = tm.makeTimeDataFrame() - self.store.remove('df') - self.store.append('df', df) - - # all - c = self.store.select_as_coordinates('df') - assert((c.values == np.arange(len(df.index))).all() == True) - - # get coordinates back & test vs frame - self.store.remove('df') - - df = DataFrame(dict(A=range(5), B=range(5))) - self.store.append('df', df) - c = self.store.select_as_coordinates('df', ['index<3']) - assert((c.values == np.arange(3)).all() == True) - result = self.store.select('df', where=c) - expected = df.ix[0:2, :] - tm.assert_frame_equal(result, expected) - - c = self.store.select_as_coordinates('df', ['index>=3', 'index<=4']) - assert((c.values == np.arange(2) + 3).all() == True) - result = self.store.select('df', where=c) - expected = df.ix[3:4, :] - tm.assert_frame_equal(result, expected) - - # multiple tables - self.store.remove('df1') - self.store.remove('df2') - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - self.store.append('df1', df1, data_columns=['A', 'B']) - self.store.append('df2', df2) - - c = self.store.select_as_coordinates('df1', ['A>0', 'B>0']) - df1_result = self.store.select('df1', c) - df2_result = self.store.select('df2', c) - result = concat([df1_result, df2_result], axis=1) + with ensure_clean(self.path) as store: - expected = concat([df1, df2], axis=1) - expected = expected[(expected.A > 0) & (expected.B > 0)] - tm.assert_frame_equal(result, expected) + store.remove('df') + store.append('df', df) + + # all + c = store.select_as_coordinates('df') + 
assert((c.values == np.arange(len(df.index))).all() == True) + + # get coordinates back & test vs frame + store.remove('df') + + df = DataFrame(dict(A=range(5), B=range(5))) + store.append('df', df) + c = store.select_as_coordinates('df', ['index<3']) + assert((c.values == np.arange(3)).all() == True) + result = store.select('df', where=c) + expected = df.ix[0:2, :] + tm.assert_frame_equal(result, expected) + + c = store.select_as_coordinates('df', ['index>=3', 'index<=4']) + assert((c.values == np.arange(2) + 3).all() == True) + result = store.select('df', where=c) + expected = df.ix[3:4, :] + tm.assert_frame_equal(result, expected) + + # multiple tables + store.remove('df1') + store.remove('df2') + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + store.append('df1', df1, data_columns=['A', 'B']) + store.append('df2', df2) + + c = store.select_as_coordinates('df1', ['A>0', 'B>0']) + df1_result = store.select('df1', c) + df2_result = store.select('df2', c) + result = concat([df1_result, df2_result], axis=1) + + expected = concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) def test_append_to_multiple(self): df1 = tm.makeTimeDataFrame() @@ -1705,102 +1762,109 @@ def test_append_to_multiple(self): df2['foo'] = 'bar' df = concat([df1, df2], axis=1) - # exceptions - self.assertRaises(Exception, self.store.append_to_multiple, {'df1': - ['A', 'B'], 'df2': None}, df, selector='df3') - self.assertRaises(Exception, self.store.append_to_multiple, - {'df1': None, 'df2': None}, df, selector='df3') - self.assertRaises( - Exception, self.store.append_to_multiple, 'df1', df, 'df1') - - # regular operation - self.store.append_to_multiple( - {'df1': ['A', 'B'], 'df2': None}, df, selector='df1') - result = self.store.select_as_multiple( - ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1') - expected = df[(df.A > 0) & (df.B > 0)] - tm.assert_frame_equal(result, expected) + with ensure_clean(self.path) as store: + # exceptions + self.assertRaises(Exception, store.append_to_multiple, + {'df1': ['A', 'B'], 'df2': None}, df, selector='df3') + self.assertRaises(Exception, store.append_to_multiple, + {'df1': None, 'df2': None}, df, selector='df3') + self.assertRaises( + Exception, store.append_to_multiple, 'df1', df, 'df1') + + # regular operation + store.append_to_multiple( + {'df1': ['A', 'B'], 'df2': None}, df, selector='df1') + result = store.select_as_multiple( + ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1') + expected = df[(df.A > 0) & (df.B > 0)] + tm.assert_frame_equal(result, expected) + def test_select_as_multiple(self): + df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) df2['foo'] = 'bar' - self.store.append('df1', df1, data_columns=['A', 'B']) - self.store.append('df2', df2) - - # exceptions - self.assertRaises(Exception, self.store.select_as_multiple, - None, where=['A>0', 'B>0'], selector='df1') - self.assertRaises(Exception, self.store.select_as_multiple, - [None], where=['A>0', 'B>0'], selector='df1') - - # default select - result = self.store.select('df1', ['A>0', 'B>0']) - expected = self.store.select_as_multiple( - ['df1'], where=['A>0', 'B>0'], selector='df1') - tm.assert_frame_equal(result, expected) - expected = self.store.select_as_multiple( - 'df1', where=['A>0', 'B>0'], selector='df1') - tm.assert_frame_equal(result, expected) - - # multiple - result = self.store.select_as_multiple( - ['df1', 'df2'], 
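# Editor's sketch (not part of the patch): coordinate-based selection as
# tested above. select_as_coordinates() resolves a where clause to row
# numbers, which can then drive select() on the same table. The file
# name is hypothetical.
from pandas import DataFrame, HDFStore
import pandas.util.testing as tm

demo = HDFStore('coords_demo.h5', mode='w')
df = DataFrame(dict(A=range(5), B=range(5)))
demo.append('df', df)
c = demo.select_as_coordinates('df', ['index<3'])   # rows 0, 1, 2
tm.assert_frame_equal(demo.select('df', where=c), df.ix[0:2, :])
demo.close()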
where=['A>0', 'B>0'], selector='df1') - expected = concat([df1, df2], axis=1) - expected = expected[(expected.A > 0) & (expected.B > 0)] - tm.assert_frame_equal(result, expected) - - # multiple (diff selector) - result = self.store.select_as_multiple(['df1', 'df2'], where=[Term( - 'index', '>', df2.index[4])], selector='df2') - expected = concat([df1, df2], axis=1) - expected = expected[5:] - tm.assert_frame_equal(result, expected) - - # test excpection for diff rows - self.store.append('df3', tm.makeTimeDataFrame(nper=50)) - self.assertRaises(Exception, self.store.select_as_multiple, ['df1', - 'df3'], where=['A>0', 'B>0'], selector='df1') - def test_start_stop(self): + with ensure_clean(self.path) as store: + store.append('df1', df1, data_columns=['A', 'B']) + store.append('df2', df2) + + # exceptions + self.assertRaises(Exception, store.select_as_multiple, + None, where=['A>0', 'B>0'], selector='df1') + self.assertRaises(Exception, store.select_as_multiple, + [None], where=['A>0', 'B>0'], selector='df1') + + # default select + result = store.select('df1', ['A>0', 'B>0']) + expected = store.select_as_multiple( + ['df1'], where=['A>0', 'B>0'], selector='df1') + tm.assert_frame_equal(result, expected) + expected = store.select_as_multiple( + 'df1', where=['A>0', 'B>0'], selector='df1') + tm.assert_frame_equal(result, expected) + + # multiple + result = store.select_as_multiple( + ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1') + expected = concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + # multiple (diff selector) + result = store.select_as_multiple(['df1', 'df2'], where=[Term( + 'index', '>', df2.index[4])], selector='df2') + expected = concat([df1, df2], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) + + # test excpection for diff rows + store.append('df3', tm.makeTimeDataFrame(nper=50)) + self.assertRaises(Exception, store.select_as_multiple, + ['df1','df3'], where=['A>0', 'B>0'], selector='df1') - df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) - self.store.append('df', df) + def test_start_stop(self): - result = self.store.select( - 'df', [Term("columns", "=", ["A"])], start=0, stop=5) - expected = df.ix[0:4, ['A']] - tm.assert_frame_equal(result, expected) + with ensure_clean(self.path) as store: - # out of range - result = self.store.select( - 'df', [Term("columns", "=", ["A"])], start=30, stop=40) - assert(len(result) == 0) - assert(type(result) == DataFrame) + df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) + store.append('df', df) + + result = store.select( + 'df', [Term("columns", "=", ["A"])], start=0, stop=5) + expected = df.ix[0:4, ['A']] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select( + 'df', [Term("columns", "=", ["A"])], start=30, stop=40) + assert(len(result) == 0) + assert(type(result) == DataFrame) def test_select_filter_corner(self): + df = DataFrame(np.random.randn(50, 100)) df.index = ['%.3d' % c for c in df.index] df.columns = ['%.3d' % c for c in df.columns] - self.store.put('frame', df, table=True) - crit = Term('columns', df.columns[:75]) - result = self.store.select('frame', [crit]) - tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) + with ensure_clean(self.path) as store: + store.put('frame', df, table=True) + crit = Term('columns', df.columns[:75]) + result = store.select('frame', [crit]) + tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) + def 
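# Editor's sketch (not part of the patch): the multi-table pattern the
# tests above cover. append_to_multiple() splits one frame across
# several tables; select_as_multiple() filters on the selector table and
# joins the pieces back. The file name is hypothetical.
from pandas import concat, HDFStore
import pandas.util.testing as tm

demo = HDFStore('multi_demo.h5', mode='w')
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
df = concat([df1, df2], axis=1)
demo.append_to_multiple({'df1': ['A', 'B'], 'df2': None}, df, selector='df1')
result = demo.select_as_multiple(['df1', 'df2'],
                                 where=['A>0', 'B>0'], selector='df1')
tm.assert_frame_equal(result, df[(df.A > 0) & (df.B > 0)])
demo.close()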
_check_roundtrip(self, obj, comparator, compression=False, **kwargs): + options = {} if compression: options['complib'] = _default_compressor - store = HDFStore(self.scratchpath, 'w', **options) - try: + with ensure_clean(self.path, 'w', **options) as store: store['obj'] = obj retrieved = store['obj'] comparator(retrieved, obj, **kwargs) - finally: - store.close() - os.remove(self.scratchpath) def _check_double_roundtrip(self, obj, comparator, compression=False, **kwargs): @@ -1808,84 +1872,90 @@ def _check_double_roundtrip(self, obj, comparator, compression=False, if compression: options['complib'] = _default_compressor - store = HDFStore(self.scratchpath, 'w', **options) - try: + with ensure_clean(self.path, 'w', **options) as store: store['obj'] = obj retrieved = store['obj'] comparator(retrieved, obj, **kwargs) store['obj'] = retrieved again = store['obj'] comparator(again, obj, **kwargs) - finally: - store.close() - os.remove(self.scratchpath) + def _check_roundtrip_table(self, obj, comparator, compression=False): options = {} if compression: options['complib'] = _default_compressor - store = HDFStore(self.scratchpath, 'w', **options) - try: + with ensure_clean(self.path, 'w', **options) as store: store.put('obj', obj, table=True) retrieved = store['obj'] # sorted_obj = _test_sort(obj) comparator(retrieved, obj) - finally: - store.close() - os.remove(self.scratchpath) def test_pytables_native_read(self): pth = curpath() - store = HDFStore(os.path.join(pth, 'pytables_native.h5'), 'r') - d2 = store['detector/readout'] - store.close() - store = HDFStore(os.path.join(pth, 'pytables_native2.h5'), 'r') - str(store) - d1 = store['detector'] - store.close() + + try: + store = HDFStore(os.path.join(pth, 'pytables_native.h5'), 'r') + d2 = store['detector/readout'] + finally: + safe_close(store) + + try: + store = HDFStore(os.path.join(pth, 'pytables_native2.h5'), 'r') + str(store) + d1 = store['detector'] + finally: + safe_close(store) def test_legacy_read(self): pth = curpath() - store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r') - store['a'] - store['b'] - store['c'] - store['d'] - store.close() + try: + store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r') + store['a'] + store['b'] + store['c'] + store['d'] + finally: + safe_close(store) def test_legacy_table_read(self): # legacy table types pth = curpath() - store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'r') - store.select('df1') - store.select('df2') - store.select('wp1') - - # force the frame - store.select('df2', typ='legacy_frame') - - # old version warning - warnings.filterwarnings('ignore', category=IncompatibilityWarning) - self.assertRaises( - Exception, store.select, 'wp1', Term('minor_axis', '=', 'B')) - - df2 = store.select('df2') - store.select('df2', Term('index', '>', df2.index[2])) - warnings.filterwarnings('always', category=IncompatibilityWarning) + try: + store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'r') + store.select('df1') + store.select('df2') + store.select('wp1') + + # force the frame + store.select('df2', typ='legacy_frame') + + # old version warning + warnings.filterwarnings('ignore', category=IncompatibilityWarning) + self.assertRaises( + Exception, store.select, 'wp1', Term('minor_axis', '=', 'B')) - store.close() + df2 = store.select('df2') + store.select('df2', Term('index', '>', df2.index[2])) + warnings.filterwarnings('always', category=IncompatibilityWarning) + + finally: + safe_close(store) def test_legacy_0_10_read(self): # legacy from 0.10 pth = curpath() - store = 
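# Editor's sketch (not part of the patch): the compression options the
# roundtrip helpers above thread through. complib/complevel set when the
# store is created apply to everything subsequently written into it.
# The file name is hypothetical.
from pandas import HDFStore
import pandas.util.testing as tm

demo = HDFStore('zlib_demo.h5', mode='w', complib='zlib', complevel=9)
df = tm.makeDataFrame()
demo['df'] = df                          # written compressed
tm.assert_frame_equal(demo['df'], df)    # reads back transparently
demo.close()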
HDFStore(os.path.join(pth, 'legacy_0.10.h5'), 'r') - for k in store.keys(): - store.select(k) - store.close() + try: + store = HDFStore(os.path.join(pth, 'legacy_0.10.h5'), 'r') + for k in store.keys(): + store.select(k) + finally: + safe_close(store) def test_copy(self): pth = curpath() + def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): try: import os @@ -1893,6 +1963,7 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): if f is None: f = os.path.join(pth, 'legacy_0.10.h5') + store = HDFStore(f, 'r') if new_f is None: @@ -1920,13 +1991,9 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): except (Exception), detail: pass finally: - store.close() - tstore.close() - import os - try: - os.remove(new_f) - except: - pass + safe_close(store) + safe_close(tstore) + safe_remove(new_f) do_copy() do_copy(keys = ['df']) @@ -1934,18 +2001,19 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): # new table df = tm.makeDataFrame() + try: - st = HDFStore(self.scratchpath) + st = HDFStore(self.path) st.append('df', df, data_columns = ['A']) st.close() - do_copy(f = self.scratchpath) - do_copy(f = self.scratchpath, propindexes = False) + do_copy(f = self.path) + do_copy(f = self.path, propindexes = False) finally: - import os - os.remove(self.scratchpath) + safe_remove(self.path) def test_legacy_table_write(self): raise nose.SkipTest + # legacy table types pth = curpath() df = tm.makeDataFrame() @@ -1959,71 +2027,78 @@ def test_legacy_table_write(self): store.close() def test_store_datetime_fractional_secs(self): - dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) - series = Series([0], [dt]) - self.store['a'] = series - self.assertEquals(self.store['a'].index[0], dt) - - def test_tseries_indices_series(self): - idx = tm.makeDateIndex(10) - ser = Series(np.random.randn(len(idx)), idx) - self.store['a'] = ser - result = self.store['a'] - assert_series_equal(result, ser) - self.assertEquals(type(result.index), type(ser.index)) - self.assertEquals(result.index.freq, ser.index.freq) + with ensure_clean(self.path) as store: + dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) + series = Series([0], [dt]) + store['a'] = series + self.assertEquals(store['a'].index[0], dt) - idx = tm.makePeriodIndex(10) - ser = Series(np.random.randn(len(idx)), idx) - self.store['a'] = ser - result = self.store['a'] + def test_tseries_indices_series(self): - assert_series_equal(result, ser) - self.assertEquals(type(result.index), type(ser.index)) - self.assertEquals(result.index.freq, ser.index.freq) + with ensure_clean(self.path) as store: + idx = tm.makeDateIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store['a'] = ser + result = store['a'] + + assert_series_equal(result, ser) + self.assertEquals(type(result.index), type(ser.index)) + self.assertEquals(result.index.freq, ser.index.freq) + + idx = tm.makePeriodIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store['a'] = ser + result = store['a'] + + assert_series_equal(result, ser) + self.assertEquals(type(result.index), type(ser.index)) + self.assertEquals(result.index.freq, ser.index.freq) def test_tseries_indices_frame(self): - idx = tm.makeDateIndex(10) - df = DataFrame(np.random.randn(len(idx), 3), index=idx) - self.store['a'] = df - result = self.store['a'] - - assert_frame_equal(result, df) - self.assertEquals(type(result.index), type(df.index)) - self.assertEquals(result.index.freq, df.index.freq) - - idx = 
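# Editor's sketch (not part of the patch): the copy behavior do_copy()
# above verifies. HDFStore.copy() -- signature assumed, not shown in
# this hunk -- writes the selected keys to a new file, optionally
# propagating data-column indexes. File names are hypothetical.
from pandas import HDFStore
import pandas.util.testing as tm

src = HDFStore('copy_src.h5', mode='w')
src.append('df', tm.makeTimeDataFrame(), data_columns=['A'])
dst = src.copy('copy_dst.h5', keys=['df'], propindexes=False)
tm.assert_frame_equal(dst['df'], src['df'])   # contents carried over
src.close()
dst.close()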
tm.makePeriodIndex(10)
-        df = DataFrame(np.random.randn(len(idx), 3), idx)
-        self.store['a'] = df
-        result = self.store['a']
-        assert_frame_equal(result, df)
-        self.assertEquals(type(result.index), type(df.index))
-        self.assertEquals(result.index.freq, df.index.freq)
+        with ensure_clean(self.path) as store:
+            idx = tm.makeDateIndex(10)
+            df = DataFrame(np.random.randn(len(idx), 3), index=idx)
+            store['a'] = df
+            result = store['a']
+
+            assert_frame_equal(result, df)
+            self.assertEquals(type(result.index), type(df.index))
+            self.assertEquals(result.index.freq, df.index.freq)
+
+            idx = tm.makePeriodIndex(10)
+            df = DataFrame(np.random.randn(len(idx), 3), idx)
+            store['a'] = df
+            result = store['a']
+
+            assert_frame_equal(result, df)
+            self.assertEquals(type(result.index), type(df.index))
+            self.assertEquals(result.index.freq, df.index.freq)

     def test_unicode_index(self):
+
         unicode_values = [u'\u03c3', u'\u03c3\u03c3']
         warnings.filterwarnings('ignore', category=PerformanceWarning)
         s = Series(np.random.randn(len(unicode_values)), unicode_values)
         self._check_roundtrip(s, tm.assert_series_equal)
         warnings.filterwarnings('always', category=PerformanceWarning)
-
     def test_store_datetime_mixed(self):
+
         df = DataFrame(
             {'a': [1, 2, 3], 'b': [1., 2., 3.], 'c': ['a', 'b', 'c']})
         ts = tm.makeTimeSeries()
         df['d'] = ts.index[:3]
         self._check_roundtrip(df, tm.assert_frame_equal)
-
+
     # def test_cant_write_multiindex_table(self):
     #     # for now, #1848
     #     df = DataFrame(np.random.randn(10, 4),
     #                    index=[np.arange(5).repeat(2),
     #                           np.tile(np.arange(2), 5)])
-    #     self.assertRaises(Exception, self.store.put, 'foo', df, table=True)
+    #     self.assertRaises(Exception, store.put, 'foo', df, table=True)

 def curpath():

From eb2c048525420e48fa7e8a80e2a0e2b4dbe91fcc Mon Sep 17 00:00:00 2001
From: jreback
Date: Fri, 1 Feb 2013 19:23:05 -0500
Subject: [PATCH 3/7] DOC: added DataTypes section to HDFStore

---
 doc/source/io.rst | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 2b0145dba5f24..288940585c6fe 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1363,6 +1363,32 @@ Notes & Caveats
    # we have provided a minimum minor_axis indexable size
    store.root.wp_big_strings.table

+DataTypes
+~~~~~~~~~
+
+``HDFStore`` will map an object dtype to the underlying ``PyTables`` dtype. This means the following types are known to work:
+
+    - floating : ``float64, float32, float16`` *(using* ``np.nan`` *to represent invalid values)*
+    - integer : ``int64, int32, int8, uint64, uint32, uint8``
+    - bool
+    - datetime64[ns] *(using* ``NaT`` *to represent invalid values)*
+    - object : ``strings`` *(using* ``np.nan`` *to represent invalid values)*
+
+Currently, ``unicode`` and ``datetime`` columns (represented with a dtype of ``object``) **WILL FAIL**. In addition, even though a column may look like a ``datetime64[ns]``,
+if it contains ``np.nan``, this **WILL FAIL**. You can try to convert datetime-like columns to proper ``datetime64[ns]`` columns, which may contain ``NaT`` to represent invalid values. (Some of these issues have been addressed, and these conversions may not be necessary in future versions of pandas.)
+
+   ..
ipython:: python + + import datetime + df = DataFrame(dict(datelike = Series([datetime.datetime(2001,1,1),datetime.datetime(2001,1,2),np.nan]))) + df + df.dtypes + + # to convert + df['datelike'] = Series(df['datelike'].values,dtype='M8[ns]') + df + df.dtypes + External Compatibility ~~~~~~~~~~~~~~~~~~~~~~ From 7065ff0e9ecf3516544591a709f95202f3f38acb Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 7 Feb 2013 12:09:18 -0500 Subject: [PATCH 4/7] ENH: provide dotted (attribute) access in stores (e.g. store.df == store['df']) --- RELEASE.rst | 6 +- doc/source/io.rst | 3 + doc/source/v0.10.2.txt | 18 +++++ doc/source/whatsnew.rst | 2 + pandas/io/pytables.py | 121 +++++++++++++++++-------------- pandas/io/tests/test_pytables.py | 36 ++++++++- 6 files changed, 125 insertions(+), 61 deletions(-) create mode 100644 doc/source/v0.10.2.txt diff --git a/RELEASE.rst b/RELEASE.rst index ae955e204f036..021b3e64e12f8 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -22,7 +22,11 @@ Where to get it * Binary installers on PyPI: http://pypi.python.org/pypi/pandas * Documentation: http://pandas.pydata.org - - Fix weird PyTables error when using too many selectors in a where + ``HDFStore`` + + - Fix weird PyTables error when using too many selectors in a where + - Provide dotted attribute access to ``get`` from stores (e.g. store.df == store['df']) + - Internally, change all variables to be private-like (now have leading underscore) pandas 0.10.1 ============= diff --git a/doc/source/io.rst b/doc/source/io.rst index 288940585c6fe..100ca9e251234 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1021,6 +1021,9 @@ In a current or later Python session, you can retrieve stored objects: # store.get('df') is an equivalent method store['df'] + # dotted (attribute) access provides get as well + store.df + Deletion of the object specified by the key .. ipython:: python diff --git a/doc/source/v0.10.2.txt b/doc/source/v0.10.2.txt new file mode 100644 index 0000000000000..e9fed5b36f3cd --- /dev/null +++ b/doc/source/v0.10.2.txt @@ -0,0 +1,18 @@ +.. _whatsnew_0102: + +v0.10.2 (February ??, 2013) +--------------------------- + +This is a minor release from 0.10.1 and includes many new features and +enhancements along with a large number of bug fixes. There are also a number of +important API changes that long-time pandas users should pay close attention +to. + +**Enhancements** + + - In ``HDFStore``, provide dotted attribute access to ``get`` from stores (e.g. store.df == store['df']) + +See the `full release notes +`__ or issue tracker +on GitHub for a complete list. + diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 6c125c45a2599..646610ecccd88 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -16,6 +16,8 @@ What's New These are new features and improvements of note in each release. +.. include:: v0.10.2.txt + .. include:: v0.10.1.txt .. 
include:: v0.10.0.txt diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c18db458ecdf3..1a00ff522ccda 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -197,19 +197,19 @@ def __init__(self, path, mode='a', complevel=None, complib=None, except ImportError: # pragma: no cover raise Exception('HDFStore requires PyTables') - self.path = path - self.mode = mode - self.handle = None - self.complevel = complevel - self.complib = complib - self.fletcher32 = fletcher32 - self.filters = None + self._path = path + self._mode = mode + self._handle = None + self._complevel = complevel + self._complib = complib + self._fletcher32 = fletcher32 + self._filters = None self.open(mode=mode, warn=False) @property def root(self): """ return the root node """ - return self.handle.root + return self._handle.root def __getitem__(self, key): return self.get(key) @@ -220,10 +220,19 @@ def __setitem__(self, key, value): def __delitem__(self, key): return self.remove(key) + def __getattr__(self, name): + """ allow attribute access to get stores """ + try: + return self.get(name) + except: + pass + raise AttributeError("'%s' object has no attribute '%s'" % + (type(self).__name__, name)) + def __contains__(self, key): """ check for existance of this key can match the exact pathname or the pathnm w/o the leading '/' - """ + """ node = self.get_node(key) if node is not None: name = node._v_pathname @@ -234,7 +243,7 @@ def __len__(self): return len(self.groups()) def __repr__(self): - output = '%s\nFile path: %s\n' % (type(self), self.path) + output = '%s\nFile path: %s\n' % (type(self), self._path) if len(self.keys()): keys = [] @@ -277,7 +286,7 @@ def open(self, mode='a', warn=True): mode : {'a', 'w', 'r', 'r+'}, default 'a' See HDFStore docstring or tables.openFile for info about modes """ - self.mode = mode + self._mode = mode if warn and mode == 'w': # pragma: no cover while True: response = raw_input("Re-opening as mode='w' will delete the " @@ -286,22 +295,22 @@ def open(self, mode='a', warn=True): break elif response == 'n': return - if self.handle is not None and self.handle.isopen: - self.handle.close() + if self._handle is not None and self._handle.isopen: + self._handle.close() - if self.complib is not None: - if self.complevel is None: - self.complevel = 9 - self.filters = _tables().Filters(self.complevel, - self.complib, - fletcher32=self.fletcher32) + if self._complib is not None: + if self._complevel is None: + self._complevel = 9 + self._filters = _tables().Filters(self._complevel, + self._complib, + fletcher32=self._fletcher32) try: - self.handle = h5_open(self.path, self.mode) + self._handle = h5_open(self._path, self._mode) except IOError, e: # pragma: no cover if 'can not be written' in str(e): - print 'Opening %s in read-only mode' % self.path - self.handle = h5_open(self.path, 'r') + print 'Opening %s in read-only mode' % self._path + self._handle = h5_open(self._path, 'r') else: raise @@ -309,13 +318,13 @@ def close(self): """ Close the PyTables file handle """ - self.handle.close() + self._handle.close() def flush(self): """ Force all buffered modifications to be written to disk """ - self.handle.flush() + self._handle.flush() def get(self, key): """ @@ -617,14 +626,14 @@ def create_table_index(self, key, **kwargs): def groups(self): """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """ _tables() - return [ g for g in self.handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) or 
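# Editor's sketch (not part of the patch): the dotted access added by
# __getattr__ above. Attribute lookup falls back to get(), so any stored
# key that is a valid identifier reads as an attribute; real attributes
# (now underscore-prefixed) still win the normal lookup. The file name
# is hypothetical.
from pandas import HDFStore
import pandas.util.testing as tm

demo = HDFStore('attr_demo.h5', mode='w')
demo['df'] = tm.makeTimeDataFrame()
tm.assert_frame_equal(demo.df, demo['df'])   # attribute == item access
demo.close()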
(isinstance(g,_table_mod.table.Table) and g._v_name != 'table') ] + return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != 'table') ] def get_node(self, key): """ return the node with the key or None if it does not exist """ try: if not key.startswith('/'): key = '/' + key - return self.handle.getNode(self.root, key) + return self._handle.getNode(self.root, key) except: return None @@ -751,7 +760,7 @@ def _write_to_group(self, key, value, index=True, table=False, append=False, com # remove the node if we are not appending if group is not None and not append: - self.handle.removeNode(group, recursive=True) + self._handle.removeNode(group, recursive=True) group = None if group is None: @@ -768,7 +777,7 @@ def _write_to_group(self, key, value, index=True, table=False, append=False, com new_path += p group = self.get_node(new_path) if group is None: - group = self.handle.createGroup(path, p) + group = self._handle.createGroup(path, p) path = new_path s = self._create_storer(group, value, table=table, append=append, **kwargs) @@ -1304,28 +1313,28 @@ def pathname(self): return self.group._v_pathname @property - def handle(self): - return self.parent.handle + def _handle(self): + return self.parent._handle @property def _quiet(self): return self.parent._quiet @property - def filters(self): - return self.parent.filters + def _filters(self): + return self.parent._filters @property - def complevel(self): - return self.parent.complevel + def _complevel(self): + return self.parent._complevel @property - def fletcher32(self): - return self.parent.fletcher32 + def _fletcher32(self): + return self.parent._fletcher32 @property - def complib(self): - return self.parent.complib + def _complib(self): + return self.parent._complib @property def attrs(self): @@ -1380,7 +1389,7 @@ def write(self, **kwargs): def delete(self, where = None, **kwargs): """ support fully deleting the node in its entirety (only) - where specification must be None """ if where is None: - self.handle.removeNode(self.group, recursive=True) + self._handle.removeNode(self.group, recursive=True) return None raise NotImplementedError("cannot delete on an abstract storer") @@ -1583,7 +1592,7 @@ def read_index_node(self, node): def write_array(self, key, value): if key in self.group: - self.handle.removeNode(self.group, key) + self._handle.removeNode(self.group, key) # Transform needed to interface with pytables row/col notation empty_array = any(x == 0 for x in value.shape) @@ -1593,7 +1602,7 @@ def write_array(self, key, value): value = value.T transposed = True - if self.filters is not None: + if self._filters is not None: atom = None try: # get the atom for this datatype @@ -1603,9 +1612,9 @@ def write_array(self, key, value): if atom is not None: # create an empty chunked array and fill it from value - ca = self.handle.createCArray(self.group, key, atom, + ca = self._handle.createCArray(self.group, key, atom, value.shape, - filters=self.filters) + filters=self._filters) ca[:] = value getattr(self.group, key)._v_attrs.transposed = transposed return @@ -1622,21 +1631,21 @@ def write_array(self, key, value): ws = performance_doc % (inferred_type,key) warnings.warn(ws, PerformanceWarning) - vlarr = self.handle.createVLArray(self.group, key, + vlarr = self._handle.createVLArray(self.group, key, _tables().ObjectAtom()) vlarr.append(value) elif value.dtype.type == np.datetime64: - self.handle.createArray(self.group, key, 
value.view('i8')) + self._handle.createArray(self.group, key, value.view('i8')) getattr(self.group, key)._v_attrs.value_type = 'datetime64' else: if empty_array: # ugly hack for length 0 axes arr = np.empty((1,) * value.ndim) - self.handle.createArray(self.group, key, arr) + self._handle.createArray(self.group, key, arr) getattr(self.group, key)._v_attrs.value_type = str(value.dtype) getattr(self.group, key)._v_attrs.shape = value.shape else: - self.handle.createArray(self.group, key, value) + self._handle.createArray(self.group, key, value) getattr(self.group, key)._v_attrs.transposed = transposed @@ -1729,7 +1738,7 @@ def write(self, obj, **kwargs): for name, ss in obj.iteritems(): key = 'sparse_series_%s' % name if key not in self.group._v_children: - node = self.handle.createGroup(self.group, key) + node = self._handle.createGroup(self.group, key) else: node = getattr(self.group, key) s = SparseSeriesStorer(self.parent, node) @@ -1763,7 +1772,7 @@ def write(self, obj, **kwargs): for name, sdf in obj.iteritems(): key = 'sparse_frame_%s' % name if key not in self.group._v_children: - node = self.handle.createGroup(self.group, key) + node = self._handle.createGroup(self.group, key) else: node = getattr(self.group, key) s = SparseFrameStorer(self.parent, node) @@ -2293,13 +2302,13 @@ def create_description(self, complib=None, complevel=None, fletcher32=False, exp if complib: if complevel is None: - complevel = self.complevel or 9 + complevel = self._complevel or 9 filters = _tables().Filters(complevel=complevel, complib=complib, - fletcher32=fletcher32 or self.fletcher32) + fletcher32=fletcher32 or self._fletcher32) d['filters'] = filters - elif self.filters is not None: - d['filters'] = self.filters + elif self._filters is not None: + d['filters'] = self._filters return d @@ -2484,7 +2493,7 @@ def write(self, obj, axes=None, append=False, complib=None, expectedrows=None, **kwargs): if not append and self.is_exists: - self.handle.removeNode(self.group, 'table') + self._handle.removeNode(self.group, 'table') # create the axes self.create_axes(axes=axes, obj=obj, validate=append, @@ -2502,7 +2511,7 @@ def write(self, obj, axes=None, append=False, complib=None, self.set_attrs() # create the table - table = self.handle.createTable(self.group, **options) + table = self._handle.createTable(self.group, **options) else: table = self.table @@ -2579,7 +2588,7 @@ def delete(self, where=None, **kwargs): # delete all rows (and return the nrows) if where is None or not len(where): nrows = self.nrows - self.handle.removeNode(self.group, recursive=True) + self._handle.removeNode(self.group, recursive=True) return nrows # infer the data kind diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index ff4d4fa788e6b..a4df428d60d90 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -132,7 +132,7 @@ def test_repr(self): store['df'] = df # make a random group in hdf space - store.handle.createGroup(store.handle.root,'bah') + store._handle.createGroup(store._handle.root,'bah') repr(store) str(store) @@ -184,7 +184,7 @@ def test_reopen_handle(self): with ensure_clean(self.path) as store: store['a'] = tm.makeTimeSeries() store.open('w', warn=False) - self.assert_(store.handle.isopen) + self.assert_(store._handle.isopen) self.assertEquals(len(store), 0) def test_flush(self): @@ -207,6 +207,34 @@ def test_get(self): self.assertRaises(KeyError, store.get, 'b') + def test_getattr(self): + + with ensure_clean(self.path) as store: + + s = 
tm.makeTimeSeries() + store['a'] = s + + # test attribute access + result = store.a + tm.assert_series_equal(result, s) + result = getattr(store,'a') + tm.assert_series_equal(result, s) + + df = tm.makeTimeDataFrame() + store['df'] = df + result = store.df + tm.assert_frame_equal(result, df) + + # errors + self.assertRaises(AttributeError, getattr, store, 'd') + + for x in ['mode','path','handle','complib']: + self.assertRaises(AttributeError, getattr, store, x) + + # not stores + for x in ['mode','path','handle','complib']: + getattr(store,"_%s" % x) + def test_put(self): with ensure_clean(self.path) as store: @@ -562,8 +590,8 @@ def test_append_with_data_columns(self): tm.assert_frame_equal(store['df'], df) # check that we have indicies created - assert(store.handle.root.df.table.cols.index.is_indexed is True) - assert(store.handle.root.df.table.cols.B.is_indexed is True) + assert(store._handle.root.df.table.cols.index.is_indexed is True) + assert(store._handle.root.df.table.cols.B.is_indexed is True) # data column searching result = store.select('df', [Term('B>0')]) From eb608d4c9cef53d8414018fd5d4916fdee288c3a Mon Sep 17 00:00:00 2001 From: Alvaro Tejero-Cantero Date: Sat, 9 Feb 2013 15:26:08 +0000 Subject: [PATCH 5/7] DOC: Paragraph line-length in HDFStore section limited to 79 (PEP8). --- doc/source/conf.py | 2 + doc/source/io.rst | 322 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 244 insertions(+), 80 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 76093d83b32e7..9f086be82eafc 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -19,6 +19,8 @@ # sys.path.append(os.path.abspath('.')) sys.path.insert(0, os.path.abspath('../sphinxext')) +sys.path.insert(0, '/home/e0/repos/jrb_pytb7') + sys.path.extend([ # numpy standard doc extensions diff --git a/doc/source/io.rst b/doc/source/io.rst index 100ca9e251234..c95c8426e7880 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -975,8 +975,8 @@ one can use the ExcelWriter class, as in the following example: HDF5 (PyTables) --------------- -``HDFStore`` is a dict-like object which reads and writes pandas to the high -performance HDF5 format using the excellent `PyTables +``HDFStore`` is a dict-like object which reads and writes pandas to the +high performance HDF5 format using the excellent `PyTables `__ library. .. ipython:: python @@ -990,7 +990,8 @@ performance HDF5 format using the excellent `PyTables store = HDFStore('store.h5') print store -Objects can be written to the file just like adding key-value pairs to a dict: +Objects can be written to the file just like adding key-value pairs to a +dict: .. ipython:: python @@ -1040,7 +1041,7 @@ Closing a Store # closing a store store.close() - # Working with, and automatically closing the store with the context manager. + # Working with, and automatically closing the store with the context manager with get_store('store.h5') as store: store.keys() @@ -1052,15 +1053,19 @@ Closing a Store os.remove('store.h5') -These stores are **not** appendable once written (though you can simply remove them and rewrite). Nor are they **queryable**; they must be retrieved in their entirety. +These stores are **not** appendable once written (though you can simply +remove them and rewrite). Nor are they **queryable**; they must be +retrieved in their entirety. Storing in Table format ~~~~~~~~~~~~~~~~~~~~~~~ -``HDFStore`` supports another ``PyTables`` format on disk, the ``table`` format. 
Conceptually a ``table`` is shaped -very much like a DataFrame, with rows and columns. A ``table`` may be appended to in the same or other sessions. -In addition, delete & query type operations are supported. +``HDFStore`` supports another ``PyTables`` format on disk, the ``table`` +format. Conceptually a ``table`` is shaped very much like a DataFrame, +with rows and columns. A ``table`` may be appended to in the same or +other sessions. In addition, delete & query type operations are +supported. .. ipython:: python :suppress: @@ -1088,7 +1093,12 @@ In addition, delete & query type operations are supported. Hierarchical Keys ~~~~~~~~~~~~~~~~~ -Keys to a store can be specified as a string. These can be in a hierarchical path-name like format (e.g. ``foo/bar/bah``), which will generate a hierarchy of sub-stores (or ``Groups`` in PyTables parlance). Keys can be specified with out the leading '/' and are ALWAYS absolute (e.g. 'foo' refers to '/foo'). Removal operations can remove everying in the sub-store and BELOW, so be *careful*. +Keys to a store can be specified as a string. These can be in a +hierarchical path-name like format (e.g. ``foo/bar/bah``), which will +generate a hierarchy of sub-stores (or ``Groups`` in PyTables +parlance). Keys can be specified without the leading '/' and are ALWAYS +absolute (e.g. 'foo' refers to '/foo'). Removal operations can remove +everything in the sub-store and BELOW, so be *careful*. .. ipython:: python @@ -1107,8 +1117,16 @@ Keys to a store can be specified as a string. These can be in a hierarchical pat Storing Mixed Types in a Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Storing mixed-dtype data is supported. Strings are store as a fixed-width using the maximum size of the appended column. Subsequent appends will truncate strings at this length. -Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools, datetime64`` are currently supported. For string columns, passing ``nan_rep = 'nan'`` to append will change the default nan representation on disk (which converts to/from `np.nan`), this defaults to `nan`. +Storing mixed-dtype data is supported. Strings are store as a +fixed-width using the maximum size of the appended column. Subsequent +appends will truncate strings at this length. + +Passing ``min_itemsize = { `values` : size }`` as a parameter to append +will set a larger minimum for the string columns. Storing ``floats, +strings, ints, bools, datetime64`` are currently supported. For string +columns, passing ``nan_rep = 'nan'`` to append will change the default +nan representation on disk (which converts to/from `np.nan`), this +defaults to `nan`. .. ipython:: python @@ -1130,7 +1148,8 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set Storing Multi-Index DataFrames ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Storing multi-index dataframes as tables is very similar to storing/selecting from homogenous index DataFrames. +Storing multi-index dataframes as tables is very similar to +storing/selecting from homogenous index DataFrames. ..
ipython:: python @@ -1146,21 +1165,29 @@ Storing multi-index dataframes as tables is very similar to storing/selecting fr store.append('df_mi',df_mi) store.select('df_mi') - # the levels are automatically included as data columns + # the levels are automatically included as data columns store.select('df_mi', Term('foo=bar')) Querying a Table ~~~~~~~~~~~~~~~~ + -``select`` and ``delete`` operations have an optional criteria that can be specified to select/delete only -a subset of the data. This allows one to have a very large on-disk table and retrieve only a portion of the data. +``select`` and ``delete`` operations have an optional criteria that can +be specified to select/delete only a subset of the data. This allows one +to have a very large on-disk table and retrieve only a portion of the +data. A query is specified using the ``Term`` class under the hood. - 'index' and 'columns' are supported indexers of a DataFrame - - 'major_axis', 'minor_axis', and 'items' are supported indexers of the Panel + - 'major_axis', 'minor_axis', and 'items' are supported indexers of + the Panel -Valid terms can be created from ``dict, list, tuple, or string``. Objects can be embeded as values. Allowed operations are: ``<, <=, >, >=, =``. ``=`` will be inferred as an implicit set operation (e.g. if 2 or more values are provided). The following are all valid terms. +Valid terms can be created from ``dict, list, tuple, or +string``. Objects can be embedded as values. Allowed operations are: ``<, +<=, >, >=, =``. ``=`` will be inferred as an implicit set operation +(e.g. if 2 or more values are provided). The following are all valid +terms. - ``dict(field = 'index', op = '>', value = '20121114')`` - ``('index', '>', '20121114')`` - ``'index > 20121114'`` - ``('index', '>', datetime(2012,11,14))`` - ``('index', ['20121114','20121115'])`` - ``('major_axis', '=', Timestamp('2012/11/14'))`` - ``('minor_axis', ['A','B'])`` -Queries are built up using a list of ``Terms`` (currently only **anding** of terms is supported). An example query for a panel might be specified as follows. -``['major_axis>20000102', ('minor_axis', '=', ['A','B']) ]``. This is roughly translated to: `major_axis must be greater than the date 20000102 and the minor_axis must be A or B` +Queries are built up using a list of ``Terms`` (currently only +**anding** of terms is supported). An example query for a panel might be +specified as follows. ``['major_axis>20000102', ('minor_axis', '=', +['A','B']) ]``. This is roughly translated to: `major_axis must be +greater than the date 20000102 and the minor_axis must be A or B` .. ipython:: python @@ -1179,13 +1209,16 @@ Queries are built up using a list of ``Terms`` (currently only **anding** of ter store store.select('wp',[ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ]) -The ``columns`` keyword can be supplied to select to filter a list of the return columns, this is equivalent to passing a ``Term('columns',list_of_columns_to_filter)`` +The ``columns`` keyword can be supplied to select to filter a list of +the return columns; this is equivalent to passing a +``Term('columns',list_of_columns_to_filter)`` .. ipython:: python store.select('df', columns = ['A','B']) -Start and Stop parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. +Start and Stop parameters can be specified to limit the total search +space. These are in terms of the total number of rows in a table.
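For very large tables, ``start``/``stop`` also give a simple way to pull a table back in fixed-size chunks — a minimal sketch (it assumes the ``df`` table from above is in ``store``; the chunk size is illustrative)::

    # total number of rows, from the storer metadata
    nrows = store.get_storer('df').nrows

    # each select reads only rows [i, i + chunksize) from disk
    chunksize = 100000
    pieces = [ store.select('df', start=i, stop=i + chunksize)
               for i in xrange(0, nrows, chunksize) ]
    result = concat(pieces)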
.. ipython:: python @@ -1198,7 +1231,15 @@ Start and Stop parameters can be specified to limit the total search space. Thes Indexing ~~~~~~~~ + -You can create/modify an index for a table with ``create_table_index`` after data is already in the table (after and ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the indexed dimension as the ``where``. **Indexes are automagically created (starting 0.10.1)** on the indexables and any data columns you specify. This behavior can be turned off by passing ``index=False`` to ``append``. +You can create/modify an index for a table with ``create_table_index`` +after data is already in the table (after an ``append/put`` +operation). Creating a table index is **highly** encouraged. This will +speed your queries a great deal when you use a ``select`` with the +indexed dimension as the ``where``. **Indexes are automagically created +(starting 0.10.1)** on the indexables and any data columns you +specify. This behavior can be turned off by passing ``index=False`` to +``append``. .. ipython:: python @@ -1214,7 +1255,13 @@ You can create/modify an index for a table with ``create_table_index`` after dat Query via Data Columns ~~~~~~~~~~~~~~~~~~~~~~ + -You can designate (and index) certain columns that you want to be able to perform queries (other than the `indexable` columns, which you can always query). For instance say you want to perform this common operation, on-disk, and return just the frame that matches this query. You can specify ``data_columns = True`` to force all columns to be data_columns +You can designate (and index) certain columns that you want to be able +to perform queries (other than the `indexable` columns, which you can +always query). For instance, say you want to perform this common +operation, on-disk, and return just the frame that matches this +query. You can specify ``data_columns = True`` to force all columns to +be data_columns .. ipython:: python @@ -1235,17 +1282,24 @@ You can designate (and index) certain columns that you want to be able to perfor # this is in-memory version of this type of selection df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] - # we have automagically created this index and that the B/C/string/string2 columns are stored separately as ``PyTables`` columns + # we have automagically created this index and note that the B/C/string/string2 + # columns are stored separately as ``PyTables`` columns store.root.df_dc.table -There is some performance degredation by making lots of columns into `data columns`, so it is up to the user to designate these. In addition, you cannot change data columns (nor indexables) after the first append/put operation (Of course you can simply read in the data and create a new table!) +There is some performance degradation by making lots of columns into +`data columns`, so it is up to the user to designate these. In addition, +you cannot change data columns (nor indexables) after the first +append/put operation (Of course you can simply read in the data and +create a new table!) Advanced Queries ~~~~~~~~~~~~~~~~ **Unique** -To retrieve the *unique* values of an indexable or data column, use the method ``unique``. This will, for example, enable you to get the index very quickly. Note ``nan`` are excluded from the result set. +To retrieve the *unique* values of an indexable or data column, use the +method ``unique``. This will, for example, enable you to get the index +very quickly.
Note ``nan`` are excluded from the result set. .. ipython:: python store.unique('df_dc','index') store.unique('df_dc','string') **Replicating or** -``not`` and ``or`` conditions are unsupported at this time; however, ``or`` operations are easy to replicate, by repeately applying the criteria to the table, and then ``concat`` the results. +``not`` and ``or`` conditions are unsupported at this time; however, +``or`` operations are easy to replicate, by repeately applying the +criteria to the table, and then ``concat`` the results. .. ipython:: python @@ -1265,42 +1321,70 @@ To retrieve the *unique* values of an indexable or data column, use the method ` **Storer Object** -If you want to inspect the stored object, retrieve via ``get_storer``. You could use this progamatically to say get the number of rows in an object. +If you want to inspect the stored object, retrieve via +``get_storer``. You could use this progamatically to say get the number +of rows in an object. .. ipython:: python store.get_storer('df_dc').nrows + Multiple Table Queries ~~~~~~~~~~~~~~~~~~~~~~ -New in 0.10.1 are the methods ``append_to_multple`` and ``select_as_multiple``, that can perform appending/selecting from multiple tables at once. The idea is to have one table (call it the selector table) that you index most/all of the columns, and perform your queries. The other table(s) are data tables that are indexed the same the selector table. You can then perform a very fast query on the selector table, yet get lots of data back. This method works similar to having a very wide-table, but is more efficient in terms of queries. +New in 0.10.1 are the methods ``append_to_multiple`` and +``select_as_multiple``, that can perform appending/selecting from +multiple tables at once. The idea is to have one table (call it the +selector table) that you index most/all of the columns, and perform your +queries. The other table(s) are data tables that are indexed the same +the selector table. You can then perform a very fast query on the +selector table, yet get lots of data back. This method works similarly to +having a very wide-table, but is more efficient in terms of queries. -Note, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means, append to the tables in the same order; ``append_to_multiple`` splits a single object to multiple tables, given a specification (as a dictionary). This dictionary is a mapping of the table names to the 'columns' you want included in that table. Pass a `None` for a single table (optional) to let it have the remaining columns. The argument ``selector`` defines which table is the selector table. +Note, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This +means, append to the tables in the same order; ``append_to_multiple`` +splits a single object to multiple tables, given a specification (as a +dictionary). This dictionary is a mapping of the table names to the +'columns' you want included in that table. Pass a `None` for a single +table (optional) to let it have the remaining columns. The argument +``selector`` defines which table is the selector table.
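Synchronizing by hand, before reaching for the combined helpers shown in the block below, looks roughly like this sketch (it assumes the ``df_mt`` frame built below; the table names are illustrative)::

    # split the columns by hand and append both pieces in the same order,
    # one append per table, every time new rows arrive
    df_sel = df_mt[['A', 'B']]
    df_rest = df_mt[[c for c in df_mt.columns if c not in ['A', 'B']]]
    store.append('df1_manual', df_sel)
    store.append('df2_manual', df_rest)

``append_to_multiple`` does exactly this bookkeeping for you, as the following block shows.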
.. ipython:: python - df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8), + df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8), columns=['A', 'B', 'C', 'D', 'E', 'F']) df_mt['foo'] = 'bar' # you can also create the tables individually - store.append_to_multiple({ 'df1_mt' : ['A','B'], 'df2_mt' : None }, df_mt, selector = 'df1_mt') + store.append_to_multiple({ 'df1_mt' : ['A','B'], 'df2_mt' : None }, + df_mt, selector = 'df1_mt') store # indiviual tables were created store.select('df1_mt') store.select('df2_mt') - + # as a multiple store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], selector = 'df1_mt') - + Delete from a Table ~~~~~~~~~~~~~~~~~~~ -You can delete from a table selectively by specifying a ``where``. In deleting rows, it is important to understand the ``PyTables`` deletes rows by erasing the rows, then **moving** the following data. Thus deleting can potentially be a very expensive operation depending on the orientation of your data. This is especially true in higher dimensional objects (``Panel`` and ``Panel4D``). To get optimal deletion speed, it pays to have the dimension you are deleting be the first of the ``indexables``. -Data is ordered (on the disk) in terms of the ``indexables``. Here's a simple use case. You store panel type data, with dates in the ``major_axis`` and ids in the ``minor_axis``. The data is then interleaved like this: +You can delete from a table selectively by specifying a ``where``. In +deleting rows, it is important to understand that ``PyTables`` deletes +rows by erasing the rows, then **moving** the following data. Thus +deleting can potentially be a very expensive operation depending on the +orientation of your data. This is especially true in higher dimensional +objects (``Panel`` and ``Panel4D``). To get optimal deletion speed, it +pays to have the dimension you are deleting be the first of the +``indexables``. + +Data is ordered (on the disk) in terms of the ``indexables``. Here's a +simple use case. You store panel type data, with dates in the +``major_axis`` and ids in the ``minor_axis``. The data is then +interleaved like this: - date_1 - id_1 - id_2 - . - id_n - date_2 - id_1 - . - id_n -It should be clear that a delete operation on the ``major_axis`` will be fairly quick, as one chunk is removed, then the following data moved. On the other hand a delete operation on the ``minor_axis`` will be very expensive. In this case it would almost certainly be faster to rewrite the table using a ``where`` that selects all but the missing data. +It should be clear that a delete operation on the ``major_axis`` will be +fairly quick, as one chunk is removed, then the following data moved. On +the other hand, a delete operation on the ``minor_axis`` will be very +expensive. In this case it would almost certainly be faster to rewrite +the table using a ``where`` that selects all but the missing data. .. ipython:: python @@ -1320,73 +1408,117 @@ It should be clear that a delete operation on the ``major_axis`` will be fairly store.remove('wp', 'major_axis>20000102' ) store.select('wp')
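The rewrite strategy suggested above for expensive ``minor_axis`` deletes can be sketched like this (the replacement is illustrative): select only what you want to *keep*, then write it out as a fresh table::

    # keep minor_axis entries A and B; cheaper than deleting the rest in place
    kept = store.select('wp', [ Term('minor_axis', '=', ['A', 'B']) ])
    store.remove('wp')
    store.put('wp', kept, table=True)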
-Please note that HDF5 **DOES NOT RECLAIM SPACE** in the h5 files automatically. +Please note that HDF5 **DOES NOT RECLAIM SPACE** in the h5 files +automatically. Thus, repeatedly deleting (or removing nodes) and adding +again **WILL TEND TO INCREASE THE FILE SIZE**. To *clean* the file, use +``ptrepack`` (see below). Compression ~~~~~~~~~~~ -``PyTables`` allows the stored data to be compressed. Tthis applies to all kinds of stores, not just tables. - - Pass ``complevel=int`` for a compression level (1-9, with 0 being no compression, and the default) - - Pass ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for whichever compression library you prefer. +``PyTables`` allows the stored data to be compressed. This applies to +all kinds of stores, not just tables. + - Pass ``complevel=int`` for a compression level (1-9, with 0 being no + compression, and the default) + - Pass ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for + whichever compression library you prefer. + +``HDFStore`` will use the file based compression scheme if no overriding +``complib`` or ``complevel`` options are provided. ``blosc`` offers very +fast compression, and is the one I use most often. Note that ``lzo`` and ``bzip2`` +may not be installed (by Python) by default. Compression for all objects within the file - ``store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc')`` -Or on-the-fly compression (this only applies to tables). You can turn off file compression for a specific table by passing ``complevel=0`` +Or on-the-fly compression (this only applies to tables). You can turn +off file compression for a specific table by passing ``complevel=0`` - ``store.append('df', df, complib='zlib', complevel=5)`` **ptrepack** -``PyTables`` offer better write performance when compressed after writing them, as opposed to turning on compression at the very beginning. You can use the supplied ``PyTables`` utility ``ptrepack``. In addition, ``ptrepack`` can change compression levels after the fact. +``PyTables`` offers better write performance when tables are compressed after +writing, as opposed to turning on compression at the very +beginning. You can use the supplied ``PyTables`` utility +``ptrepack``. In addition, ``ptrepack`` can change compression levels +after the fact. - ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5`` -Furthermore ``ptrepack in.h5 out.h5`` will *repack* the file to allow you to reuse previously deleted space. Aalternatively, one can simply remove the file and write again, or use the ``copy`` method. +Furthermore ``ptrepack in.h5 out.h5`` will *repack* the file to allow +you to reuse previously deleted space. Alternatively, one can simply +remove the file and write again, or use the ``copy`` method.
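Putting the two compression routes side by side — file-wide at open time versus per-table at append time — a short sketch (file and key names are illustrative)::

    # file-wide: everything written to this store inherits blosc level 9
    c_store = HDFStore('compressed.h5', complevel=9, complib='blosc')
    c_store.append('df', df)

    # per-table override: zlib for this table only (complevel=0 would disable)
    c_store.append('df_zlib', df, complib='zlib', complevel=5)
    c_store.close()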
Notes & Caveats ~~~~~~~~~~~~~~~ - - Once a ``table`` is created its items (Panel) / columns (DataFrame) are fixed; only exactly the same columns can be appended - - You can not append/select/delete to a non-table (table creation is determined on the first append, or by passing ``table=True`` in a put operation) - - ``HDFStore`` is **not-threadsafe for writing**. The underlying ``PyTables`` only supports concurrent reads (via threading or processes). If you need reading and writing *at the same time*, you need to serialize these operations in a single thread in a single process. You will corrupt your data otherwise. See the issue for more information. - - ``PyTables`` only supports fixed-width string columns in ``tables``. The sizes of a string based indexing column (e.g. *columns* or *minor_axis*) are determined as the maximum size of the elements in that axis or by passing the parameter ``min_itemsize`` on the first table creation (``min_itemsize`` can be an integer or a dict of column name to an integer). If subsequent appends introduce elements in the indexing axis that are larger than the supported indexer, an Exception will be raised (otherwise you could have a silent truncation of these indexers, leading to loss of information). Just to be clear, this fixed-width restriction applies to **indexables** (the indexing columns) and **string values** in a mixed_type table. + - Once a ``table`` is created its items (Panel) / columns (DataFrame) + are fixed; only exactly the same columns can be appended + - You cannot append/select/delete to a non-table (table creation is + determined on the first append, or by passing ``table=True`` in a + put operation) + - ``HDFStore`` is **not-threadsafe for writing**. The underlying + ``PyTables`` only supports concurrent reads (via threading or + processes). If you need reading and writing *at the same time*, you + need to serialize these operations in a single thread in a single + process. You will corrupt your data otherwise. See the issue + for more + information. + - ``PyTables`` only supports fixed-width string columns in + ``tables``. The sizes of a string based indexing column + (e.g. *columns* or *minor_axis*) are determined as the maximum size + of the elements in that axis or by passing the parameter + ``min_itemsize`` on the first table creation (``min_itemsize`` can + be an integer or a dict of column name to an integer). If + subsequent appends introduce elements in the indexing axis that are + larger than the supported indexer, an Exception will be raised + (otherwise you could have a silent truncation of these indexers, + leading to loss of information). Just to be clear, this fixed-width + restriction applies to **indexables** (the indexing columns) and + **string values** in a mixed_type table. .. ipython:: python - store.append('wp_big_strings', wp, min_itemsize = { 'minor_axis' : 30 }) - wp = wp.rename_axis(lambda x: x + '_big_strings', axis=2) - store.append('wp_big_strings', wp) - store.select('wp_big_strings') + store.append('wp_big_strings', wp, min_itemsize = { 'minor_axis' : 30 }) + wp = wp.rename_axis(lambda x: x + '_big_strings', axis=2) + store.append('wp_big_strings', wp) + store.select('wp_big_strings') - # we have provided a minimum minor_axis indexable size - store.root.wp_big_strings.table + # we have provided a minimum minor_axis indexable size + store.root.wp_big_strings.table
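The same reservation works for the *string values* of a mixed table via the ``values`` key — a sketch (the key name is illustrative)::

    dfs = DataFrame(dict(A = ['foo', 'bar']))

    # without this, the values block is sized to the longest string in the
    # first append, and longer strings arriving later would not fit
    store.append('dfs', dfs, min_itemsize = { 'values' : 50 })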
DataTypes ~~~~~~~~~ -``HDFStore`` will map an object dtype to the ``PyTables`` underlying dtype. +``HDFStore`` will map an object dtype to the ``PyTables`` underlying +dtype. This means the following types are known to work: - - floating : ``float64, float32, float16`` *(using* ``np.nan`` *to represent invalid values)* + - floating : ``float64, float32, float16`` *(using* ``np.nan`` *to + represent invalid values)* - integer : ``int64, int32, int8, uint64, uint32, uint8`` - bool - datetime64[ns] *(using* ``NaT`` *to represent invalid values)* - - object : ``strings`` *(using* ``np.nan`` *to represent invalid values)* + - object : ``strings`` *(using* ``np.nan`` *to represent invalid + values)* -Currently, ``unicode`` and ``datetime`` columns (represented with a dtype of ``object``), **WILL FAIL**. In addition, even though a column may look like a ``datetime64[ns]``, -if it contains ``np.nan``, this **WILL FAIL**. You can try to convert datetimelike columns to proper ``datetime64[ns]`` columns, that possibily contain ``NaT`` to represent invalid values. (Some of these issues have been addressed and these conversion may not be necessary in future versions of pandas) +Currently, ``unicode`` and ``datetime`` columns (represented with a +dtype of ``object``), **WILL FAIL**. In addition, even though a column +may look like a ``datetime64[ns]``, if it contains ``np.nan``, this +**WILL FAIL**. You can try to convert datetimelike columns to proper +``datetime64[ns]`` columns, that possibly contain ``NaT`` to represent +invalid values. (Some of these issues have been addressed and these +conversions may not be necessary in future versions of pandas) .. ipython:: python - + import datetime df = DataFrame(dict(datelike = Series([datetime.datetime(2001,1,1),datetime.datetime(2001,1,2),np.nan]))) df df.dtypes - + # to convert df['datelike'] = Series(df['datelike'].values,dtype='M8[ns]') df df.dtypes @@ -1395,17 +1527,22 @@ if it contains ``np.nan``, this **WILL FAIL**. You can try to convert datetimeli External Compatibility ~~~~~~~~~~~~~~~~~~~~~~ -``HDFStore`` write storer objects in specific formats suitable for producing loss-less roundtrips to pandas objects. For external compatibility, ``HDFStore`` can read native ``PyTables`` format tables. It is possible to write an ``HDFStore`` object that can easily be imported into ``R`` using the ``rhdf5`` library. Create a table format store like this: +``HDFStore`` writes storer objects in specific formats suitable for +producing loss-less roundtrips to pandas objects. For external +compatibility, ``HDFStore`` can read native ``PyTables`` format +tables. It is possible to write an ``HDFStore`` object that can easily +be imported into ``R`` using the ``rhdf5`` library. Create a table +format store like this: .. ipython:: python store_export = HDFStore('export.h5') - store_export.append('df_dc',df_dc,data_columns=df_dc.columns) - store_export + store_export.append('df_dc',df_dc,data_columns=df_dc.columns) + store_export .. ipython:: python :suppress: - + store_export.close() import os os.remove('export.h5')
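Because the data columns land as real ``PyTables`` columns, the exported file is readable without pandas at all — a sketch using the ``PyTables`` API of this era (it assumes ``export.h5`` has not yet been cleaned up)::

    import tables

    h5 = tables.openFile('export.h5')
    t = h5.root.df_dc.table                # a plain PyTables Table
    t[:3]                                  # first three records
    [ r['B'] for r in t.where('B > 0') ]   # query a data column directly
    h5.close()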
Backwards Compatibility ~~~~~~~~~~~~~~~~~~~~~~~ -0.10.1 of ``HDFStore`` is backwards compatible for reading tables created in a prior version of pandas however, query terms using the prior (undocumented) methodology are unsupported. ``HDFStore`` will issue a warning if you try to use a prior-version format file. You must read in the entire file and write it out using the new format, using the method ``copy`` to take advantage of the updates. The group attribute ``pandas_version`` contains the version information. ``copy`` takes a number of options, please see the docstring. +0.10.1 of ``HDFStore`` is backwards compatible for reading tables +created in a prior version of pandas; however, query terms using the +prior (undocumented) methodology are unsupported. ``HDFStore`` will +issue a warning if you try to use a prior-version format file. You must +read in the entire file and write it out using the new format, using the +method ``copy`` to take advantage of the updates. The group attribute +``pandas_version`` contains the version information. ``copy`` takes a +number of options, please see the docstring. .. ipython:: python :suppress: - + import os legacy_file_path = os.path.abspath('source/_static/legacy_0.10.h5') @@ -1429,27 +1573,40 @@ Backwards Compatibility legacy_store # copy (and return the new handle) - new_store = legacy_store.copy('store_new.h5') - new_store + new_store = legacy_store.copy('store_new.h5') + new_store new_store.close() .. ipython:: python :suppress: - + legacy_store.close() import os os.remove('store_new.h5') - + Performance ~~~~~~~~~~~ - - ``Tables`` come with a writing performance penalty as compared to regular stores. The benefit is the ability to append/delete and query (potentially very large amounts of data). - Write times are generally longer as compared with regular stores. Query times can be quite fast, especially on an indexed axis. - - You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will signficantly lower your memory usage on writing. - - You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of expectedrows that ``PyTables`` will expected. This will optimize read/write performance. - - Duplicate rows can be written to tables, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) - - A ``PerformanceWarning`` will be raised if you are attempting to store types that will be pickled by PyTables (rather than stored as endemic types). See for more information and some solutions. + - ``Tables`` come with a writing performance penalty as compared to + regular stores. The benefit is the ability to append/delete and + query (potentially very large amounts of data). Write times are + generally longer as compared with regular stores. Query times can + be quite fast, especially on an indexed axis. + - You can pass ``chunksize=an integer`` to ``append``, to change the + writing chunksize (default is 50000). This will significantly lower + your memory usage on writing. + - You can pass ``expectedrows=an integer`` to the first ``append``, + to set the TOTAL number of expectedrows that ``PyTables`` will + expect. This will optimize read/write performance. + - Duplicate rows can be written to tables, but are filtered out in + selection (with the last items being selected; thus a table is + unique on major, minor pairs) + - A ``PerformanceWarning`` will be raised if you are attempting to + store types that will be pickled by PyTables (rather than stored as + endemic types). See + + for more information and some solutions.
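The two write-side knobs from the list above combine naturally on the first ``append`` — a sketch with illustrative sizes (``df_big`` stands in for a large frame)::

    # size the table for ~1 million rows up front, and write in 100k-row
    # chunks to bound memory usage during the append
    store.append('df_big', df_big, expectedrows=1000000, chunksize=100000)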
Experimental ~~~~~~~~~~~~ @@ -1463,7 +1620,12 @@ HDFStore supports ``Panel4D`` storage. store.append('p4d', p4d) store -These, by default, index the three axes ``items, major_axis, minor_axis``. On an ``AppendableTable`` it is possible to setup with the first append a different indexing scheme, depending on how you want to store your data. Pass the ``axes`` keyword with a list of dimension (currently must by exactly 1 less than the total dimensions of the object). This cannot be changed after table creation. +These, by default, index the three axes ``items, major_axis, +minor_axis``. On an ``AppendableTable`` it is possible to set up with the +first append a different indexing scheme, depending on how you want to +store your data. Pass the ``axes`` keyword with a list of dimensions +(currently must be exactly 1 less than the total dimensions of the +object). This cannot be changed after table creation. .. ipython:: python From d13fa9f5fd7b8963582820f209cfb344011f1231 Mon Sep 17 00:00:00 2001 From: Alvaro Tejero-Cantero Date: Sat, 9 Feb 2013 15:41:56 +0000 Subject: [PATCH 6/7] DOC: Apply PEP8 to code chunks in HDFStore doc (except Term lists, for readability). --- doc/source/io.rst | 59 ++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index c95c8426e7880..9f5a4c88e5784 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -975,8 +975,8 @@ one can use the ExcelWriter class, as in the following example: HDF5 (PyTables) --------------- -``HDFStore`` is a dict-like object which reads and writes pandas to the -high performance HDF5 format using the excellent `PyTables +``HDFStore`` is a dict-like object which reads and writes pandas using +the high performance HDF5 format via the excellent `PyTables `__ library. .. ipython:: python @@ -1041,7 +1041,8 @@ Closing a Store # closing a store store.close() - # Working with, and automatically closing the store with the context manager + # Working with, and automatically closing the store with the context + # manager with get_store('store.h5') as store: store.keys() @@ -1137,7 +1138,7 @@ defaults to `nan`. df_mixed['datetime64'] = Timestamp('20010102') df_mixed.ix[3:5,['A','B','string','datetime64']] = np.nan - store.append('df_mixed', df_mixed, min_itemsize = { 'values' : 50 }) + store.append('df_mixed', df_mixed, min_itemsize = {'values': 50}) df_mixed1 = store.select('df_mixed') df_mixed1 df_mixed1.get_dtype_counts() @@ -1159,7 +1160,7 @@ storing/selecting from homogenous index DataFrames. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['foo', 'bar']) df_mi = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + columns=['A', 'B', 'C']) df_mi store.append('df_mi',df_mi) @@ -1192,10 +1193,10 @@ terms. - ``dict(field = 'index', op = '>', value = '20121114')`` - ``('index', '>', '20121114')`` - ``'index > 20121114'`` - - ``('index', '>', datetime(2012,11,14))`` - - ``('index', ['20121114','20121115'])`` + - ``('index', '>', datetime(2012, 11, 14))`` + - ``('index', ['20121114', '20121115'])`` - ``('major_axis', '=', Timestamp('2012/11/14'))`` - - ``('minor_axis', ['A','B'])`` + - ``('minor_axis', ['A', 'B'])`` Queries are built up using a list of ``Terms`` (currently only **anding** of terms is supported). An example query for a panel might be @@ -1207,7 +1208,7 @@ greater than the date 20000102 and the minor_axis must be A or B` store.append('wp',wp) store - store.select('wp',[ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ]) + store.select('wp', [ Term('major_axis>20000102'), Term('minor_axis', '=', ['A', 'B']) ]) The ``columns`` keyword can be supplied to select to filter a list of the return columns; this is equivalent to passing a ``Term('columns',list_of_columns_to_filter)`` ..
ipython:: python - store.select('df', columns = ['A','B']) + store.select('df', columns=['A', 'B']) Start and Stop parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. @@ -1226,7 +1227,9 @@ space. These are in terms of the total number of rows in a table. wp.to_frame() # limiting the search - store.select('wp',[ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ], start=0, stop=10) + store.select('wp',[ Term('major_axis>20000102'), + Term('minor_axis', '=', ['A','B']) ], + start=0, stop=10) Indexing ~~~~~~~~ @@ -1273,11 +1276,11 @@ be data_columns df_dc # on-disk operations - store.append('df_dc', df_dc, data_columns = ['B','C','string','string2']) + store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) - store.select('df_dc',[ Term('B>0') ]) + store.select('df_dc', [ Term('B>0') ]) # getting creative - store.select('df_dc',[ 'B > 0', 'C > 0', 'string == foo' ]) + store.select('df_dc', ['B > 0', 'C > 0', 'string == foo']) # this is in-memory version of this type of selection df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] @@ -1303,8 +1306,8 @@ very quickly. Note ``nan`` are excluded from the result set. .. ipython:: python - store.unique('df_dc','index') - store.unique('df_dc','string') + store.unique('df_dc', 'index') + store.unique('df_dc', 'string') **Replicating or** @@ -1317,7 +1320,7 @@ criteria to the table, and then ``concat`` the results. crit1 = [ Term('B>0'), Term('C>0'), Term('string=foo') ] crit2 = [ Term('B<0'), Term('C>0'), Term('string=foo') ] - concat([ store.select('df_dc',c) for c in [ crit1, crit2 ] ]) + concat([store.select('df_dc',c) for c in [crit1, crit2]]) **Storer Object** @@ -1357,8 +1360,8 @@ table (optional) to let it have the remaining columns. The argument df_mt['foo'] = 'bar' # you can also create the tables individually - store.append_to_multiple({ 'df1_mt' : ['A','B'], 'df2_mt' : None }, - df_mt, selector = 'df1_mt') + store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None }, + df_mt, selector='df1_mt') store # indiviual tables were created store.select('df1_mt') store.select('df2_mt') # as a multiple - store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], selector = 'df1_mt') + store.select_as_multiple(['df1_mt','df2_mt'], where=['A>0', 'B>0'], + selector = 'df1_mt') Delete from a Table ~~~~~~~~~~~~~~~~~~~ @@ -1431,7 +1435,8 @@ may not be installed (by Python) by default. Compression for all objects within the file - - ``store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc')`` + - ``store_compressed = HDFStore('store_compressed.h5', complevel=9, + complib='blosc')`` Or on-the-fly compression (this only applies to tables). You can turn off file compression for a specific table by passing ``complevel=0`` @@ -1446,7 +1451,8 @@ beginning. You can use the supplied ``PyTables`` utility ``ptrepack``. In addition, ``ptrepack`` can change compression levels after the fact. - ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5`` + ``ptrepack --chunkshape=auto --propindexes --complevel=9 + --complib=blosc in.h5 out.h5`` Furthermore ``ptrepack in.h5 out.h5`` will *repack* the file to allow you to reuse previously deleted space. Alternatively, one can simply remove the file and write again, or use the ``copy`` method. @@ -1515,12 +1521,13 @@ conversions may not be necessary in future versions of pandas) ..
ipython:: python import datetime - df = DataFrame(dict(datelike = Series([datetime.datetime(2001,1,1),datetime.datetime(2001,1,2),np.nan]))) + df = DataFrame(dict(datelike=Series([datetime.datetime(2001, 1, 1), + datetime.datetime(2001, 1, 2), np.nan]))) df df.dtypes # to convert - df['datelike'] = Series(df['datelike'].values,dtype='M8[ns]') + df['datelike'] = Series(df['datelike'].values, dtype='M8[ns]') df df.dtypes @@ -1537,7 +1544,7 @@ format store like this: store_export = HDFStore('export.h5') - store_export.append('df_dc',df_dc,data_columns=df_dc.columns) + store_export.append('df_dc', df_dc, data_columns=df_dc.columns) store_export .. ipython:: python :suppress: store_export.close() import os os.remove('export.h5') @@ -1629,7 +1636,7 @@ object). This cannot be changed after table creation. .. ipython:: python - store.append('p4d2', p4d, axes = ['labels','major_axis','minor_axis']) + store.append('p4d2', p4d, axes=['labels', 'major_axis', 'minor_axis']) store store.select('p4d2', [ Term('labels=l1'), Term('items=Item1'), Term('minor_axis=A_big_strings') ]) From ee52a4b6898ec52d980ce40a04ebf0fcffb0c1c1 Mon Sep 17 00:00:00 2001 From: Alvaro Tejero-Cantero Date: Sat, 9 Feb 2013 15:51:21 +0000 Subject: [PATCH 7/7] DOC: Typos + little PEP8 spacing instances and reverted inadvertent add at doc/source/conf.py --- doc/source/conf.py | 2 -- doc/source/io.rst | 30 +++++++++++++++--------------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 9f086be82eafc..76093d83b32e7 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -19,8 +19,6 @@ # sys.path.append(os.path.abspath('.')) sys.path.insert(0, os.path.abspath('../sphinxext')) -sys.path.insert(0, '/home/e0/repos/jrb_pytb7') - sys.path.extend([ # numpy standard doc extensions diff --git a/doc/source/io.rst b/doc/source/io.rst index 9f5a4c88e5784..a2f30dc14e29f 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1118,11 +1118,11 @@ everything in the sub-store and BELOW, so be *careful*. Storing Mixed Types in a Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Storing mixed-dtype data is supported. Strings are store as a +Storing mixed-dtype data is supported. Strings are stored as a fixed-width using the maximum size of the appended column. Subsequent appends will truncate strings at this length. -Passing ``min_itemsize = { `values` : size }`` as a parameter to append +Passing ``min_itemsize={`values`: size}`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools, datetime64`` are currently supported. For string columns, passing ``nan_rep = 'nan'`` to append will change the default nan representation on disk (which converts to/from `np.nan`), this defaults to `nan`. @@ -1136,7 +1136,7 @@ defaults to `nan`. df_mixed['int'] = 1 df_mixed['bool'] = True df_mixed['datetime64'] = Timestamp('20010102') - df_mixed.ix[3:5,['A','B','string','datetime64']] = np.nan + df_mixed.ix[3:5,['A', 'B', 'string', 'datetime64']] = np.nan store.append('df_mixed', df_mixed, min_itemsize = {'values': 50}) df_mixed1 = store.select('df_mixed') @@ -1150,7 +1150,7 @@ Storing Multi-Index DataFrames ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Storing multi-index dataframes as tables is very similar to -storing/selecting from homogenous index DataFrames. +storing/selecting from homogeneous index DataFrames. .. ipython:: python @@ -1173,7 +1173,7 @@ storing/selecting from homogeneous index DataFrames.
Querying a Table ~~~~~~~~~~~~~~~~ -``select`` and ``delete`` operations have an optional criteria that can +``select`` and ``delete`` operations have an optional criterion that can be specified to select/delete only a subset of the data. This allows one to have a very large on-disk table and retrieve only a portion of the data. @@ -1201,7 +1201,7 @@ terms. Queries are built up using a list of ``Terms`` (currently only **anding** of terms is supported). An example query for a panel might be specified as follows. ``['major_axis>20000102', ('minor_axis', '=', -['A','B']) ]``. This is roughly translated to: `major_axis must be +['A', 'B']) ]``. This is roughly translated to: `major_axis must be greater than the date 20000102 and the minor_axis must be A or B` .. ipython:: python @@ -1212,13 +1212,13 @@ greater than the date 20000102 and the minor_axis must be A or B` The ``columns`` keyword can be supplied to select to filter a list of the return columns; this is equivalent to passing a -``Term('columns',list_of_columns_to_filter)`` +``Term('columns', list_of_columns_to_filter)`` .. ipython:: python store.select('df', columns=['A', 'B']) -Start and Stop parameters can be specified to limit the total search +``start`` and ``stop`` parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. .. ipython:: python @@ -1251,7 +1251,7 @@ specify. This behavior can be turned off by passing ``index=False`` to i.optlevel, i.kind # change an index by passing new parameters - store.create_table_index('df', optlevel = 9, kind = 'full') + store.create_table_index('df', optlevel=9, kind='full') i = store.root.df.table.cols.index.index i.optlevel, i.kind @@ -1312,7 +1312,7 @@ very quickly. Note ``nan`` are excluded from the result set. **Replicating or** ``not`` and ``or`` conditions are unsupported at this time; however, -``or`` operations are easy to replicate, by repeately applying the +``or`` operations are easy to replicate, by repeatedly applying the criteria to the table, and then ``concat`` the results. @@ -1325,7 +1325,7 @@ criteria to the table, and then ``concat`` the results. **Storer Object** If you want to inspect the stored object, retrieve via -``get_storer``. You could use this progamatically to say get the number +``get_storer``. You could use this programmatically to, say, get the number of rows in an object. @@ -1340,10 +1340,10 @@ New in 0.10.1 are the methods ``append_to_multiple`` and ``select_as_multiple``, that can perform appending/selecting from multiple tables at once. The idea is to have one table (call it the selector table) that you index most/all of the columns, and perform your -queries. The other table(s) are data tables that are indexed the same +queries. The other table(s) are data tables that are indexed the same as the selector table. You can then perform a very fast query on the selector table, yet get lots of data back. This method works similarly to -having a very wide-table, but is more efficient in terms of queries. +having a very wide table, but is more efficient in terms of queries. Note, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means, append to the tables in the same order; ``append_to_multiple`` splits a single object to multiple tables, given a specification (as a dictionary). This dictionary is a mapping of the table names to the 'columns' you want included in that table. Pass a `None` for a single table (optional) to let it have the remaining columns.
The argument store.select('df2_mt') # as a multiple - store.select_as_multiple(['df1_mt','df2_mt'], where=['A>0', 'B>0'], + store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], selector = 'df1_mt') @@ -1386,7 +1386,7 @@ pays to have the dimension you are deleting be the first of the ``indexables``. Data is ordered (on the disk) in terms of the ``indexables``. Here's a -simple use case. You store panel type data, with dates in the +simple use case. You store panel-type data, with dates in the ``major_axis`` and ids in the ``minor_axis``. The data is then interleaved like this: