diff --git a/doc/source/api.rst b/doc/source/api.rst index f2c00d5d12031..8dc5d0e9fc023 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -100,6 +100,7 @@ HDFStore: PyTables (HDF5) HDFStore.select HDFStore.info HDFStore.keys + HDFStore.walk Feather ~~~~~~~ diff --git a/doc/source/io.rst b/doc/source/io.rst index ae6c4f12f04f7..cf845c176b4c7 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3554,6 +3554,25 @@ everything in the sub-store and **below**, so be *careful*. store.remove('food') store + +You can walk through the group hierarchy using the ``walk`` method, which +yields a tuple for each group key along with the relative keys of its contents. + +.. versionadded:: 0.24.0 + + +.. ipython:: python + + for (path, subgroups, subkeys) in store.walk(): + for subgroup in subgroups: + print('GROUP: {}/{}'.format(path, subgroup)) + for subkey in subkeys: + key = '/'.join([path, subkey]) + print('KEY: {}'.format(key)) + print(store.get(key)) + + + .. warning:: Hierarchical keys cannot be retrieved as dotted (attribute) access as described above for items stored under the root node. diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 72e7373d0dd33..b79d886757f99 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -19,6 +19,8 @@ Other Enhancements - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`) - :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) - Added support for reading from Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`) +- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) +- ..
def walk(self, where="/"):
    """
    Iterate over the PyTables group hierarchy, reporting pandas objects.

    For every plain group reachable from `where`, yield its path together
    with the names of its direct sub-groups and of the pandas objects it
    holds. Children that are neither a group nor tagged as a pandas object
    are left out of both lists. Nodes that are themselves tagged as pandas
    objects are never yielded as a path.

    Traversal is delegated to PyTables' ``walk_groups``: the `where` group
    comes first (preorder), followed by its child groups in alphanumerical
    order, each handled the same way.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    where : str, optional
        Group at which the walk starts. Defaults to the root group.

    Yields
    ------
    path : str
        Full path of the group, without a trailing '/'.
    groups : list of str
        Names of the groups directly contained in `path`.
    leaves : list of str
        Names of the pandas objects directly contained in `path`.
    """
    _tables()
    self._check_if_open()
    for group in self._handle.walk_groups(where):
        # A node carrying ``pandas_type`` is a stored pandas object, not a
        # container, so it is never reported as a path of its own.
        if getattr(group._v_attrs, 'pandas_type', None) is not None:
            continue

        subgroups, objects = [], []
        for node in group._v_children.values():
            if getattr(node._v_attrs, 'pandas_type', None) is not None:
                objects.append(node._v_name)
            elif isinstance(node, _table_mod.group.Group):
                subgroups.append(node._v_name)
            # anything else is a non-pandas, non-group node: ignored

        yield (group._v_pathname.rstrip('/'), subgroups, objects)
@pytest.mark.parametrize('where, expected', [
    ('/', {
        '': ({'first_group', 'second_group'}, set()),
        '/first_group': (set(), {'df1', 'df2'}),
        '/second_group': ({'third_group'}, {'df3', 's1'}),
        '/second_group/third_group': (set(), {'df4'}),
    }),
    ('/second_group', {
        '/second_group': ({'third_group'}, {'df3', 's1'}),
        '/second_group/third_group': (set(), {'df4'}),
    })
])
def test_walk(self, where, expected):
    """
    Check that ``store.walk`` reports the expected groups and pandas leaves.

    `expected` maps each group path to a pair
    ``(set of sub-group names, set of pandas object names)``.
    """
    # GH10143
    objs = {
        'df1': pd.DataFrame([1, 2, 3]),
        'df2': pd.DataFrame([4, 5, 6]),
        'df3': pd.DataFrame([6, 7, 8]),
        'df4': pd.DataFrame([9, 10, 11]),
        's1': pd.Series([10, 9, 8]),
        # The remaining three are not pandas objects; walk() must skip them
        'a1': np.array([[1, 2, 3], [4, 5, 6]]),
        'tb1': np.array([(1, 2, 3), (4, 5, 6)], dtype='i,i,i'),
        'tb2': np.array([(7, 8, 9), (10, 11, 12)], dtype='i,i,i')
    }

    with ensure_clean_store('walk_groups.hdf', mode='w') as store:
        # pandas objects, stored under a two-level group hierarchy
        store.put('/first_group/df1', objs['df1'])
        store.put('/first_group/df2', objs['df2'])
        store.put('/second_group/df3', objs['df3'])
        store.put('/second_group/s1', objs['s1'])
        store.put('/second_group/third_group/df4', objs['df4'])

        # raw PyTables nodes written directly through the file handle
        handle = store._handle
        handle.create_array('/first_group', 'a1', objs['a1'])
        handle.create_table('/first_group', 'tb1', obj=objs['tb1'])
        handle.create_table('/second_group', 'tb2', obj=objs['tb2'])

        visited = list(store.walk(where=where))
        assert len(visited) == len(expected)

        for path, groups, leaves in store.walk(where=where):
            assert path in expected
            expected_groups, expected_frames = expected[path]
            assert expected_groups == set(groups)
            assert expected_frames == set(leaves)

            # every reported leaf must round-trip to the object stored
            for leaf in leaves:
                obj = store.get('/'.join([path, leaf]))
                if 'df' in leaf:
                    compare = tm.assert_frame_equal
                else:
                    compare = tm.assert_series_equal
                compare(obj, objs[leaf])