Skip to content

Commit 45e55af

Browse files
Stephen Pascoe, jreback
Stephen Pascoe
authored and committed
ENH: Function to walk the group hierarchy of a PyTables HDF5 file.
closes #10143
1 parent ebe480a commit 45e55af

File tree

5 files changed

+120
-1
lines changed

5 files changed

+120
-1
lines changed

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ HDFStore: PyTables (HDF5)
100100
HDFStore.select
101101
HDFStore.info
102102
HDFStore.keys
103+
HDFStore.walk
103104

104105
Feather
105106
~~~~~~~

doc/source/io.rst

+19
Original file line numberDiff line numberDiff line change
@@ -3554,6 +3554,25 @@ everything in the sub-store and **below**, so be *careful*.
35543554
store.remove('food')
35553555
store
35563556
3557+
3558+
You can walk through the group hierarchy using the ``walk`` method which
3559+
will yield a tuple for each group key along with the relative keys of its contents.
3560+
3561+
.. versionadded:: 0.24.0
3562+
3563+
3564+
.. ipython:: python
3565+
3566+
for (path, subgroups, subkeys) in store.walk():
3567+
for subgroup in subgroups:
3568+
print('GROUP: {}/{}'.format(path, subgroup))
3569+
for subkey in subkeys:
3570+
key = '/'.join([path, subkey])
3571+
print('KEY: {}'.format(key))
3572+
print(store.get(key))
3573+
3574+
3575+
35573576
.. warning::
35583577

35593578
Hierarchical keys cannot be retrieved as dotted (attribute) access as described above for items stored under the root node.

doc/source/whatsnew/v0.24.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ Other Enhancements
2323
reflect changes from the `Pandas-GBQ library version 0.5.0
2424
<https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-5-0>`__.
2525
(:issue:`21627`)
26-
26+
- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10143`, :issue:`10932`)
27+
-
2728

2829
.. _whatsnew_0240.api_breaking:
2930

pandas/io/pytables.py

+47
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,53 @@ def groups(self):
11061106
g._v_name != u('table'))))
11071107
]
11081108

1109+
def walk(self, where="/"):
    """ Iterate over the PyTables group tree, reporting pandas objects

    For every group visited, this generator produces the group's path
    together with the names of its sub-groups and the names of the pandas
    objects stored directly inside it.
    PyTables nodes that are neither groups nor pandas objects are skipped.

    Traversal is preorder: the `where` group comes first, then each of
    its child groups (following an alphanumerical order) is handled in
    the same way.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    where : str, optional
        Group at which the walk starts.
        Defaults to the root group.

    Yields
    ------
    path : str
        Full path to a group (without trailing '/')
    groups : list of str
        names of the groups contained in `path`
    leaves : list of str
        names of the pandas objects contained in `path`

    """
    _tables()
    self._check_if_open()

    def _pandas_type(node):
        # nodes written by pandas carry a ``pandas_type`` attribute
        return getattr(node._v_attrs, 'pandas_type', None)

    for group in self._handle.walk_groups(where):
        # A group that is itself a pandas object is reported as a leaf
        # of its parent below, so it is not yielded as a container.
        if _pandas_type(group) is not None:
            continue

        subgroup_names = []
        object_names = []
        for node in group._v_children.values():
            if _pandas_type(node) is not None:
                object_names.append(node._v_name)
            elif isinstance(node, _table_mod.group.Group):
                subgroup_names.append(node._v_name)
            # anything else is a non-pandas, non-group node: ignored

        # rstrip turns the root path '/' into '' so that callers can
        # build child keys with '/'.join([path, name])
        yield (group._v_pathname.rstrip('/'), subgroup_names, object_names)
1155+
11091156
def get_node(self, key):
11101157
""" return the node with the key or None if it does not exist """
11111158
self._check_if_open()

pandas/tests/io/test_pytables.py

+51
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,57 @@ def test_get(self):
635635

636636
pytest.raises(KeyError, store.get, 'b')
637637

638+
@pytest.mark.parametrize('where, expected', [
    ('/', {
        '': ({'first_group', 'second_group'}, set()),
        '/first_group': (set(), {'df1', 'df2'}),
        '/second_group': ({'third_group'}, {'df3', 's1'}),
        '/second_group/third_group': (set(), {'df4'}),
    }),
    ('/second_group', {
        '/second_group': ({'third_group'}, {'df3', 's1'}),
        '/second_group/third_group': (set(), {'df4'}),
    })
])
def test_walk(self, where, expected):
    # GH10143
    # ``expected`` maps each group path that walk() should yield to a
    # pair: (names of its sub-groups, names of its pandas objects).
    objs = {
        'df1': pd.DataFrame([1, 2, 3]),
        'df2': pd.DataFrame([4, 5, 6]),
        'df3': pd.DataFrame([6, 7, 8]),
        'df4': pd.DataFrame([9, 10, 11]),
        's1': pd.Series([10, 9, 8]),
        # Next 3 items aren't pandas objects and should be ignored
        'a1': np.array([[1, 2, 3], [4, 5, 6]]),
        'tb1': np.array([(1, 2, 3), (4, 5, 6)], dtype='i,i,i'),
        'tb2': np.array([(7, 8, 9), (10, 11, 12)], dtype='i,i,i')
    }

    # (store key, name in ``objs``) for every pandas object to write
    pandas_layout = [
        ('/first_group/df1', 'df1'),
        ('/first_group/df2', 'df2'),
        ('/second_group/df3', 'df3'),
        ('/second_group/s1', 's1'),
        ('/second_group/third_group/df4', 'df4'),
    ]

    with ensure_clean_store('walk_groups.hdf', mode='w') as store:
        for key, name in pandas_layout:
            store.put(key, objs[name])

        # Create non-pandas objects
        handle = store._handle
        handle.create_array('/first_group', 'a1', objs['a1'])
        handle.create_table('/first_group', 'tb1', obj=objs['tb1'])
        handle.create_table('/second_group', 'tb2', obj=objs['tb2'])

        # every expected group is visited, and nothing else
        assert len(list(store.walk(where=where))) == len(expected)

        for path, groups, leaves in store.walk(where=where):
            assert path in expected
            want_groups, want_leaves = expected[path]
            assert want_groups == set(groups)
            assert want_leaves == set(leaves)

            # each reported leaf must round-trip through the store
            for name in leaves:
                loaded = store.get('/'.join([path, name]))
                if 'df' in name:
                    compare = tm.assert_frame_equal
                else:
                    compare = tm.assert_series_equal
                compare(loaded, objs[name])
688+
638689
def test_getattr(self):
639690

640691
with ensure_clean_store(self.path) as store:

0 commit comments

Comments
 (0)