Skip to content

HDFStore.walk() to iterate on groups #21339

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ HDFStore: PyTables (HDF5)
HDFStore.select
HDFStore.info
HDFStore.keys
HDFStore.walk

Feather
~~~~~~~
Expand Down
19 changes: 19 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3554,6 +3554,25 @@ everything in the sub-store and **below**, so be *careful*.
store.remove('food')
store


You can walk through the group hierarchy using the ``walk`` method which
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a versionadded

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

will yield a tuple for each group key along with the relative keys of its contents.

.. versionadded:: 0.24.0


.. ipython:: python

for group_path, child_groups, object_names in store.walk():
    # groups that live directly below ``group_path``
    for child in child_groups:
        print('GROUP: {}/{}'.format(group_path, child))
    # pandas objects stored directly in ``group_path``
    for name in object_names:
        key = '/'.join([group_path, name])
        print('KEY: {}'.format(key))
        print(store.get(key))



.. warning::

Hierarchical keys cannot be retrieved via dotted (attribute) access, unlike the items stored under the root node described above.
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ Other Enhancements
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`)
- Added support for reading from Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`)
- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
-

.. _whatsnew_0240.api_breaking:

Expand Down
47 changes: 47 additions & 0 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,53 @@ def groups(self):
g._v_name != u('table'))))
]

def walk(self, where="/"):
    """ Iterate over the PyTables group hierarchy, reporting pandas objects

    For every group visited, this generator produces the group's path
    together with the names of its child groups and of the pandas objects
    it holds.  PyTables nodes that are neither groups nor pandas objects
    are left out of the results.

    Traversal is preorder: the `where` group is reported first, and then
    each of its child groups is visited in turn (in alphanumerical order)
    using the same procedure.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    where : str, optional
        Path of the group at which the walk starts.
        Defaults to the root group.

    Yields
    ------
    path : str
        Full path of the visited group, with any trailing '/' removed
        (so the root group is reported as the empty string)
    groups : list of str
        names of the child groups found in `path`
    leaves : list of str
        names of the pandas objects found in `path`

    """
    _tables()
    self._check_if_open()
    for node in self._handle.walk_groups(where):
        # A group that itself carries a ``pandas_type`` attribute is
        # reported as a leaf of its parent (see the loop below), so it
        # is not yielded as a group in its own right.
        if getattr(node._v_attrs, 'pandas_type', None) is not None:
            continue

        subgroup_names = []
        object_names = []
        for member in node._v_children.values():
            if getattr(member._v_attrs, 'pandas_type', None) is not None:
                # tagged with a pandas type -> pandas object
                object_names.append(member._v_name)
            elif isinstance(member, _table_mod.group.Group):
                # untagged group -> plain container to descend into
                subgroup_names.append(member._v_name)
            # anything else is a non-pandas, non-group node: ignored

        yield (node._v_pathname.rstrip('/'), subgroup_names, object_names)

def get_node(self, key):
""" return the node with the key or None if it does not exist """
self._check_if_open()
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/io/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,57 @@ def test_get(self):

pytest.raises(KeyError, store.get, 'b')

@pytest.mark.parametrize('where, expected', [
    ('/', {
        '': ({'first_group', 'second_group'}, set()),
        '/first_group': (set(), {'df1', 'df2'}),
        '/second_group': ({'third_group'}, {'df3', 's1'}),
        '/second_group/third_group': (set(), {'df4'}),
    }),
    ('/second_group', {
        '/second_group': ({'third_group'}, {'df3', 's1'}),
        '/second_group/third_group': (set(), {'df4'}),
    })
])
def test_walk(self, where, expected):
    # GH10143
    # ``expected`` maps every path that walk() should yield to a pair of
    # sets: (names of child groups, names of pandas objects).
    fixtures = {
        'df1': pd.DataFrame([1, 2, 3]),
        'df2': pd.DataFrame([4, 5, 6]),
        'df3': pd.DataFrame([6, 7, 8]),
        'df4': pd.DataFrame([9, 10, 11]),
        's1': pd.Series([10, 9, 8]),
        # Next 3 items aren't pandas objects and should be ignored
        'a1': np.array([[1, 2, 3], [4, 5, 6]]),
        'tb1': np.array([(1, 2, 3), (4, 5, 6)], dtype='i,i,i'),
        'tb2': np.array([(7, 8, 9), (10, 11, 12)], dtype='i,i,i')
    }
    pandas_keys = (
        '/first_group/df1',
        '/first_group/df2',
        '/second_group/df3',
        '/second_group/s1',
        '/second_group/third_group/df4',
    )

    with ensure_clean_store('walk_groups.hdf', mode='w') as store:
        # Store the pandas objects; the fixture name is the last path part
        for key in pandas_keys:
            store.put(key, fixtures[key.rsplit('/', 1)[-1]])
        # Create non-pandas objects
        handle = store._handle
        handle.create_array('/first_group', 'a1', fixtures['a1'])
        handle.create_table('/first_group', 'tb1', obj=fixtures['tb1'])
        handle.create_table('/second_group', 'tb2', obj=fixtures['tb2'])

        # walk() must yield exactly one entry per expected path
        assert len(list(store.walk(where=where))) == len(expected)
        for path, groups, leaves in store.walk(where=where):
            assert path in expected
            want_groups, want_leaves = expected[path]
            assert want_groups == set(groups)
            assert want_leaves == set(leaves)
            # Every reported leaf must round-trip through store.get()
            for leaf in leaves:
                stored = store.get('/'.join([path, leaf]))
                compare = (tm.assert_frame_equal if 'df' in leaf
                           else tm.assert_series_equal)
                compare(stored, fixtures[leaf])

def test_getattr(self):

with ensure_clean_store(self.path) as store:
Expand Down