From 10337182c5493812ec69a4f33a311fa737e02777 Mon Sep 17 00:00:00 2001 From: Antonio Valentino Date: Tue, 16 Apr 2024 08:08:29 +0200 Subject: [PATCH 1/3] Avoid unnecessary re-opening of HDF5 files --- pandas/io/pytables.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5ecf7e287ea58..3cfd740a51304 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -292,14 +292,14 @@ def to_hdf( dropna=dropna, ) - path_or_buf = stringify_path(path_or_buf) - if isinstance(path_or_buf, str): + if isinstance(path_or_buf, HDFStore): + f(path_or_buf) + else: + path_or_buf = stringify_path(path_or_buf) with HDFStore( path_or_buf, mode=mode, complevel=complevel, complib=complib ) as store: f(store) - else: - f(path_or_buf) def read_hdf( From 813753a6987a77c0ca03704a7478cd8293c18cab Mon Sep 17 00:00:00 2001 From: Antonio Valentino Date: Tue, 16 Apr 2024 09:56:57 +0200 Subject: [PATCH 2/3] Update the whatsnew file --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 17328e6084cb4..7c5c4a3bc687e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -405,7 +405,7 @@ I/O - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - +- Enhancement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . 
(:issue:`58248`) Period ^^^^^^ From eee8cf99b53629e17fafdb98171169b4d5287517 Mon Sep 17 00:00:00 2001 From: Antonio Valentino Date: Tue, 16 Apr 2024 19:23:25 +0200 Subject: [PATCH 3/3] Move the changelog entry for #58248 to the correct section --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7c5c4a3bc687e..237001df750c8 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -331,6 +331,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) +- Performance improvement in :meth:`to_hdf` by avoiding unnecessary reopenings of the HDF5 file, which speeds up adding data to files with a very large number of groups. (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`) @@ -405,7 +406,6 @@ I/O - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. 
(:issue:`57547`) -- Enhancement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) Period ^^^^^^