From d2a682c0789e54a5cc2e28a48fe5dd492ca26f67 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Jan 2023 09:38:17 -0800 Subject: [PATCH 1/2] PERF: Fix reference leak in read_hdf --- asv_bench/benchmarks/io/hdf.py | 17 +++++++++++++++++ doc/source/whatsnew/v2.0.0.rst | 1 + pandas/io/pytables.py | 4 +++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 12bc65f9e7bf5..e7b6a2459d6da 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -128,9 +128,26 @@ def setup(self, format): self.df["object"] = tm.makeStringIndex(N) self.df.to_hdf(self.fname, "df", format=format) + # Numeric df + self.df1 = self.df.copy() + self.df1 = self.df1.reset_index() + self.df1.to_hdf(self.fname, "df1", format=format) + def time_read_hdf(self, format): read_hdf(self.fname, "df") + def mem_read_hdf_index(self, format): + # Check to make sure that index is not a view into + # the original recarray (which prevents it from being freed) + # xref GH 37441 + # TODO: Don't abuse internals, need to fix asv + # to detect memory of ndarray views properly + df1 = read_hdf(self.fname, "df1") + return df1.index._data.base # Will be None(0 bytes) if not a view + + def peakmem_read_hdf(self, format): + read_hdf(self.fname, "df") + def time_write_hdf(self, format): self.df.to_hdf(self.fname, "df", format=format) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 1c99ba0b8e412..2e1be0623441a 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -857,6 +857,7 @@ Performance improvements - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) - Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsetes (:issue:`35296`) - Performance improvement in 
:meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`) +- Fixed a reference leak in :func:`read_hdf` (:issue:`37441`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8c0d5c3da385c..eedcf46fd063e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2057,7 +2057,9 @@ def convert( # values is a recarray if values.dtype.fields is not None: - values = values[self.cname] + # Copy, otherwise values will be a view + # preventing the original recarray from being freed + values = values[self.cname].copy() val_kind = _ensure_decoded(self.kind) values = _maybe_convert(values, val_kind, encoding, errors) From 9b5cd483023537cce612fba71762286a53fe71ca Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 13 Jan 2023 11:03:27 -0800 Subject: [PATCH 2/2] address comments --- asv_bench/benchmarks/io/hdf.py | 9 --------- pandas/tests/io/pytables/test_read.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index e7b6a2459d6da..e44a59114b30d 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -136,15 +136,6 @@ def setup(self, format): def time_read_hdf(self, format): read_hdf(self.fname, "df") - def mem_read_hdf_index(self, format): - # Check to make sure that index is not a view into - # the original recarray (which prevents it from being freed) - # xref GH 37441 - # TODO: Don't abuse internals, need to fix asv - # to detect memory of ndarray views properly - df1 = read_hdf(self.fname, "df1") - return df1.index._data.base # Will be None(0 bytes) if not a view - def peakmem_read_hdf(self, format): read_hdf(self.fname, "df") diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 6d92c15f1ea10..cb0c1ce35c7c7 100644 --- 
a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -214,6 +214,20 @@ def test_read_hdf_open_store(tmp_path, setup_path): assert store.is_open +def test_read_hdf_index_not_view(tmp_path, setup_path): + # GH 37441 + # Ensure that the index of the DataFrame is not a view + # into the original recarray that pytables reads in + df = DataFrame(np.random.rand(4, 5), index=[0, 1, 2, 3], columns=list("ABCDE")) + + path = tmp_path / setup_path + df.to_hdf(path, "df", mode="w", format="table") + + df2 = read_hdf(path, "df") + assert df2.index._data.base is None + tm.assert_frame_equal(df, df2) + + def test_read_hdf_iterator(tmp_path, setup_path): df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) df.index.name = "letters"