From 8d368fb34fe08b3375172e2a8ba35208fa3b1f9b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 3 Dec 2020 21:30:22 +0700 Subject: [PATCH 01/10] TST: add failing test for saving subclasses --- pandas/tests/io/pytables/test_store.py | 41 ++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index c20955227d05e..e2e620191193d 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -4888,6 +4888,47 @@ def test_unsuppored_hdf_file_error(self, datapath): with pytest.raises(ValueError, match=message): pd.read_hdf(data_path) + def test_supported_for_subclasses_dataframe(self): + class SubDataFrame(DataFrame): + @property + def _constructor(self): + return SubDataFrame + + data = {"a": [1, 2], "b": [3, 4]} + sdf = SubDataFrame(data, dtype=np.intp) + + expected = np.array([[1, 3], [2, 4]], dtype=np.intp) + + with ensure_clean_path("temp.h5") as path: + sdf.to_hdf(path, "df") + result = read_hdf(path, "df").values + assert np.array_equal(result, expected) + + with ensure_clean_path("temp.h5") as path: + with HDFStore(path) as store: + store.put("df", sdf) + result = read_hdf(path, "df").values + assert np.array_equal(result, expected) + + def test_supported_for_subclasses_series(self): + class SubSeries(Series): + @property + def _constructor(self): + return SubSeries + + sser = SubSeries([1, 2, 3], dtype=np.intp) + + expected = np.array([1, 2, 3], dtype=np.intp) + + with ensure_clean_path("temp.h5") as path: + sser.to_hdf(path, "ser") + + with ensure_clean_path("temp.h5") as path: + with HDFStore(path) as store: + store.put("ser", sser) + result = read_hdf(path, "ser").values + assert np.array_equal(result, expected) + @pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"]) def test_maybe_adjust_name_bad_version_raises(bad_version): From e38fae8d24a4120bc060b63b7a0feb077607246f Mon Sep 17 00:00:00 2001 From: 
Maxim Ivanov Date: Thu, 3 Dec 2020 21:31:07 +0700 Subject: [PATCH 02/10] FIX: mapping by type -> checking instance --- pandas/io/pytables.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d7ee4acc2e670..8f023d9e9fe87 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1646,8 +1646,10 @@ def error(t): "nor a value are passed" ) else: - _TYPE_MAP = {Series: "series", DataFrame: "frame"} - pt = _TYPE_MAP[type(value)] + if isinstance(value, Series): + pt = "series" + else: + pt = "frame" # we are actually a table if format == "table": From 6633145b48051bd7698cc6218bf9fece9e3d7a5a Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 3 Dec 2020 22:20:45 +0700 Subject: [PATCH 03/10] TST: use pandas testing tools --- pandas/tests/io/pytables/test_store.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index e2e620191193d..ef254657ff354 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -4902,13 +4902,13 @@ def _constructor(self): with ensure_clean_path("temp.h5") as path: sdf.to_hdf(path, "df") result = read_hdf(path, "df").values - assert np.array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) with ensure_clean_path("temp.h5") as path: with HDFStore(path) as store: store.put("df", sdf) result = read_hdf(path, "df").values - assert np.array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) def test_supported_for_subclasses_series(self): class SubSeries(Series): @@ -4922,12 +4922,14 @@ def _constructor(self): with ensure_clean_path("temp.h5") as path: sser.to_hdf(path, "ser") + result = read_hdf(path, "ser").values + tm.assert_numpy_array_equal(result, expected) with ensure_clean_path("temp.h5") as path: with HDFStore(path) as store: store.put("ser", sser) result = read_hdf(path, 
"ser").values - assert np.array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"]) From 68b5e697868f13139585a76e734d9aa81a02dbdd Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 4 Dec 2020 14:08:57 +0700 Subject: [PATCH 04/10] TST: replace with assert_frame/series_equal --- pandas/tests/io/pytables/test_store.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index ef254657ff354..98a42f395baf5 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -4897,18 +4897,18 @@ def _constructor(self): data = {"a": [1, 2], "b": [3, 4]} sdf = SubDataFrame(data, dtype=np.intp) - expected = np.array([[1, 3], [2, 4]], dtype=np.intp) + expected = DataFrame(data, dtype=np.intp) with ensure_clean_path("temp.h5") as path: sdf.to_hdf(path, "df") - result = read_hdf(path, "df").values - tm.assert_numpy_array_equal(result, expected) + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) with ensure_clean_path("temp.h5") as path: with HDFStore(path) as store: store.put("df", sdf) - result = read_hdf(path, "df").values - tm.assert_numpy_array_equal(result, expected) + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) def test_supported_for_subclasses_series(self): class SubSeries(Series): @@ -4916,20 +4916,21 @@ class SubSeries(Series): def _constructor(self): return SubSeries - sser = SubSeries([1, 2, 3], dtype=np.intp) + data = [1, 2, 3] + sser = SubSeries(data, dtype=np.intp) - expected = np.array([1, 2, 3], dtype=np.intp) + expected = Series(data, dtype=np.intp) with ensure_clean_path("temp.h5") as path: sser.to_hdf(path, "ser") - result = read_hdf(path, "ser").values - tm.assert_numpy_array_equal(result, expected) + result = read_hdf(path, "ser") + tm.assert_series_equal(result, 
expected) with ensure_clean_path("temp.h5") as path: with HDFStore(path) as store: store.put("ser", sser) - result = read_hdf(path, "ser").values - tm.assert_numpy_array_equal(result, expected) + result = read_hdf(path, "ser") + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"]) From 6088f4c04302f99f1c2796a05c4a0f0580632368 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 7 Dec 2020 12:41:58 +0700 Subject: [PATCH 05/10] TST: move to test_subclass.py --- pandas/tests/io/pytables/test_store.py | 44 ------------------ pandas/tests/io/pytables/test_subclass.py | 54 +++++++++++++++++++++++ 2 files changed, 54 insertions(+), 44 deletions(-) create mode 100644 pandas/tests/io/pytables/test_subclass.py diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 98a42f395baf5..c20955227d05e 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -4888,50 +4888,6 @@ def test_unsuppored_hdf_file_error(self, datapath): with pytest.raises(ValueError, match=message): pd.read_hdf(data_path) - def test_supported_for_subclasses_dataframe(self): - class SubDataFrame(DataFrame): - @property - def _constructor(self): - return SubDataFrame - - data = {"a": [1, 2], "b": [3, 4]} - sdf = SubDataFrame(data, dtype=np.intp) - - expected = DataFrame(data, dtype=np.intp) - - with ensure_clean_path("temp.h5") as path: - sdf.to_hdf(path, "df") - result = read_hdf(path, "df") - tm.assert_frame_equal(result, expected) - - with ensure_clean_path("temp.h5") as path: - with HDFStore(path) as store: - store.put("df", sdf) - result = read_hdf(path, "df") - tm.assert_frame_equal(result, expected) - - def test_supported_for_subclasses_series(self): - class SubSeries(Series): - @property - def _constructor(self): - return SubSeries - - data = [1, 2, 3] - sser = SubSeries(data, dtype=np.intp) - - expected = Series(data, dtype=np.intp) - - with 
ensure_clean_path("temp.h5") as path: - sser.to_hdf(path, "ser") - result = read_hdf(path, "ser") - tm.assert_series_equal(result, expected) - - with ensure_clean_path("temp.h5") as path: - with HDFStore(path) as store: - store.put("ser", sser) - result = read_hdf(path, "ser") - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"]) def test_maybe_adjust_name_bad_version_raises(bad_version): diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py new file mode 100644 index 0000000000000..96f550cd826d5 --- /dev/null +++ b/pandas/tests/io/pytables/test_subclass.py @@ -0,0 +1,54 @@ +import numpy as np + +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.tests.io.pytables.common import ensure_clean_path + +from pandas.io.pytables import HDFStore, read_hdf + + +class TestHDFStoreSubclass: + # GH 33748 + def test_supported_for_subclass_dataframe(self): + class SubDataFrame(DataFrame): + @property + def _constructor(self): + return SubDataFrame + + data = {"a": [1, 2], "b": [3, 4]} + sdf = SubDataFrame(data, dtype=np.intp) + + expected = DataFrame(data, dtype=np.intp) + + with ensure_clean_path("temp.h5") as path: + sdf.to_hdf(path, "df") + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) + + with ensure_clean_path("temp.h5") as path: + with HDFStore(path) as store: + store.put("df", sdf) + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) + + def test_supported_for_subclass_series(self): + class SubSeries(Series): + @property + def _constructor(self): + return SubSeries + + data = [1, 2, 3] + sser = SubSeries(data, dtype=np.intp) + + expected = Series(data, dtype=np.intp) + + with ensure_clean_path("temp.h5") as path: + sser.to_hdf(path, "ser") + result = read_hdf(path, "ser") + tm.assert_series_equal(result, expected) + + with ensure_clean_path("temp.h5") as path: + with HDFStore(path) as 
store: + store.put("ser", sser) + result = read_hdf(path, "ser") + tm.assert_series_equal(result, expected) From a25ad9fe94f0b4e95a664640d15ab486a7b9b234 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 8 Dec 2020 10:53:51 +0700 Subject: [PATCH 06/10] CLN: use subclassed df/series from _testing --- pandas/tests/io/pytables/test_subclass.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index 96f550cd826d5..196f729cd6eb2 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -10,13 +10,8 @@ class TestHDFStoreSubclass: # GH 33748 def test_supported_for_subclass_dataframe(self): - class SubDataFrame(DataFrame): - @property - def _constructor(self): - return SubDataFrame - data = {"a": [1, 2], "b": [3, 4]} - sdf = SubDataFrame(data, dtype=np.intp) + sdf = tm.SubclassedDataFrame(data, dtype=np.intp) expected = DataFrame(data, dtype=np.intp) @@ -32,13 +27,8 @@ def _constructor(self): tm.assert_frame_equal(result, expected) def test_supported_for_subclass_series(self): - class SubSeries(Series): - @property - def _constructor(self): - return SubSeries - data = [1, 2, 3] - sser = SubSeries(data, dtype=np.intp) + sser = tm.SubclassedSeries(data, dtype=np.intp) expected = Series(data, dtype=np.intp) From 6d8d7583641fe37305b2a2ad5fa82da2a73e2a6f Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 18 Dec 2020 15:51:28 +0700 Subject: [PATCH 07/10] DOC: add whatsnew note --- doc/source/whatsnew/v1.3.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 3545dd8a89159..d12fefbab8797 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -241,7 +241,8 @@ I/O - Bug in :func:`read_csv` not accepting ``usecols`` with different length than ``names`` for ``engine="python"`` (:issue:`16469`) - 
Bug in :func:`read_csv` raising ``TypeError`` when ``names`` and ``parse_dates`` is specified for ``engine="c"`` (:issue:`33699`) - Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`) -- +- Bug in :meth:`DataFrame.to_hdf` raising ``KeyError`` when saving + subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`) Period ^^^^^^ From 0b338a8f7049c96ce2d1a56c3457bfaf137beade Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 18 Dec 2020 17:32:21 +0700 Subject: [PATCH 08/10] DOC: add warning to DataFrame.to_hdf --- pandas/core/generic.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9b0c3caa0b407..ef3ecb55b1ae0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2568,6 +2568,11 @@ def to_hdf( DataFrame.to_feather : Write out feather-format for DataFrames. DataFrame.to_csv : Write out to a csv file. + Warnings + -------- + One can store a subclass of ``DataFrame`` or ``Series`` to HDF5, + but the type of the subclass is lost upon storing. + Examples -------- >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, From 41605c5a3604a786a2adbbe5e60d2ffc9296dc52 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 18 Dec 2020 17:33:34 +0700 Subject: [PATCH 09/10] DOC: add warning to io.rst, HDF section --- doc/source/reference/io.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst index 0037d4a4410c3..e755ce94812bb 100644 --- a/doc/source/reference/io.rst +++ b/doc/source/reference/io.rst @@ -83,6 +83,11 @@ HDFStore: PyTables (HDF5) HDFStore.groups HDFStore.walk +.. warning:: + + One can store a subclass of ``DataFrame`` or ``Series`` to HDF5, + but the type of the subclass is lost upon storing. + Feather ~~~~~~~ .. 
autosummary:: From 99720b8d86a26d9383cb44467835a122c5eb253d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 18 Dec 2020 18:25:11 +0700 Subject: [PATCH 10/10] DOC: use rst warning instead of numpy-like --- pandas/core/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ef3ecb55b1ae0..fbf502ffa280c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2505,6 +2505,11 @@ def to_hdf( In order to add another DataFrame or Series to an existing HDF file please use append mode and a different a key. + .. warning:: + + One can store a subclass of ``DataFrame`` or ``Series`` to HDF5, + but the type of the subclass is lost upon storing. + For more information see the :ref:`user guide `. Parameters @@ -2568,11 +2573,6 @@ def to_hdf( DataFrame.to_feather : Write out feather-format for DataFrames. DataFrame.to_csv : Write out to a csv file. - Warnings - -------- - One can store a subclass of ``DataFrame`` or ``Series`` to HDF5, - but the type of the subclass is lost upon storing. - Examples -------- >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},