From b7450c786ec166a9ca673126f2c44f66dd89ab11 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 09:26:44 +0100 Subject: [PATCH 01/28] added a conftest.py to pytables subdir --- pandas/tests/io/pytables/conftest.py | 17 +++++++++++++++++ pandas/tests/io/pytables/test_pytables.py | 13 ------------- 2 files changed, 17 insertions(+), 13 deletions(-) create mode 100644 pandas/tests/io/pytables/conftest.py diff --git a/pandas/tests/io/pytables/conftest.py b/pandas/tests/io/pytables/conftest.py new file mode 100644 index 0000000000000..365fc23a402fd --- /dev/null +++ b/pandas/tests/io/pytables/conftest.py @@ -0,0 +1,17 @@ +import pytest + +import pandas.util.testing as tm + + +@pytest.fixture +def setup_path(): + """Fixture returning a unique temporary HDF5 file path""" + return "tmp.__{}__.h5".format(tm.rands(10)) + + +@pytest.fixture(scope="class", autouse=True) +def setup_mode(): + """Reset testing mode fixture""" + tm.reset_testing_mode() + yield + tm.set_testing_mode() diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 46d8ef04dd8e5..4d4b7bcf2af75 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -51,19 +51,6 @@ tables = pytest.importorskip("tables") -@pytest.fixture -def setup_path(): - """Fixture for setup path""" - return "tmp.__{}__.h5".format(tm.rands(10)) - - -@pytest.fixture(scope="class", autouse=True) -def setup_mode(): - tm.reset_testing_mode() - yield - tm.set_testing_mode() - - # TODO: # remove when gh-24839 is fixed; this affects numpy 1.16 # and pytables 3.4.4 From 480184dfbccac08881c5ad4cd037735297cf72fa Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 09:49:40 +0100 Subject: [PATCH 02/28] added common.py to hold the helper functions which all 3 test classes share --- pandas/tests/io/pytables/common.py | 73 ++++++++++++++++++++++ pandas/tests/io/pytables/test_timezones.py | 0 2 files changed, 73 insertions(+) create mode 100644 pandas/tests/io/pytables/common.py create mode 100644 pandas/tests/io/pytables/test_timezones.py diff --git a/pandas/tests/io/pytables/common.py b/pandas/tests/io/pytables/common.py new file mode 100644 index 0000000000000..1a1fba1e7de49 --- /dev/null +++ b/pandas/tests/io/pytables/common.py @@ -0,0 +1,73 @@ +from contextlib import contextmanager +import os +import tempfile + +from pandas.io.pytables import HDFStore + + +def safe_remove(path): + if path is not None: + try: + os.remove(path) + except OSError: + pass + + +def safe_close(store): + try: + if store is not None: + store.close() + except IOError: + pass + + +def create_tempfile(path): + """ create an unopened named temporary file """ + return os.path.join(tempfile.gettempdir(), path) + + +@contextmanager +def ensure_clean_store(path, mode="a", complevel=None, complib=None, fletcher32=False): + + try: + + # put in the temporary path if we don't have one already + if not len(os.path.dirname(path)): + path = create_tempfile(path) + + store = HDFStore( + path, mode=mode, complevel=complevel, complib=complib, fletcher32=fletcher32 + ) + yield store + finally: + safe_close(store) + if mode == "w" or mode == "a": + safe_remove(path) + + +@contextmanager +def ensure_clean_path(path): + """ + return essentially a named temporary file that is not opened + and deleted on exiting; if path is a list, then create and + return list of filenames + """ + try: + if isinstance(path, list): + filenames = [create_tempfile(p) for p in path] + yield filenames + else: + filenames = [create_tempfile(path)] +
yield filenames[0] + finally: + for f in filenames: + safe_remove(f) + + +def _maybe_remove(store, key): + """For tests using tables, try removing the table to be sure there is + no content from previous tests using the same table name.""" + try: + store.remove(key) + except (ValueError, KeyError): + pass diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 31d68369e04b5e8c06c5f75049b9eb58b7acf931 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 10:01:43 +0100 Subject: [PATCH 03/28] moved TestTimezones class to test_timezones.py and relevant imports --- pandas/tests/io/pytables/test_timezones.py | 389 +++++++++++++++++++++ 1 file changed, 389 insertions(+) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index e69de29bb2d1d..2b9489c4de8cd 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -0,0 +1,389 @@ +import datetime + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Series, + Timestamp, + date_range, +) +import pandas.util.testing as tm +from pandas.util.testing import assert_frame_equal, set_timezone +from pandas.tests.io.pytables.common import (ensure_clean_path, + ensure_clean_store, + _maybe_remove) + +tables = pytest.importorskip("tables") + + +class TestTimezones: + def _compare_with_tz(self, a, b): + tm.assert_frame_equal(a, b) + + # compare the zones on each element + for c in a.columns: + for i in a.index: + a_e = a.loc[i, c] + b_e = b.loc[i, c] + if not (a_e == b_e and a_e.tz == b_e.tz): + raise AssertionError( + "invalid tz comparison [{a_e}] [{b_e}]".format(a_e=a_e, b_e=b_e) + ) + + def test_append_with_timezones_dateutil(self, setup_path): + + from datetime import timedelta + + # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows + # filename issues. + from pandas._libs.tslibs.timezones import maybe_get_tz + + gettz = lambda x: maybe_get_tz("dateutil/" + x) + + # as columns + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=[ + Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) + + timedelta(hours=1) * i + for i in range(5) + ] + ) + ) + + store.append("df_tz", df, data_columns=["A"]) + result = store["df_tz"] + self._compare_with_tz(result, df) + assert_frame_equal(result, df) + + # select with tz aware + expected = df[df.A >= df.A[3]] + result = store.select("df_tz", where="A>=df.A[3]") + self._compare_with_tz(result, expected) + + # ensure we include dates in DST and STD time here.
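+ # (editor's note: in US/Eastern, 20130102 falls in standard time and + # 20130603 in daylight saving time, so the frame built below round-trips + # both UTC offsets)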
+ _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130603", tz=gettz("US/Eastern")), + ), + index=range(5), + ) + store.append("df_tz", df) + result = store["df_tz"] + self._compare_with_tz(result, df) + assert_frame_equal(result, df) + + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130102", tz=gettz("EET")), + ), + index=range(5), + ) + with pytest.raises(ValueError): + store.append("df_tz", df) + + # this is ok + _maybe_remove(store, "df_tz") + store.append("df_tz", df, data_columns=["A", "B"]) + result = store["df_tz"] + self._compare_with_tz(result, df) + assert_frame_equal(result, df) + + # can't append with diff timezone + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130102", tz=gettz("CET")), + ), + index=range(5), + ) + with pytest.raises(ValueError): + store.append("df_tz", df) + + # as index + with ensure_clean_store(setup_path) as store: + + # GH 4098 example + df = DataFrame( + dict( + A=Series( + range(3), + index=date_range( + "2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern") + ), + ) + ) + ) + + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") + assert_frame_equal(result, df) + + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") + assert_frame_equal(result, df) + + def test_append_with_timezones_pytz(self, setup_path): + + from datetime import timedelta + + # as columns + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=[ + Timestamp("20130102 2:00:00", tz="US/Eastern") + + timedelta(hours=1) * i + for i in range(5) + ] + ) + ) + store.append("df_tz", df, data_columns=["A"]) + result = store["df_tz"] + self._compare_with_tz(result, df) + assert_frame_equal(result, df) + + # select with tz aware + self._compare_with_tz( + store.select("df_tz", where="A>=df.A[3]"), df[df.A >= df.A[3]] + ) + + _maybe_remove(store, "df_tz") + # ensure we include dates in DST and STD time here. 
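+ # (editor's note: this mirrors test_append_with_timezones_dateutil above, + # exercising the same append/select paths with pytz zone names instead of + # dateutil ones)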
+ df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130603", tz="US/Eastern"), + ), + index=range(5), + ) + store.append("df_tz", df) + result = store["df_tz"] + self._compare_with_tz(result, df) + assert_frame_equal(result, df) + + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130102", tz="EET"), + ), + index=range(5), + ) + with pytest.raises(ValueError): + store.append("df_tz", df) + + # this is ok + _maybe_remove(store, "df_tz") + store.append("df_tz", df, data_columns=["A", "B"]) + result = store["df_tz"] + self._compare_with_tz(result, df) + assert_frame_equal(result, df) + + # can't append with diff timezone + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130102", tz="CET"), + ), + index=range(5), + ) + with pytest.raises(ValueError): + store.append("df_tz", df) + + # as index + with ensure_clean_store(setup_path) as store: + + # GH 4098 example + df = DataFrame( + dict( + A=Series( + range(3), + index=date_range( + "2000-1-1", periods=3, freq="H", tz="US/Eastern" + ), + ) + ) + ) + + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") + assert_frame_equal(result, df) + + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") + assert_frame_equal(result, df) + + def test_tseries_select_index_column(self, setup_path): + # GH7777 + # selecting a UTC datetimeindex column did + # not preserve UTC tzinfo set before storing + + # check that no tz still works + rng = date_range("1/1/2000", "1/30/2000") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == DatetimeIndex(result.values).tz + + # check utc + rng = date_range("1/1/2000", "1/30/2000", tz="UTC") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == result.dt.tz + + # double check non-utc + rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == result.dt.tz + + def test_timezones_fixed(self, setup_path): + with ensure_clean_store(setup_path) as store: + + # index + rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + store["df"] = df + result = store["df"] + assert_frame_equal(result, df) + + # as data + # GH11411 + _maybe_remove(store, "df") + df = DataFrame( + { + "A": rng, + "B": rng.tz_convert("UTC").tz_localize(None), + "C": rng.tz_convert("CET"), + "D": range(len(rng)), + }, + index=rng, + ) + store["df"] = df + result = store["df"] + assert_frame_equal(result, df) + + def test_fixed_offset_tz(self, setup_path): + rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store["frame"] = frame + recons = store["frame"] + tm.assert_index_equal(recons.index, rng) + assert rng.tz == recons.index.tz + + @td.skip_if_windows + def test_store_timezone(self, setup_path): + # GH2852 + # issue storing datetime.date with a timezone as it resets when read + # back in 
a new timezone + + # original method + with ensure_clean_store(setup_path) as store: + + today = datetime.date(2013, 9, 10) + df = DataFrame([1, 2, 3], index=[today, today, today]) + store["obj1"] = df + result = store["obj1"] + assert_frame_equal(result, df) + + # with tz setting + with ensure_clean_store(setup_path) as store: + + with set_timezone("EST5EDT"): + today = datetime.date(2013, 9, 10) + df = DataFrame([1, 2, 3], index=[today, today, today]) + store["obj1"] = df + + with set_timezone("CST6CDT"): + result = store["obj1"] + + assert_frame_equal(result, df) + + def test_legacy_datetimetz_object(self, datapath, setup_path): + # legacy from < 0.17.0 + # 8260 + expected = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130603", tz="CET"), + ), + index=range(5), + ) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r" + ) as store: + result = store["df"] + assert_frame_equal(result, expected) + + def test_dst_transitions(self, setup_path): + # make sure we are not failing on transitions + with ensure_clean_store(setup_path) as store: + times = pd.date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="H", + ambiguous="infer", + ) + + for i in [times, times + pd.Timedelta("10min")]: + _maybe_remove(store, "df") + df = DataFrame({"A": range(len(i)), "B": i}, index=i) + store.append("df", df) + result = store.select("df") + assert_frame_equal(result, df) + + def test_read_with_where_tz_aware_index(self, setup_path): + # GH 11926 + periods = 10 + dts = pd.date_range("20151201", periods=periods, freq="D", tz="UTC") + mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"]) + expected = pd.DataFrame({"MYCOL": 0}, index=mi) + + key = "mykey" + with ensure_clean_path(setup_path) as path: + with pd.HDFStore(path) as store: + store.append(key, expected, format="table", append=True) + result = pd.read_hdf(path, key, where="DATE > 20151130") + assert_frame_equal(result, expected) + + def test_py2_created_with_datetimez(self, datapath, setup_path): + # The test HDF5 file was created in Python 2, but could not be read in + # Python 3. 
+ # + # GH26443 + index = [pd.Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] + expected = DataFrame({"data": 123}, index=index) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" + ) as store: + result = store["key"] + assert_frame_equal(result, expected) From e3242f49a2322b0c3173264a1d1ba4e78a59c8af Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 10:09:05 +0100 Subject: [PATCH 04/28] moved TestHDFComplexValues class to test_hdf_complex_values.py and relevant imports --- .../io/pytables/test_hdf_complex_values.py | 196 ++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 pandas/tests/io/pytables/test_hdf_complex_values.py diff --git a/pandas/tests/io/pytables/test_hdf_complex_values.py b/pandas/tests/io/pytables/test_hdf_complex_values.py new file mode 100644 index 0000000000000..23b30b11407f8 --- /dev/null +++ b/pandas/tests/io/pytables/test_hdf_complex_values.py @@ -0,0 +1,196 @@ +from warnings import catch_warnings + +import numpy as np +import pytest +from distutils.version import LooseVersion + +import pandas as pd +from pandas import ( + DataFrame, + Series, +) +import pandas.util.testing as tm +from pandas.util.testing import assert_frame_equal + +from pandas.io.pytables import read_hdf +from pandas.tests.io.pytables.common import (ensure_clean_path, + ensure_clean_store) + +tables = pytest.importorskip("tables") + +# TODO: +# remove when gh-24839 is fixed; this affects numpy 1.16 +# and pytables 3.4.4 +xfail_non_writeable = pytest.mark.xfail( + LooseVersion(np.__version__) >= LooseVersion("1.16") + and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), + reason=( + "gh-25511, gh-24839. pytables needs a " + "release beyond 3.4.4 to support numpy 1.16.x" + ), +) + + +class TestHDFComplexValues: + # GH10447 + + def test_complex_fixed(self, setup_path): + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + + def test_complex_table(self, setup_path): + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", mode="w") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + + @xfail_non_writeable + def test_complex_mixed_fixed(self, setup_path): + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread)
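+ + # (editor's sketch, not part of the original change: the fixed-format + # round-trip above can be reproduced standalone, subject to the same + # numpy/pytables caveat as the xfail marker; the file name here is + # illustrative) + # + # df = DataFrame(np.ones((2, 2), dtype=np.complex128)) + # df.to_hdf("tmp.h5", "df") # fixed format is the default for to_hdf + # tm.assert_frame_equal(df, read_hdf("tmp.h5", "df"))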
+ + def test_complex_mixed_table(self, setup_path): + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["A", "B"]) + result = store.select("df", where="A>2") + assert_frame_equal(df.loc[df.A > 2], result) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + + def test_complex_across_dimensions_fixed(self, setup_path): + with catch_warnings(record=True): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) + + objs = [s, df] + comps = [tm.assert_series_equal, tm.assert_frame_equal] + for obj, comp in zip(objs, comps): + with ensure_clean_path(setup_path) as path: + obj.to_hdf(path, "obj", format="fixed") + reread = read_hdf(path, "obj") + comp(obj, reread) + + def test_complex_across_dimensions(self, setup_path): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) + + with catch_warnings(record=True): + + objs = [df] + comps = [tm.assert_frame_equal] + for obj, comp in zip(objs, comps): + with ensure_clean_path(setup_path) as path: + obj.to_hdf(path, "obj", format="table") + reread = read_hdf(path, "obj") + comp(obj, reread) + + def test_complex_indexing_error(self, setup_path): + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + {"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128}, + index=list("abcd"), + ) + with ensure_clean_store(setup_path) as store: + with pytest.raises(TypeError): + store.append("df", df, data_columns=["C"]) + + def test_complex_series_error(self, setup_path): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + + with ensure_clean_path(setup_path) as path: + with pytest.raises(TypeError): + s.to_hdf(path, "obj", format="t") + + with ensure_clean_path(setup_path) as path: + s.to_hdf(path, "obj", format="t", index=False) + reread = read_hdf(path, "obj") + tm.assert_series_equal(s, reread) + + def test_complex_append(self, setup_path): + df = DataFrame( + {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)} + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["b"]) + store.append("df", df) + result = store.select("df") + assert_frame_equal(pd.concat([df, df], 0), result) From 1985c96992ff443ee86238245705fd51c9faf0dd Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 10:21:52 +0100 Subject: [PATCH 05/28] updated common.py, updated imports in test_hdf_complex_values and created test_hdf_store.py --- pandas/tests/io/pytables/common.py | 23 +++++++++++++++++++ .../io/pytables/test_hdf_complex_values.py | 17 ++------------ pandas/tests/io/pytables/test_hdf_store.py | 0 3 files changed, 25 insertions(+), 15 deletions(-) create mode 100644 pandas/tests/io/pytables/test_hdf_store.py diff --git a/pandas/tests/io/pytables/common.py 
b/pandas/tests/io/pytables/common.py index 1a1fba1e7de49..2a327ce5ad1d6 100644 --- a/pandas/tests/io/pytables/common.py +++ b/pandas/tests/io/pytables/common.py @@ -1,10 +1,32 @@ from contextlib import contextmanager import os +import pytest +import numpy as np +from distutils.version import LooseVersion import tempfile from pandas.io.pytables import HDFStore +# TODO: +# remove when gh-24839 is fixed; this affects numpy 1.16 +# and pytables 3.4.4 +tables = pytest.importorskip("tables") +xfail_non_writeable = pytest.mark.xfail( + LooseVersion(np.__version__) >= LooseVersion("1.16") + and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), + reason=( + "gh-25511, gh-24839. pytables needs a " + "release beyond 3.4.4 to support numpy 1.16.x" + ), +) + +# set these parameters so we don't have file sharing +tables.parameters.MAX_NUMEXPR_THREADS = 1 +tables.parameters.MAX_BLOSC_THREADS = 1 +tables.parameters.MAX_THREADS = 1 + + def safe_remove(path): if path is not None: try: @@ -26,6 +48,7 @@ def create_tempfile(path): return os.path.join(tempfile.gettempdir(), path) +# contextmanager to ensure the file cleanup @contextmanager def ensure_clean_store(path, mode="a", complevel=None, complib=None, fletcher32=False): diff --git a/pandas/tests/io/pytables/test_hdf_complex_values.py b/pandas/tests/io/pytables/test_hdf_complex_values.py index 23b30b11407f8..b871df7d0a211 100644 --- a/pandas/tests/io/pytables/test_hdf_complex_values.py +++ b/pandas/tests/io/pytables/test_hdf_complex_values.py @@ -13,23 +13,10 @@ from pandas.util.testing import assert_frame_equal from pandas.io.pytables import read_hdf -from pandas.tests.io.pytables.common import (ensure_clean_path, +from pandas.tests.io.pytables.common import (xfail_non_writeable, + ensure_clean_path, ensure_clean_store) -tables = pytest.importorskip("tables") - -# TODO: -# remove when gh-24839 is fixed; this affects numpy 1.16 -# and pytables 3.4.4 -xfail_non_writeable = pytest.mark.xfail( - LooseVersion(np.__version__) >= LooseVersion("1.16") - and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), - reason=( - "gh-25511, gh-24839.
pytables needs a " - "release beyond 3.4.4 to support numpy 1.16.x" - ), -) - diff --git a/pandas/tests/io/pytables/test_hdf_store.py b/pandas/tests/io/pytables/test_hdf_store.py new file mode 100644 index 0000000000000..e69de29bb2d1d From e93f3df55bc84c8ec623a1056ff781b7e2397545 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 10:23:07 +0100 Subject: [PATCH 06/28] removed tables variable as it is not required for the TestTimezones class --- pandas/tests/io/pytables/test_timezones.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 2b9489c4de8cd..6d7b73ee6e803 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -19,8 +19,6 @@ ensure_clean_store, _maybe_remove) -tables = pytest.importorskip("tables") - class TestTimezones: def _compare_with_tz(self, a, b): From 80d6aaf45b10aab9026a3aab249229185a2dace6 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 10:24:08 +0100 Subject: [PATCH 07/28] cleaned up unused imports --- pandas/tests/io/pytables/test_hdf_complex_values.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/pytables/test_hdf_complex_values.py b/pandas/tests/io/pytables/test_hdf_complex_values.py index b871df7d0a211..65587b8b8b993 100644 --- a/pandas/tests/io/pytables/test_hdf_complex_values.py +++ b/pandas/tests/io/pytables/test_hdf_complex_values.py @@ -2,7 +2,6 @@ import numpy as np import pytest -from distutils.version import LooseVersion import pandas as pd from pandas import ( From 7f0842b791494935d36aa8d9476ffa31db2c8fdb Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 10:33:49 +0100 Subject: [PATCH 08/28] updated import for ensure_clean_path to reference common.py instead --- pandas/tests/io/pytables/test_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py index f5f73beab6d60..fe8d8c56a4e82 100644 --- a/pandas/tests/io/pytables/test_compat.py +++ b/pandas/tests/io/pytables/test_compat.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -from pandas.tests.io.pytables.test_pytables import ensure_clean_path +from pandas.tests.io.pytables.common import ensure_clean_path from pandas.util.testing import assert_frame_equal tables = pytest.importorskip("tables") From 882ba8eb273219e9ef4016bcbed3d051557e045b Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 10:35:07 +0100 Subject: [PATCH 09/28] deleted test_pytables.py after splitting its contents across 4 python files --- pandas/tests/io/pytables/test_pytables.py | 5377 --------------------- 1 file changed, 5377 deletions(-) delete mode 100644 pandas/tests/io/pytables/test_pytables.py diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py deleted file mode 100644 index 4d4b7bcf2af75..0000000000000 --- a/pandas/tests/io/pytables/test_pytables.py +++ /dev/null @@ -1,5377 +0,0 @@ -from contextlib import contextmanager -import datetime -from datetime import timedelta -from distutils.version import LooseVersion -from io import BytesIO -import os -import re -import tempfile -from warnings import catch_warnings, simplefilter - -import numpy as np -import pytest - -from pandas.compat import PY36, is_platform_little_endian, is_platform_windows -import pandas.util._test_decorators as td - -from pandas.core.dtypes.common import is_categorical_dtype -
-import pandas as pd -from pandas import ( - Categorical, - CategoricalIndex, - DataFrame, - DatetimeIndex, - Index, - Int64Index, - MultiIndex, - RangeIndex, - Series, - Timestamp, - bdate_range, - concat, - date_range, - isna, - timedelta_range, -) -import pandas.util.testing as tm -from pandas.util.testing import assert_frame_equal, assert_series_equal, set_timezone - -from pandas.io.pytables import ( - ClosedFileError, - HDFStore, - PossibleDataLossError, - Term, - read_hdf, -) - -from pandas.io import pytables as pytables # noqa: E402 isort:skip -from pandas.io.pytables import TableIterator # noqa: E402 isort:skip - -tables = pytest.importorskip("tables") - - -# TODO: -# remove when gh-24839 is fixed; this affects numpy 1.16 -# and pytables 3.4.4 -xfail_non_writeable = pytest.mark.xfail( - LooseVersion(np.__version__) >= LooseVersion("1.16") - and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), - reason=( - "gh-25511, gh-24839. pytables needs a " - "release beyong 3.4.4 to support numpy 1.16x" - ), -) - - -_default_compressor = "blosc" - - -ignore_natural_naming_warning = pytest.mark.filterwarnings( - "ignore:object name:tables.exceptions.NaturalNameWarning" -) - -# contextmanager to ensure the file cleanup - - -def safe_remove(path): - if path is not None: - try: - os.remove(path) - except OSError: - pass - - -def safe_close(store): - try: - if store is not None: - store.close() - except IOError: - pass - - -def create_tempfile(path): - """ create an unopened named temporary file """ - return os.path.join(tempfile.gettempdir(), path) - - -@contextmanager -def ensure_clean_store(path, mode="a", complevel=None, complib=None, fletcher32=False): - - try: - - # put in the temporary path if we don't have one already - if not len(os.path.dirname(path)): - path = create_tempfile(path) - - store = HDFStore( - path, mode=mode, complevel=complevel, complib=complib, fletcher32=False - ) - yield store - finally: - safe_close(store) - if mode == "w" or mode == "a": - safe_remove(path) - - -@contextmanager -def ensure_clean_path(path): - """ - return essentially a named temporary file that is not opened - and deleted on exiting; if path is a list, then create and - return list of filenames - """ - try: - if isinstance(path, list): - filenames = [create_tempfile(p) for p in path] - yield filenames - else: - filenames = [create_tempfile(path)] - yield filenames[0] - finally: - for f in filenames: - safe_remove(f) - - -# set these parameters so we don't have file sharing -tables.parameters.MAX_NUMEXPR_THREADS = 1 -tables.parameters.MAX_BLOSC_THREADS = 1 -tables.parameters.MAX_THREADS = 1 - - -def _maybe_remove(store, key): - """For tests using tables, try removing the table to be sure there is - no content from previous tests using the same table name.""" - try: - store.remove(key) - except (ValueError, KeyError): - pass - - -@pytest.mark.single -class TestHDFStore: - def test_format_kwarg_in_constructor(self, setup_path): - # GH 13291 - with ensure_clean_path(setup_path) as path: - with pytest.raises(ValueError): - HDFStore(path, format="table") - - def test_context(self, setup_path): - path = create_tempfile(setup_path) - try: - with HDFStore(path) as tbl: - raise ValueError("blah") - except ValueError: - pass - finally: - safe_remove(path) - - try: - with HDFStore(path) as tbl: - tbl["a"] = tm.makeDataFrame() - - with HDFStore(path) as tbl: - assert len(tbl) == 1 - assert type(tbl["a"]) == DataFrame - finally: - safe_remove(path) - - def test_conv_read_write(self, setup_path): - path = 
create_tempfile(setup_path) - try: - - def roundtrip(key, obj, **kwargs): - obj.to_hdf(path, key, **kwargs) - return read_hdf(path, key) - - o = tm.makeTimeSeries() - assert_series_equal(o, roundtrip("series", o)) - - o = tm.makeStringSeries() - assert_series_equal(o, roundtrip("string_series", o)) - - o = tm.makeDataFrame() - assert_frame_equal(o, roundtrip("frame", o)) - - # table - df = DataFrame(dict(A=range(5), B=range(5))) - df.to_hdf(path, "table", append=True) - result = read_hdf(path, "table", where=["index>2"]) - assert_frame_equal(df[df.index > 2], result) - - finally: - safe_remove(path) - - def test_long_strings(self, setup_path): - - # GH6166 - df = DataFrame( - {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) - ) - - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=["a"]) - - result = store.select("df") - assert_frame_equal(df, result) - - def test_api(self, setup_path): - - # GH4584 - # API issue when to_hdf doesn't accept append AND format args - with ensure_clean_path(setup_path) as path: - - df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, "df", append=True, format="table") - df.iloc[10:].to_hdf(path, "df", append=True, format="table") - assert_frame_equal(read_hdf(path, "df"), df) - - # append to False - df.iloc[:10].to_hdf(path, "df", append=False, format="table") - df.iloc[10:].to_hdf(path, "df", append=True, format="table") - assert_frame_equal(read_hdf(path, "df"), df) - - with ensure_clean_path(setup_path) as path: - - df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, "df", append=True) - df.iloc[10:].to_hdf(path, "df", append=True, format="table") - assert_frame_equal(read_hdf(path, "df"), df) - - # append to False - df.iloc[:10].to_hdf(path, "df", append=False, format="table") - df.iloc[10:].to_hdf(path, "df", append=True) - assert_frame_equal(read_hdf(path, "df"), df) - - with ensure_clean_path(setup_path) as path: - - df = tm.makeDataFrame() - df.to_hdf(path, "df", append=False, format="fixed") - assert_frame_equal(read_hdf(path, "df"), df) - - df.to_hdf(path, "df", append=False, format="f") - assert_frame_equal(read_hdf(path, "df"), df) - - df.to_hdf(path, "df", append=False) - assert_frame_equal(read_hdf(path, "df"), df) - - df.to_hdf(path, "df") - assert_frame_equal(read_hdf(path, "df"), df) - - with ensure_clean_store(setup_path) as store: - - path = store._path - df = tm.makeDataFrame() - - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=True, format="table") - store.append("df", df.iloc[10:], append=True, format="table") - assert_frame_equal(store.select("df"), df) - - # append to False - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=False, format="table") - store.append("df", df.iloc[10:], append=True, format="table") - assert_frame_equal(store.select("df"), df) - - # formats - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=False, format="table") - store.append("df", df.iloc[10:], append=True, format="table") - assert_frame_equal(store.select("df"), df) - - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=False, format="table") - store.append("df", df.iloc[10:], append=True, format=None) - assert_frame_equal(store.select("df"), df) - - with ensure_clean_path(setup_path) as path: - # Invalid. 
- df = tm.makeDataFrame() - - with pytest.raises(ValueError): - df.to_hdf(path, "df", append=True, format="f") - - with pytest.raises(ValueError): - df.to_hdf(path, "df", append=True, format="fixed") - - with pytest.raises(TypeError): - df.to_hdf(path, "df", append=True, format="foo") - - with pytest.raises(TypeError): - df.to_hdf(path, "df", append=False, format="bar") - - # File path doesn't exist - path = "" - with pytest.raises(FileNotFoundError): - read_hdf(path, "df") - - def test_api_default_format(self, setup_path): - - # default_format option - with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() - - pd.set_option("io.hdf.default_format", "fixed") - _maybe_remove(store, "df") - store.put("df", df) - assert not store.get_storer("df").is_table - with pytest.raises(ValueError): - store.append("df2", df) - - pd.set_option("io.hdf.default_format", "table") - _maybe_remove(store, "df") - store.put("df", df) - assert store.get_storer("df").is_table - _maybe_remove(store, "df2") - store.append("df2", df) - assert store.get_storer("df").is_table - - pd.set_option("io.hdf.default_format", None) - - with ensure_clean_path(setup_path) as path: - - df = tm.makeDataFrame() - - pd.set_option("io.hdf.default_format", "fixed") - df.to_hdf(path, "df") - with HDFStore(path) as store: - assert not store.get_storer("df").is_table - with pytest.raises(ValueError): - df.to_hdf(path, "df2", append=True) - - pd.set_option("io.hdf.default_format", "table") - df.to_hdf(path, "df3") - with HDFStore(path) as store: - assert store.get_storer("df3").is_table - df.to_hdf(path, "df4", append=True) - with HDFStore(path) as store: - assert store.get_storer("df4").is_table - - pd.set_option("io.hdf.default_format", None) - - def test_keys(self, setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeStringSeries() - store["c"] = tm.makeDataFrame() - - assert len(store) == 3 - expected = {"/a", "/b", "/c"} - assert set(store.keys()) == expected - assert set(store) == expected - - def test_keys_ignore_hdf_softlink(self, setup_path): - - # GH 20523 - # Puts a softlink into HDF file and rereads - - with ensure_clean_store(setup_path) as store: - - df = DataFrame(dict(A=range(5), B=range(5))) - store.put("df", df) - - assert store.keys() == ["/df"] - - store._handle.create_soft_link(store._handle.root, "symlink", "df") - - # Should ignore the softlink - assert store.keys() == ["/df"] - - def test_iter_empty(self, setup_path): - - with ensure_clean_store(setup_path) as store: - # GH 12221 - assert list(store) == [] - - def test_repr(self, setup_path): - - with ensure_clean_store(setup_path) as store: - repr(store) - store.info() - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeStringSeries() - store["c"] = tm.makeDataFrame() - - df = tm.makeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["bool1"] = df["A"] > 0 - df["bool2"] = df["B"] > 0 - df["bool3"] = True - df["int1"] = 1 - df["int2"] = 2 - df["timestamp1"] = Timestamp("20010102") - df["timestamp2"] = Timestamp("20010103") - df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) - df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ["obj1"]] = np.nan - df = df._consolidate()._convert(datetime=True) - - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - store["df"] = df - - # make a random group in hdf space - store._handle.create_group(store._handle.root, "bah") - - assert store.filename in repr(store) - assert 
store.filename in str(store) - store.info() - - # storers - with ensure_clean_store(setup_path) as store: - - df = tm.makeDataFrame() - store.append("df", df) - - s = store.get_storer("df") - repr(s) - str(s) - - @ignore_natural_naming_warning - def test_contains(self, setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeDataFrame() - store["foo/bar"] = tm.makeDataFrame() - assert "a" in store - assert "b" in store - assert "c" not in store - assert "foo/bar" in store - assert "/foo/bar" in store - assert "/foo/b" not in store - assert "bar" not in store - - # gh-2694: tables.NaturalNameWarning - with catch_warnings(record=True): - store["node())"] = tm.makeDataFrame() - assert "node())" in store - - def test_versioning(self, setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeDataFrame() - df = tm.makeTimeDataFrame() - _maybe_remove(store, "df1") - store.append("df1", df[:10]) - store.append("df1", df[10:]) - assert store.root.a._v_attrs.pandas_version == "0.15.2" - assert store.root.b._v_attrs.pandas_version == "0.15.2" - assert store.root.df1._v_attrs.pandas_version == "0.15.2" - - # write a file and wipe its versioning - _maybe_remove(store, "df2") - store.append("df2", df) - - # this is an error because its table_type is appendable, but no - # version info - store.get_node("df2")._v_attrs.pandas_version = None - with pytest.raises(Exception): - store.select("df2") - - def test_mode(self, setup_path): - - df = tm.makeTimeDataFrame() - - def check(mode): - - with ensure_clean_path(setup_path) as path: - - # constructor - if mode in ["r", "r+"]: - with pytest.raises(IOError): - HDFStore(path, mode=mode) - - else: - store = HDFStore(path, mode=mode) - assert store._handle.mode == mode - store.close() - - with ensure_clean_path(setup_path) as path: - - # context - if mode in ["r", "r+"]: - with pytest.raises(IOError): - with HDFStore(path, mode=mode) as store: # noqa - pass - else: - with HDFStore(path, mode=mode) as store: - assert store._handle.mode == mode - - with ensure_clean_path(setup_path) as path: - - # conv write - if mode in ["r", "r+"]: - with pytest.raises(IOError): - df.to_hdf(path, "df", mode=mode) - df.to_hdf(path, "df", mode="w") - else: - df.to_hdf(path, "df", mode=mode) - - # conv read - if mode in ["w"]: - with pytest.raises(ValueError): - read_hdf(path, "df", mode=mode) - else: - result = read_hdf(path, "df", mode=mode) - assert_frame_equal(result, df) - - def check_default_mode(): - - # read_hdf uses default mode - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", mode="w") - result = read_hdf(path, "df") - assert_frame_equal(result, df) - - check("r") - check("r+") - check("a") - check("w") - check_default_mode() - - def test_reopen_handle(self, setup_path): - - with ensure_clean_path(setup_path) as path: - - store = HDFStore(path, mode="a") - store["a"] = tm.makeTimeSeries() - - # invalid mode change - with pytest.raises(PossibleDataLossError): - store.open("w") - - store.close() - assert not store.is_open - - # truncation ok here - store.open("w") - assert store.is_open - assert len(store) == 0 - store.close() - assert not store.is_open - - store = HDFStore(path, mode="a") - store["a"] = tm.makeTimeSeries() - - # reopen as read - store.open("r") - assert store.is_open - assert len(store) == 1 - assert store._mode == "r" - store.close() - assert not store.is_open - - # reopen as append - store.open("a") - assert 
store.is_open - assert len(store) == 1 - assert store._mode == "a" - store.close() - assert not store.is_open - - # reopen as append (again) - store.open("a") - assert store.is_open - assert len(store) == 1 - assert store._mode == "a" - store.close() - assert not store.is_open - - def test_open_args(self, setup_path): - - with ensure_clean_path(setup_path) as path: - - df = tm.makeDataFrame() - - # create an in memory store - store = HDFStore( - path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 - ) - store["df"] = df - store.append("df2", df) - - tm.assert_frame_equal(store["df"], df) - tm.assert_frame_equal(store["df2"], df) - - store.close() - - # the file should not have actually been written - assert not os.path.exists(path) - - def test_flush(self, setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store.flush() - store.flush(fsync=True) - - def test_get(self, setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - left = store.get("a") - right = store["a"] - tm.assert_series_equal(left, right) - - left = store.get("/a") - right = store["/a"] - tm.assert_series_equal(left, right) - - with pytest.raises(KeyError, match="'No object named b in the file'"): - store.get("b") - - @pytest.mark.parametrize( - "where, expected", - [ - ( - "/", - { - "": ({"first_group", "second_group"}, set()), - "/first_group": (set(), {"df1", "df2"}), - "/second_group": ({"third_group"}, {"df3", "s1"}), - "/second_group/third_group": (set(), {"df4"}), - }, - ), - ( - "/second_group", - { - "/second_group": ({"third_group"}, {"df3", "s1"}), - "/second_group/third_group": (set(), {"df4"}), - }, - ), - ], - ) - def test_walk(self, where, expected, setup_path): - # GH10143 - objs = { - "df1": pd.DataFrame([1, 2, 3]), - "df2": pd.DataFrame([4, 5, 6]), - "df3": pd.DataFrame([6, 7, 8]), - "df4": pd.DataFrame([9, 10, 11]), - "s1": pd.Series([10, 9, 8]), - # Next 3 items aren't pandas objects and should be ignored - "a1": np.array([[1, 2, 3], [4, 5, 6]]), - "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), - "tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"), - } - - with ensure_clean_store("walk_groups.hdf", mode="w") as store: - store.put("/first_group/df1", objs["df1"]) - store.put("/first_group/df2", objs["df2"]) - store.put("/second_group/df3", objs["df3"]) - store.put("/second_group/s1", objs["s1"]) - store.put("/second_group/third_group/df4", objs["df4"]) - # Create non-pandas objects - store._handle.create_array("/first_group", "a1", objs["a1"]) - store._handle.create_table("/first_group", "tb1", obj=objs["tb1"]) - store._handle.create_table("/second_group", "tb2", obj=objs["tb2"]) - - assert len(list(store.walk(where=where))) == len(expected) - for path, groups, leaves in store.walk(where=where): - assert path in expected - expected_groups, expected_frames = expected[path] - assert expected_groups == set(groups) - assert expected_frames == set(leaves) - for leaf in leaves: - frame_path = "/".join([path, leaf]) - obj = store.get(frame_path) - if "df" in leaf: - tm.assert_frame_equal(obj, objs[leaf]) - else: - tm.assert_series_equal(obj, objs[leaf]) - - def test_getattr(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - s = tm.makeTimeSeries() - store["a"] = s - - # test attribute access - result = store.a - tm.assert_series_equal(result, s) - result = getattr(store, "a") - tm.assert_series_equal(result, s) - - df = tm.makeTimeDataFrame() - store["df"] = df - result = 
store.df - tm.assert_frame_equal(result, df) - - # errors - for x in ["d", "mode", "path", "handle", "complib"]: - with pytest.raises(AttributeError): - getattr(store, x) - - # not stores - for x in ["mode", "path", "handle", "complib"]: - getattr(store, "_{x}".format(x=x)) - - def test_put(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - ts = tm.makeTimeSeries() - df = tm.makeTimeDataFrame() - store["a"] = ts - store["b"] = df[:10] - store["foo/bar/bah"] = df[:10] - store["foo"] = df[:10] - store["/foo"] = df[:10] - store.put("c", df[:10], format="table") - - # not OK, not a table - with pytest.raises(ValueError): - store.put("b", df[10:], append=True) - - # node does not currently exist, test _is_table_type returns False - # in this case - _maybe_remove(store, "f") - with pytest.raises(ValueError): - store.put("f", df[10:], append=True) - - # can't put to a table (use append instead) - with pytest.raises(ValueError): - store.put("c", df[10:], append=True) - - # overwrite table - store.put("c", df[:10], format="table", append=False) - tm.assert_frame_equal(df[:10], store["c"]) - - def test_put_string_index(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - index = Index( - ["I am a very long string index: {i}".format(i=i) for i in range(20)] - ) - s = Series(np.arange(20), index=index) - df = DataFrame({"A": s, "B": s}) - - store["a"] = s - tm.assert_series_equal(store["a"], s) - - store["b"] = df - tm.assert_frame_equal(store["b"], df) - - # mixed length - index = Index( - ["abcdefghijklmnopqrstuvwxyz1234567890"] - + ["I am a very long string index: {i}".format(i=i) for i in range(20)] - ) - s = Series(np.arange(21), index=index) - df = DataFrame({"A": s, "B": s}) - store["a"] = s - tm.assert_series_equal(store["a"], s) - - store["b"] = df - tm.assert_frame_equal(store["b"], df) - - def test_put_compression(self, setup_path): - - with ensure_clean_store(setup_path) as store: - df = tm.makeTimeDataFrame() - - store.put("c", df, format="table", complib="zlib") - tm.assert_frame_equal(store["c"], df) - - # can't compress if format='fixed' - with pytest.raises(ValueError): - store.put("b", df, format="fixed", complib="zlib") - - @td.skip_if_windows_python_3 - def test_put_compression_blosc(self, setup_path): - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - - # can't compress if format='fixed' - with pytest.raises(ValueError): - store.put("b", df, format="fixed", complib="blosc") - - store.put("c", df, format="table", complib="blosc") - tm.assert_frame_equal(store["c"], df) - - def test_complibs_default_settings(self, setup_path): - # GH15943 - df = tm.makeDataFrame() - - # Set complevel and check if complib is automatically set to - # default value - with ensure_clean_path(setup_path) as tmpfile: - df.to_hdf(tmpfile, "df", complevel=9) - result = pd.read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 9 - assert node.filters.complib == "zlib" - - # Set complib and check to see if compression is disabled - with ensure_clean_path(setup_path) as tmpfile: - df.to_hdf(tmpfile, "df", complib="zlib") - result = pd.read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 0 - assert node.filters.complib is 
None - - # Check if not setting complib or complevel results in no compression - with ensure_clean_path(setup_path) as tmpfile: - df.to_hdf(tmpfile, "df") - result = pd.read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 0 - assert node.filters.complib is None - - # Check if file-defaults can be overridden on a per table basis - with ensure_clean_path(setup_path) as tmpfile: - store = pd.HDFStore(tmpfile) - store.append("dfc", df, complevel=9, complib="blosc") - store.append("df", df) - store.close() - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 0 - assert node.filters.complib is None - for node in h5file.walk_nodes(where="/dfc", classname="Leaf"): - assert node.filters.complevel == 9 - assert node.filters.complib == "blosc" - - def test_complibs(self, setup_path): - # GH14478 - df = tm.makeDataFrame() - - # Building list of all complibs and complevels tuples - all_complibs = tables.filters.all_complibs - # Remove lzo if its not available on this platform - if not tables.which_lib_version("lzo"): - all_complibs.remove("lzo") - # Remove bzip2 if its not available on this platform - if not tables.which_lib_version("bzip2"): - all_complibs.remove("bzip2") - - all_levels = range(0, 10) - all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels] - - for (lib, lvl) in all_tests: - with ensure_clean_path(setup_path) as tmpfile: - gname = "foo" - - # Write and read file to see if data is consistent - df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) - result = pd.read_hdf(tmpfile, gname) - tm.assert_frame_equal(result, df) - - # Open file and check metadata - # for correct amount of compression - h5table = tables.open_file(tmpfile, mode="r") - for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"): - assert node.filters.complevel == lvl - if lvl == 0: - assert node.filters.complib is None - else: - assert node.filters.complib == lib - h5table.close() - - def test_put_integer(self, setup_path): - # non-date, non-string index - df = DataFrame(np.random.randn(50, 100)) - self._check_roundtrip(df, tm.assert_frame_equal, setup_path) - - @xfail_non_writeable - def test_put_mixed_type(self, setup_path): - df = tm.makeTimeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["bool1"] = df["A"] > 0 - df["bool2"] = df["B"] > 0 - df["bool3"] = True - df["int1"] = 1 - df["int2"] = 2 - df["timestamp1"] = Timestamp("20010102") - df["timestamp2"] = Timestamp("20010103") - df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) - df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ["obj1"]] = np.nan - df = df._consolidate()._convert(datetime=True) - - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df") - - # PerformanceWarning - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - store.put("df", df) - - expected = store.get("df") - tm.assert_frame_equal(expected, df) - - @pytest.mark.filterwarnings( - "ignore:object name:tables.exceptions.NaturalNameWarning" - ) - def test_append(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - # this is allowed by almost always don't want to do it - # tables.NaturalNameWarning): - with catch_warnings(record=True): - - df = tm.makeTimeDataFrame() - _maybe_remove(store, "df1") - 
store.append("df1", df[:10]) - store.append("df1", df[10:]) - tm.assert_frame_equal(store["df1"], df) - - _maybe_remove(store, "df2") - store.put("df2", df[:10], format="table") - store.append("df2", df[10:]) - tm.assert_frame_equal(store["df2"], df) - - _maybe_remove(store, "df3") - store.append("/df3", df[:10]) - store.append("/df3", df[10:]) - tm.assert_frame_equal(store["df3"], df) - - # this is allowed by almost always don't want to do it - # tables.NaturalNameWarning - _maybe_remove(store, "/df3 foo") - store.append("/df3 foo", df[:10]) - store.append("/df3 foo", df[10:]) - tm.assert_frame_equal(store["df3 foo"], df) - - # dtype issues - mizxed type in a single object column - df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) - df["mixed_column"] = "testing" - df.loc[2, "mixed_column"] = np.nan - _maybe_remove(store, "df") - store.append("df", df) - tm.assert_frame_equal(store["df"], df) - - # uints - test storage of uints - uint_data = DataFrame( - { - "u08": Series( - np.random.randint(0, high=255, size=5), dtype=np.uint8 - ), - "u16": Series( - np.random.randint(0, high=65535, size=5), dtype=np.uint16 - ), - "u32": Series( - np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32 - ), - "u64": Series( - [2 ** 58, 2 ** 59, 2 ** 60, 2 ** 61, 2 ** 62], - dtype=np.uint64, - ), - }, - index=np.arange(5), - ) - _maybe_remove(store, "uints") - store.append("uints", uint_data) - tm.assert_frame_equal(store["uints"], uint_data) - - # uints - test storage of uints in indexable columns - _maybe_remove(store, "uints") - # 64-bit indices not yet supported - store.append("uints", uint_data, data_columns=["u08", "u16", "u32"]) - tm.assert_frame_equal(store["uints"], uint_data) - - def test_append_series(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - # basic - ss = tm.makeStringSeries() - ts = tm.makeTimeSeries() - ns = Series(np.arange(100)) - - store.append("ss", ss) - result = store["ss"] - tm.assert_series_equal(result, ss) - assert result.name is None - - store.append("ts", ts) - result = store["ts"] - tm.assert_series_equal(result, ts) - assert result.name is None - - ns.name = "foo" - store.append("ns", ns) - result = store["ns"] - tm.assert_series_equal(result, ns) - assert result.name == ns.name - - # select on the values - expected = ns[ns > 60] - result = store.select("ns", "foo>60") - tm.assert_series_equal(result, expected) - - # select on the index and values - expected = ns[(ns > 70) & (ns.index < 90)] - result = store.select("ns", "foo>70 and index<90") - tm.assert_series_equal(result, expected) - - # multi-index - mi = DataFrame(np.random.randn(5, 1), columns=["A"]) - mi["B"] = np.arange(len(mi)) - mi["C"] = "foo" - mi.loc[3:5, "C"] = "bar" - mi.set_index(["C", "B"], inplace=True) - s = mi.stack() - s.index = s.index.droplevel(2) - store.append("mi", s) - tm.assert_series_equal(store["mi"], s) - - def test_store_index_types(self, setup_path): - # GH5386 - # test storing various index types - - with ensure_clean_store(setup_path) as store: - - def check(format, index): - df = DataFrame(np.random.randn(10, 2), columns=list("AB")) - df.index = index(len(df)) - - _maybe_remove(store, "df") - store.put("df", df, format=format) - assert_frame_equal(df, store["df"]) - - for index in [ - tm.makeFloatIndex, - tm.makeStringIndex, - tm.makeIntIndex, - tm.makeDateIndex, - ]: - - check("table", index) - check("fixed", index) - - # period index currently broken for table - # seee GH7796 FIXME - check("fixed", tm.makePeriodIndex) - # 
check('table',tm.makePeriodIndex) - - # unicode - index = tm.makeUnicodeIndex - check("table", index) - check("fixed", index) - - @pytest.mark.skipif( - not is_platform_little_endian(), reason="reason platform is not little endian" - ) - def test_encoding(self, setup_path): - - with ensure_clean_store(setup_path) as store: - df = DataFrame(dict(A="foo", B="bar"), index=range(5)) - df.loc[2, "A"] = np.nan - df.loc[3, "B"] = np.nan - _maybe_remove(store, "df") - store.append("df", df, encoding="ascii") - tm.assert_frame_equal(store["df"], df) - - expected = df.reindex(columns=["A"]) - result = store.select("df", Term("columns=A", encoding="ascii")) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "val", - [ - [b"E\xc9, 17", b"", b"a", b"b", b"c"], - [b"E\xc9, 17", b"a", b"b", b"c"], - [b"EE, 17", b"", b"a", b"b", b"c"], - [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], - [b"", b"a", b"b", b"c"], - [b"\xf8\xfc", b"a", b"b", b"c"], - [b"A\xf8\xfc", b"", b"a", b"b", b"c"], - [np.nan, b"", b"b", b"c"], - [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], - ], - ) - @pytest.mark.parametrize("dtype", ["category", object]) - def test_latin_encoding(self, setup_path, dtype, val): - enc = "latin-1" - nan_rep = "" - key = "data" - - val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] - ser = pd.Series(val, dtype=dtype) - - with ensure_clean_path(setup_path) as store: - ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) - retr = read_hdf(store, key) - - s_nan = ser.replace(nan_rep, np.nan) - - if is_categorical_dtype(s_nan): - assert is_categorical_dtype(retr) - assert_series_equal(s_nan, retr, check_dtype=False, check_categorical=False) - else: - assert_series_equal(s_nan, retr) - - # FIXME: don't leave commented-out - # fails: - # for x in examples: - # roundtrip(s, nan_rep=b'\xf8\xfc') - - def test_append_some_nans(self, setup_path): - - with ensure_clean_store(setup_path) as store: - df = DataFrame( - { - "A": Series(np.random.randn(20)).astype("int32"), - "A1": np.random.randn(20), - "A2": np.random.randn(20), - "B": "foo", - "C": "bar", - "D": Timestamp("20010101"), - "E": datetime.datetime(2001, 1, 2, 0, 0), - }, - index=np.arange(20), - ) - # some nans - _maybe_remove(store, "df1") - df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan - store.append("df1", df[:10]) - store.append("df1", df[10:]) - tm.assert_frame_equal(store["df1"], df) - - # first column - df1 = df.copy() - df1.loc[:, "A1"] = np.nan - _maybe_remove(store, "df1") - store.append("df1", df1[:10]) - store.append("df1", df1[10:]) - tm.assert_frame_equal(store["df1"], df1) - - # 2nd column - df2 = df.copy() - df2.loc[:, "A2"] = np.nan - _maybe_remove(store, "df2") - store.append("df2", df2[:10]) - store.append("df2", df2[10:]) - tm.assert_frame_equal(store["df2"], df2) - - # datetimes - df3 = df.copy() - df3.loc[:, "E"] = np.nan - _maybe_remove(store, "df3") - store.append("df3", df3[:10]) - store.append("df3", df3[10:]) - tm.assert_frame_equal(store["df3"], df3) - - def test_append_all_nans(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - df = DataFrame( - {"A1": np.random.randn(20), "A2": np.random.randn(20)}, - index=np.arange(20), - ) - df.loc[0:15, :] = np.nan - - # nan some entire rows (dropna=True) - _maybe_remove(store, "df") - store.append("df", df[:10], dropna=True) - store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df[-4:]) - - # nan some entire rows (dropna=False) - _maybe_remove(store, "df2") - store.append("df2", df[:10], 
-            store.append("df2", df[10:], dropna=False)
-            tm.assert_frame_equal(store["df2"], df)
-
-            # tests the option io.hdf.dropna_table
-            pd.set_option("io.hdf.dropna_table", False)
-            _maybe_remove(store, "df3")
-            store.append("df3", df[:10])
-            store.append("df3", df[10:])
-            tm.assert_frame_equal(store["df3"], df)
-
-            pd.set_option("io.hdf.dropna_table", True)
-            _maybe_remove(store, "df4")
-            store.append("df4", df[:10])
-            store.append("df4", df[10:])
-            tm.assert_frame_equal(store["df4"], df[-4:])
-
-            # nan some entire rows (strings are still written!)
-            df = DataFrame(
-                {
-                    "A1": np.random.randn(20),
-                    "A2": np.random.randn(20),
-                    "B": "foo",
-                    "C": "bar",
-                },
-                index=np.arange(20),
-            )
-
-            df.loc[0:15, :] = np.nan
-
-            _maybe_remove(store, "df")
-            store.append("df", df[:10], dropna=True)
-            store.append("df", df[10:], dropna=True)
-            tm.assert_frame_equal(store["df"], df)
-
-            _maybe_remove(store, "df2")
-            store.append("df2", df[:10], dropna=False)
-            store.append("df2", df[10:], dropna=False)
-            tm.assert_frame_equal(store["df2"], df)
-
-            # nan some entire rows (but since we have dates they are still
-            # written!)
-            df = DataFrame(
-                {
-                    "A1": np.random.randn(20),
-                    "A2": np.random.randn(20),
-                    "B": "foo",
-                    "C": "bar",
-                    "D": Timestamp("20010101"),
-                    "E": datetime.datetime(2001, 1, 2, 0, 0),
-                },
-                index=np.arange(20),
-            )
-
-            df.loc[0:15, :] = np.nan
-
-            _maybe_remove(store, "df")
-            store.append("df", df[:10], dropna=True)
-            store.append("df", df[10:], dropna=True)
-            tm.assert_frame_equal(store["df"], df)
-
-            _maybe_remove(store, "df2")
-            store.append("df2", df[:10], dropna=False)
-            store.append("df2", df[10:], dropna=False)
-            tm.assert_frame_equal(store["df2"], df)
-
-        # Test to make sure defaults are to not drop.
-        # Corresponding to Issue 9382
-        df_with_missing = DataFrame(
-            {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}
-        )
-
-        with ensure_clean_path(setup_path) as path:
-            df_with_missing.to_hdf(path, "df_with_missing", format="table")
-            reloaded = read_hdf(path, "df_with_missing")
-            tm.assert_frame_equal(df_with_missing, reloaded)
-
-    def test_read_missing_key_close_store(self, setup_path):
-        # GH 25766
-        with ensure_clean_path(setup_path) as path:
-            df = pd.DataFrame({"a": range(2), "b": range(2)})
-            df.to_hdf(path, "k1")
-
-            with pytest.raises(KeyError, match="'No object named k2 in the file'"):
-                pd.read_hdf(path, "k2")
-
-            # smoke test to test that file is properly closed after
-            # read with KeyError before another write
-            df.to_hdf(path, "k2")
-
-    def test_append_frame_column_oriented(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-
-            # column oriented
-            df = tm.makeTimeDataFrame()
-            _maybe_remove(store, "df1")
-            store.append("df1", df.iloc[:, :2], axes=["columns"])
-            store.append("df1", df.iloc[:, 2:])
-            tm.assert_frame_equal(store["df1"], df)
-
-            result = store.select("df1", "columns=A")
-            expected = df.reindex(columns=["A"])
-            tm.assert_frame_equal(expected, result)
-
-            # selection on the non-indexable
-            result = store.select("df1", ("columns=A", "index=df.index[0:4]"))
-            expected = df.reindex(columns=["A"], index=df.index[0:4])
-            tm.assert_frame_equal(expected, result)
-
-            # this isn't supported
-            with pytest.raises(TypeError):
-                store.select("df1", "columns=A and index>df.index[4]")
-
-    def test_append_with_different_block_ordering(self, setup_path):
-
-        # GH 4096; using same frames, but different block orderings
-        with ensure_clean_store(setup_path) as store:
-
-            for i in range(10):
-
-                df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
columns=list("AB")) - df["index"] = range(10) - df["index"] += i * 10 - df["int64"] = Series([1] * len(df), dtype="int64") - df["int16"] = Series([1] * len(df), dtype="int16") - - if i % 2 == 0: - del df["int64"] - df["int64"] = Series([1] * len(df), dtype="int64") - if i % 3 == 0: - a = df.pop("A") - df["A"] = a - - df.set_index("index", inplace=True) - - store.append("df", df) - - # test a different ordering but with more fields (like invalid - # combinate) - with ensure_clean_store(setup_path) as store: - - df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64") - df["int64"] = Series([1] * len(df), dtype="int64") - df["int16"] = Series([1] * len(df), dtype="int16") - store.append("df", df) - - # store additional fields in different blocks - df["int16_2"] = Series([1] * len(df), dtype="int16") - with pytest.raises(ValueError): - store.append("df", df) - - # store multile additional fields in different blocks - df["float_3"] = Series([1.0] * len(df), dtype="float64") - with pytest.raises(ValueError): - store.append("df", df) - - def test_append_with_strings(self, setup_path): - - with ensure_clean_store(setup_path) as store: - with catch_warnings(record=True): - - def check_col(key, name, size): - assert ( - getattr(store.get_storer(key).table.description, name).itemsize - == size - ) - - # avoid truncation on elements - df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) - store.append("df_big", df) - tm.assert_frame_equal(store.select("df_big"), df) - check_col("df_big", "values_block_1", 15) - - # appending smaller string ok - df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]]) - store.append("df_big", df2) - expected = concat([df, df2]) - tm.assert_frame_equal(store.select("df_big"), expected) - check_col("df_big", "values_block_1", 15) - - # avoid truncation on elements - df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) - store.append("df_big2", df, min_itemsize={"values": 50}) - tm.assert_frame_equal(store.select("df_big2"), df) - check_col("df_big2", "values_block_1", 50) - - # bigger string on next append - store.append("df_new", df) - df_new = DataFrame( - [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]] - ) - with pytest.raises(ValueError): - store.append("df_new", df_new) - - # min_itemsize on Series index (GH 11412) - df = tm.makeMixedDataFrame().set_index("C") - store.append("ss", df["B"], min_itemsize={"index": 4}) - tm.assert_series_equal(store.select("ss"), df["B"]) - - # same as above, with data_columns=True - store.append( - "ss2", df["B"], data_columns=True, min_itemsize={"index": 4} - ) - tm.assert_series_equal(store.select("ss2"), df["B"]) - - # min_itemsize in index without appending (GH 10381) - store.put("ss3", df, format="table", min_itemsize={"index": 6}) - # just make sure there is a longer string: - df2 = df.copy().reset_index().assign(C="longer").set_index("C") - store.append("ss3", df2) - tm.assert_frame_equal(store.select("ss3"), pd.concat([df, df2])) - - # same as above, with a Series - store.put("ss4", df["B"], format="table", min_itemsize={"index": 6}) - store.append("ss4", df2["B"]) - tm.assert_series_equal( - store.select("ss4"), pd.concat([df["B"], df2["B"]]) - ) - - # with nans - _maybe_remove(store, "df") - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df.loc[1:4, "string"] = np.nan - df["string2"] = "bar" - df.loc[4:8, "string2"] = np.nan - df["string3"] = "bah" - df.loc[1:, "string3"] = np.nan - store.append("df", df) - result = store.select("df") - tm.assert_frame_equal(result, df) 
-
-        with ensure_clean_store(setup_path) as store:
-
-            def check_col(key, name, size):
-                assert (
-                    getattr(store.get_storer(key).table.description, name).itemsize
-                    == size
-                )
-
-            df = DataFrame(dict(A="foo", B="bar"), index=range(10))
-
-            # a min_itemsize that creates a data_column
-            _maybe_remove(store, "df")
-            store.append("df", df, min_itemsize={"A": 200})
-            check_col("df", "A", 200)
-            assert store.get_storer("df").data_columns == ["A"]
-
-            # a min_itemsize that creates a data_column2
-            _maybe_remove(store, "df")
-            store.append("df", df, data_columns=["B"], min_itemsize={"A": 200})
-            check_col("df", "A", 200)
-            assert store.get_storer("df").data_columns == ["B", "A"]
-
-            # a min_itemsize that creates a data_column2
-            _maybe_remove(store, "df")
-            store.append("df", df, data_columns=["B"], min_itemsize={"values": 200})
-            check_col("df", "B", 200)
-            check_col("df", "values_block_0", 200)
-            assert store.get_storer("df").data_columns == ["B"]
-
-            # infer the .typ on subsequent appends
-            _maybe_remove(store, "df")
-            store.append("df", df[:5], min_itemsize=200)
-            store.append("df", df[5:], min_itemsize=200)
-            tm.assert_frame_equal(store["df"], df)
-
-            # invalid min_itemsize keys
-            df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
-            _maybe_remove(store, "df")
-            with pytest.raises(ValueError):
-                store.append("df", df, min_itemsize={"foo": 20, "foobar": 20})
-
-    def test_append_with_empty_string(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-
-            # with all empty strings (GH 12242)
-            df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
-            store.append("df", df[:-1], min_itemsize={"x": 1})
-            store.append("df", df[-1:], min_itemsize={"x": 1})
-            tm.assert_frame_equal(store.select("df"), df)
-
-    def test_to_hdf_with_min_itemsize(self, setup_path):
-
-        with ensure_clean_path(setup_path) as path:
-
-            # min_itemsize in index with to_hdf (GH 10381)
-            df = tm.makeMixedDataFrame().set_index("C")
-            df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6})
-            # just make sure there is a longer string:
-            df2 = df.copy().reset_index().assign(C="longer").set_index("C")
-            df2.to_hdf(path, "ss3", append=True, format="table")
-            tm.assert_frame_equal(pd.read_hdf(path, "ss3"), pd.concat([df, df2]))
-
-            # same as above, with a Series
-            df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6})
-            df2["B"].to_hdf(path, "ss4", append=True, format="table")
-            tm.assert_series_equal(
-                pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]])
-            )
-
-    @pytest.mark.parametrize(
-        "format", [pytest.param("fixed", marks=xfail_non_writeable), "table"]
-    )
-    def test_to_hdf_errors(self, format, setup_path):
-
-        data = ["\ud800foo"]
-        ser = pd.Series(data, index=pd.Index(data))
-        with ensure_clean_path(setup_path) as path:
-            # GH 20835
-            ser.to_hdf(path, "table", format=format, errors="surrogatepass")
-
-            result = pd.read_hdf(path, "table", errors="surrogatepass")
-            tm.assert_series_equal(result, ser)
-
-    def test_append_with_data_columns(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-            df = tm.makeTimeDataFrame()
-            df.iloc[0, df.columns.get_loc("B")] = 1.0
-            _maybe_remove(store, "df")
-            store.append("df", df[:2], data_columns=["B"])
-            store.append("df", df[2:])
-            tm.assert_frame_equal(store["df"], df)
-
-            # check that we have indices created
-            assert store._handle.root.df.table.cols.index.is_indexed is True
-            assert store._handle.root.df.table.cols.B.is_indexed is True
-
-            # data column searching
-            result = store.select("df", "B>0")
expected = df[df.B > 0] - tm.assert_frame_equal(result, expected) - - # data column searching (with an indexable and a data_columns) - result = store.select("df", "B>0 and index>df.index[3]") - df_new = df.reindex(index=df.index[4:]) - expected = df_new[df_new.B > 0] - tm.assert_frame_equal(result, expected) - - # data column selection with a string data_column - df_new = df.copy() - df_new["string"] = "foo" - df_new.loc[1:4, "string"] = np.nan - df_new.loc[5:6, "string"] = "bar" - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["string"]) - result = store.select("df", "string='foo'") - expected = df_new[df_new.string == "foo"] - tm.assert_frame_equal(result, expected) - - # using min_itemsize and a data column - def check_col(key, name, size): - assert ( - getattr(store.get_storer(key).table.description, name).itemsize - == size - ) - - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df") - store.append( - "df", df_new, data_columns=["string"], min_itemsize={"string": 30} - ) - check_col("df", "string", 30) - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["string"], min_itemsize=30) - check_col("df", "string", 30) - _maybe_remove(store, "df") - store.append( - "df", df_new, data_columns=["string"], min_itemsize={"values": 30} - ) - check_col("df", "string", 30) - - with ensure_clean_store(setup_path) as store: - df_new["string2"] = "foobarbah" - df_new["string_block1"] = "foobarbah1" - df_new["string_block2"] = "foobarbah2" - _maybe_remove(store, "df") - store.append( - "df", - df_new, - data_columns=["string", "string2"], - min_itemsize={"string": 30, "string2": 40, "values": 50}, - ) - check_col("df", "string", 30) - check_col("df", "string2", 40) - check_col("df", "values_block_1", 50) - - with ensure_clean_store(setup_path) as store: - # multiple data columns - df_new = df.copy() - df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 - df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 - df_new["string"] = "foo" - - sl = df_new.columns.get_loc("string") - df_new.iloc[1:4, sl] = np.nan - df_new.iloc[5:6, sl] = "bar" - - df_new["string2"] = "foo" - sl = df_new.columns.get_loc("string2") - df_new.iloc[2:5, sl] = np.nan - df_new.iloc[7:8, sl] = "bar" - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) - result = store.select( - "df", "string='foo' and string2='foo' and A>0 and B<0" - ) - expected = df_new[ - (df_new.string == "foo") - & (df_new.string2 == "foo") - & (df_new.A > 0) - & (df_new.B < 0) - ] - tm.assert_frame_equal(result, expected, check_index_type=False) - - # yield an empty frame - result = store.select("df", "string='foo' and string2='cool'") - expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] - tm.assert_frame_equal(result, expected, check_index_type=False) - - with ensure_clean_store(setup_path) as store: - # doc example - df_dc = df.copy() - df_dc["string"] = "foo" - df_dc.loc[4:6, "string"] = np.nan - df_dc.loc[7:9, "string"] = "bar" - df_dc["string2"] = "cool" - df_dc["datetime"] = Timestamp("20010102") - df_dc = df_dc._convert(datetime=True) - df_dc.loc[3:5, ["A", "B", "datetime"]] = np.nan - - _maybe_remove(store, "df_dc") - store.append( - "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] - ) - result = store.select("df_dc", "B>0") - - expected = df_dc[df_dc.B > 0] - tm.assert_frame_equal(result, expected, check_index_type=False) - - result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) - 
expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] - tm.assert_frame_equal(result, expected, check_index_type=False) - - with ensure_clean_store(setup_path) as store: - # doc example part 2 - np.random.seed(1234) - index = date_range("1/1/2000", periods=8) - df_dc = DataFrame( - np.random.randn(8, 3), index=index, columns=["A", "B", "C"] - ) - df_dc["string"] = "foo" - df_dc.loc[4:6, "string"] = np.nan - df_dc.loc[7:9, "string"] = "bar" - df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() - df_dc["string2"] = "cool" - - # on-disk operations - store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) - - result = store.select("df_dc", "B>0") - expected = df_dc[df_dc.B > 0] - tm.assert_frame_equal(result, expected) - - result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] - tm.assert_frame_equal(result, expected) - - def test_create_table_index(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - with catch_warnings(record=True): - - def col(t, column): - return getattr(store.get_storer(t).table.cols, column) - - # data columns - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df["string2"] = "bar" - store.append("f", df, data_columns=["string", "string2"]) - assert col("f", "index").is_indexed is True - assert col("f", "string").is_indexed is True - assert col("f", "string2").is_indexed is True - - # specify index=columns - store.append( - "f2", df, index=["string"], data_columns=["string", "string2"] - ) - assert col("f2", "index").is_indexed is False - assert col("f2", "string").is_indexed is True - assert col("f2", "string2").is_indexed is False - - # try to index a non-table - _maybe_remove(store, "f2") - store.put("f2", df) - with pytest.raises(TypeError): - store.create_table_index("f2") - - def test_append_hierarchical(self, setup_path): - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["foo", "bar"], - ) - df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) - - with ensure_clean_store(setup_path) as store: - store.append("mi", df) - result = store.select("mi") - tm.assert_frame_equal(result, df) - - # GH 3748 - result = store.select("mi", columns=["A", "B"]) - expected = df.reindex(columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - with ensure_clean_path("test.hdf") as path: - df.to_hdf(path, "df", format="table") - result = read_hdf(path, "df", columns=["A", "B"]) - expected = df.reindex(columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - def test_column_multiindex(self, setup_path): - # GH 4710 - # recreate multi-indexes properly - - index = MultiIndex.from_tuples( - [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"] - ) - df = DataFrame(np.arange(12).reshape(3, 4), columns=index) - expected = df.copy() - if isinstance(expected.index, RangeIndex): - expected.index = Int64Index(expected.index) - - with ensure_clean_store(setup_path) as store: - - store.put("df", df) - tm.assert_frame_equal( - store["df"], expected, check_index_type=True, check_column_type=True - ) - - store.put("df1", df, format="table") - tm.assert_frame_equal( - store["df1"], expected, check_index_type=True, check_column_type=True - ) - - with pytest.raises(ValueError): - store.put("df2", df, format="table", data_columns=["A"]) - with 
-                store.put("df3", df, format="table", data_columns=True)
-
-        # appending multi-column on existing table (see GH 6167)
-        with ensure_clean_store(setup_path) as store:
-            store.append("df2", df)
-            store.append("df2", df)
-
-            tm.assert_frame_equal(store["df2"], concat((df, df)))
-
-        # non_index_axes name
-        df = DataFrame(
-            np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo")
-        )
-        expected = df.copy()
-        if isinstance(expected.index, RangeIndex):
-            expected.index = Int64Index(expected.index)
-
-        with ensure_clean_store(setup_path) as store:
-
-            store.put("df1", df, format="table")
-            tm.assert_frame_equal(
-                store["df1"], expected, check_index_type=True, check_column_type=True
-            )
-
-    def test_store_multiindex(self, setup_path):
-
-        # validate multi-index names
-        # GH 5527
-        with ensure_clean_store(setup_path) as store:
-
-            def make_index(names=None):
-                return MultiIndex.from_tuples(
-                    [
-                        (datetime.datetime(2013, 12, d), s, t)
-                        for d in range(1, 3)
-                        for s in range(2)
-                        for t in range(3)
-                    ],
-                    names=names,
-                )
-
-            # no names
-            _maybe_remove(store, "df")
-            df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index())
-            store.append("df", df)
-            tm.assert_frame_equal(store.select("df"), df)
-
-            # partial names
-            _maybe_remove(store, "df")
-            df = DataFrame(
-                np.zeros((12, 2)),
-                columns=["a", "b"],
-                index=make_index(["date", None, None]),
-            )
-            store.append("df", df)
-            tm.assert_frame_equal(store.select("df"), df)
-
-            # series
-            _maybe_remove(store, "s")
-            s = Series(np.zeros(12), index=make_index(["date", None, None]))
-            store.append("s", s)
-            xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"]))
-            tm.assert_series_equal(store.select("s"), xp)
-
-            # dup with column
-            _maybe_remove(store, "df")
-            df = DataFrame(
-                np.zeros((12, 2)),
-                columns=["a", "b"],
-                index=make_index(["date", "a", "t"]),
-            )
-            with pytest.raises(ValueError):
-                store.append("df", df)
-
-            # dup within level
-            _maybe_remove(store, "df")
-            df = DataFrame(
-                np.zeros((12, 2)),
-                columns=["a", "b"],
-                index=make_index(["date", "date", "date"]),
-            )
-            with pytest.raises(ValueError):
-                store.append("df", df)
-
-            # fully named
-            _maybe_remove(store, "df")
-            df = DataFrame(
-                np.zeros((12, 2)),
-                columns=["a", "b"],
-                index=make_index(["date", "s", "t"]),
-            )
-            store.append("df", df)
-            tm.assert_frame_equal(store.select("df"), df)
-
-    def test_select_columns_in_where(self, setup_path):
-
-        # GH 6169
-        # recreate multi-indexes when columns is passed
-        # in the `where` argument
-        index = MultiIndex(
-            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
-            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
-            names=["foo_name", "bar_name"],
-        )
-
-        # With a DataFrame
-        df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
-
-        with ensure_clean_store(setup_path) as store:
-            store.put("df", df, format="table")
-            expected = df[["A"]]
-
-            tm.assert_frame_equal(store.select("df", columns=["A"]), expected)
-
-            tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected)
-
-        # With a Series
-        s = Series(np.random.randn(10), index=index, name="A")
-        with ensure_clean_store(setup_path) as store:
-            store.put("s", s, format="table")
-            tm.assert_series_equal(store.select("s", where="columns=['A']"), s)
-
-    def test_mi_data_columns(self, setup_path):
-        # GH 14435
-        idx = pd.MultiIndex.from_arrays(
-            [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"]
-        )
-        df = pd.DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx)
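-
-        # note: with data_columns=True the MultiIndex level names ("date",
-        # "id") become individually queryable columns, which is what the
-        # where="id == 1" selection below relies on.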
-
-        with ensure_clean_store(setup_path) as store:
-            store.append("df", df, data_columns=True)
-
-            actual = store.select("df", where="id == 1")
-            expected = df.iloc[[1], :]
-            tm.assert_frame_equal(actual, expected)
-
-    def test_pass_spec_to_storer(self, setup_path):
-
-        df = tm.makeDataFrame()
-
-        with ensure_clean_store(setup_path) as store:
-            store.put("df", df)
-            with pytest.raises(TypeError):
-                store.select("df", columns=["A"])
-            with pytest.raises(TypeError):
-                store.select("df", where=[("columns=A")])
-
-    @xfail_non_writeable
-    def test_append_misc(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-            df = tm.makeDataFrame()
-            store.append("df", df, chunksize=1)
-            result = store.select("df")
-            tm.assert_frame_equal(result, df)
-
-            store.append("df1", df, expectedrows=10)
-            result = store.select("df1")
-            tm.assert_frame_equal(result, df)
-
-        # more chunksize in append tests
-        def check(obj, comparator):
-            for c in [10, 200, 1000]:
-                with ensure_clean_store(setup_path, mode="w") as store:
-                    store.append("obj", obj, chunksize=c)
-                    result = store.select("obj")
-                    comparator(result, obj)
-
-        df = tm.makeDataFrame()
-        df["string"] = "foo"
-        df["float322"] = 1.0
-        df["float322"] = df["float322"].astype("float32")
-        df["bool"] = df["float322"] > 0
-        df["time1"] = Timestamp("20130101")
-        df["time2"] = Timestamp("20130102")
-        check(df, tm.assert_frame_equal)
-
-        # empty frame, GH4273
-        with ensure_clean_store(setup_path) as store:
-
-            # 0 len
-            df_empty = DataFrame(columns=list("ABC"))
-            store.append("df", df_empty)
-            with pytest.raises(KeyError, match="'No object named df in the file'"):
-                store.select("df")
-
-            # repeated append of 0/non-zero frames
-            df = DataFrame(np.random.rand(10, 3), columns=list("ABC"))
-            store.append("df", df)
-            assert_frame_equal(store.select("df"), df)
-            store.append("df", df_empty)
-            assert_frame_equal(store.select("df"), df)
-
-            # store
-            df = DataFrame(columns=list("ABC"))
-            store.put("df2", df)
-            assert_frame_equal(store.select("df2"), df)
-
-    def test_append_raise(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-
-            # test append with invalid input to get good error messages
-
-            # list in column
-            df = tm.makeDataFrame()
-            df["invalid"] = [["a"]] * len(df)
-            assert df.dtypes["invalid"] == np.object_
-            with pytest.raises(TypeError):
-                store.append("df", df)
-
-            # multiple invalid columns
-            df["invalid2"] = [["a"]] * len(df)
-            df["invalid3"] = [["a"]] * len(df)
-            with pytest.raises(TypeError):
-                store.append("df", df)
-
-            # datetime with embedded nans as object
-            df = tm.makeDataFrame()
-            s = Series(datetime.datetime(2001, 1, 2), index=df.index)
-            s = s.astype(object)
-            s[0:5] = np.nan
-            df["invalid"] = s
-            assert df.dtypes["invalid"] == np.object_
-            with pytest.raises(TypeError):
-                store.append("df", df)
-
-            # directly ndarray
-            with pytest.raises(TypeError):
-                store.append("df", np.arange(10))
-
-            # series directly
-            with pytest.raises(TypeError):
-                store.append("df", Series(np.arange(10)))
-
-            # appending an incompatible table
-            df = tm.makeDataFrame()
-            store.append("df", df)
-
-            df["foo"] = "foo"
-            with pytest.raises(ValueError):
-                store.append("df", df)
-
-    def test_table_index_incompatible_dtypes(self, setup_path):
-        df1 = DataFrame({"a": [1, 2, 3]})
-        df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3))
-
-        with ensure_clean_store(setup_path) as store:
-            store.put("frame", df1, format="table")
-            with pytest.raises(TypeError):
-                store.put("frame", df2, format="table", append=True)
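-
-    # note: a table's index kind is fixed when the table is first written,
-    # so appending a frame whose index dtype differs (datetime vs. the
-    # original integer index above) raises TypeError instead of coercing.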
-
-    def test_table_values_dtypes_roundtrip(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-            df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
-            store.append("df_f8", df1)
-            assert_series_equal(df1.dtypes, store["df_f8"].dtypes)
-
-            df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8")
-            store.append("df_i8", df2)
-            assert_series_equal(df2.dtypes, store["df_i8"].dtypes)
-
-            # incompatible dtype
-            with pytest.raises(ValueError):
-                store.append("df_i8", df1)
-
-            # check creation/storage/retrieval of float32 (a bit hacky to
-            # actually create them though)
-            df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"])
-            store.append("df_f4", df1)
-            assert_series_equal(df1.dtypes, store["df_f4"].dtypes)
-            assert df1.dtypes[0] == "float32"
-
-            # check with mixed dtypes
-            df1 = DataFrame(
-                {
-                    c: Series(np.random.randint(5), dtype=c)
-                    for c in ["float32", "float64", "int32", "int64", "int16", "int8"]
-                }
-            )
-            df1["string"] = "foo"
-            df1["float322"] = 1.0
-            df1["float322"] = df1["float322"].astype("float32")
-            df1["bool"] = df1["float32"] > 0
-            df1["time1"] = Timestamp("20130101")
-            df1["time2"] = Timestamp("20130102")
-
-            store.append("df_mixed_dtypes1", df1)
-            result = store.select("df_mixed_dtypes1").dtypes.value_counts()
-            result.index = [str(i) for i in result.index]
-            expected = Series(
-                {
-                    "float32": 2,
-                    "float64": 1,
-                    "int32": 1,
-                    "bool": 1,
-                    "int16": 1,
-                    "int8": 1,
-                    "int64": 1,
-                    "object": 1,
-                    "datetime64[ns]": 2,
-                }
-            )
-            result = result.sort_index()
-            expected = expected.sort_index()
-            tm.assert_series_equal(result, expected)
-
-    def test_table_mixed_dtypes(self, setup_path):
-
-        # frame
-        df = tm.makeDataFrame()
-        df["obj1"] = "foo"
-        df["obj2"] = "bar"
-        df["bool1"] = df["A"] > 0
-        df["bool2"] = df["B"] > 0
-        df["bool3"] = True
-        df["int1"] = 1
-        df["int2"] = 2
-        df["timestamp1"] = Timestamp("20010102")
-        df["timestamp2"] = Timestamp("20010103")
-        df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0)
-        df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0)
-        df.loc[3:6, ["obj1"]] = np.nan
-        df = df._consolidate()._convert(datetime=True)
-
-        with ensure_clean_store(setup_path) as store:
-            store.append("df1_mixed", df)
-            tm.assert_frame_equal(store.select("df1_mixed"), df)
-
-    def test_unimplemented_dtypes_table_columns(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-
-            dtypes = [("date", datetime.date(2001, 1, 2))]
-
-            # currently not supported dtypes ####
-            for n, f in dtypes:
-                df = tm.makeDataFrame()
-                df[n] = f
-                with pytest.raises(TypeError):
-                    store.append("df1_{n}".format(n=n), df)
-
-        # frame
-        df = tm.makeDataFrame()
-        df["obj1"] = "foo"
-        df["obj2"] = "bar"
-        df["datetime1"] = datetime.date(2001, 1, 2)
-        df = df._consolidate()._convert(datetime=True)
-
-        with ensure_clean_store(setup_path) as store:
-            # this fails because we have a date in the object block......
-            with pytest.raises(TypeError):
-                store.append("df_unimplemented", df)
-
-    @xfail_non_writeable
-    @pytest.mark.skipif(
-        LooseVersion(np.__version__) == LooseVersion("1.15.0"),
-        reason=(
-            "Skipping pytables test when numpy version is "
-            "exactly equal to 1.15.0: gh-22098"
-        ),
-    )
-    def test_calendar_roundtrip_issue(self, setup_path):
-
-        # 8591
-        # doc example from tseries holiday section
-        weekmask_egypt = "Sun Mon Tue Wed Thu"
-        holidays = [
-            "2012-05-01",
-            datetime.datetime(2013, 5, 1),
-            np.datetime64("2014-05-01"),
-        ]
-        bday_egypt = pd.offsets.CustomBusinessDay(
-            holidays=holidays, weekmask=weekmask_egypt
-        )
-        dt = datetime.datetime(2013, 4, 30)
-        dts = date_range(dt, periods=5, freq=bday_egypt)
-
-        s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split()))
-
-        with ensure_clean_store(setup_path) as store:
-
-            store.put("fixed", s)
-            result = store.select("fixed")
-            assert_series_equal(result, s)
-
-            store.append("table", s)
-            result = store.select("table")
-            assert_series_equal(result, s)
-
-    def test_roundtrip_tz_aware_index(self, setup_path):
-        # GH 17618
-        time = pd.Timestamp("2000-01-01 01:00:00", tz="US/Eastern")
-        df = pd.DataFrame(data=[0], index=[time])
-
-        with ensure_clean_store(setup_path) as store:
-            store.put("frame", df, format="fixed")
-            recons = store["frame"]
-            tm.assert_frame_equal(recons, df)
-            assert recons.index[0].value == 946706400000000000
-
-    def test_append_with_timedelta(self, setup_path):
-        # GH 3577
-        # append timedelta
-
-        df = DataFrame(
-            dict(
-                A=Timestamp("20130101"),
-                B=[
-                    Timestamp("20130101") + timedelta(days=i, seconds=10)
-                    for i in range(10)
-                ],
-            )
-        )
-        df["C"] = df["A"] - df["B"]
-        df.loc[3:5, "C"] = np.nan
-
-        with ensure_clean_store(setup_path) as store:
-
-            # table
-            _maybe_remove(store, "df")
-            store.append("df", df, data_columns=True)
-            result = store.select("df")
-            assert_frame_equal(result, df)
-
-            result = store.select("df", where="C<100000")
-            assert_frame_equal(result, df)
-
-            result = store.select("df", where="C")
-
-        # from the docs
-        with ensure_clean_path(setup_path) as path:
-            dfq = DataFrame(
-                np.random.randn(10, 4),
-                columns=list("ABCD"),
-                index=date_range("20130101", periods=10),
-            )
-            dfq.to_hdf(path, "dfq", format="table", data_columns=True)
-
-            # check ok
-            read_hdf(
-                path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']"
-            )
-            read_hdf(path, "dfq", where="A>0 or C>0")
-
-        # catch the invalid reference
-        with ensure_clean_path(setup_path) as path:
-            dfq = DataFrame(
-                np.random.randn(10, 4),
-                columns=list("ABCD"),
-                index=date_range("20130101", periods=10),
-            )
-            dfq.to_hdf(path, "dfq", format="table")
-
-            with pytest.raises(ValueError):
-                read_hdf(path, "dfq", where="A>0 or C>0")
-
-    def test_same_name_scoping(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-
-            import pandas as pd
-
-            df = DataFrame(
-                np.random.randn(20, 2), index=pd.date_range("20130101", periods=20)
-            )
-            store.put("df", df, format="table")
-            expected = df[df.index > pd.Timestamp("20130105")]
-
-            import datetime  # noqa
-
-            result = store.select("df", "index>datetime.datetime(2013,1,5)")
-            assert_frame_equal(result, expected)
-
-            from datetime import datetime  # noqa
-
-            # technically an error, but allow it
-            result = store.select("df", "index>datetime.datetime(2013,1,5)")
-            assert_frame_equal(result, expected)
-
-            result = store.select("df", "index>datetime(2013,1,5)")
-            assert_frame_equal(result, expected)
-
-    def test_series(self, setup_path):
-
-        s = tm.makeStringSeries()
-        self._check_roundtrip(s, tm.assert_series_equal, path=setup_path)
-
-        ts = tm.makeTimeSeries()
-        self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path)
-
-        ts2 = Series(ts.index, Index(ts.index, dtype=object))
-        self._check_roundtrip(ts2, tm.assert_series_equal, path=setup_path)
-
-        ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object))
-        self._check_roundtrip(
-            ts3, tm.assert_series_equal, path=setup_path, check_index_type=False
-        )
-
-    def test_float_index(self, setup_path):
-
-        # GH #454
-        index = np.random.randn(10)
-        s = Series(np.random.randn(10), index=index)
-        self._check_roundtrip(s, tm.assert_series_equal, path=setup_path)
-
-    @xfail_non_writeable
-    def test_tuple_index(self, setup_path):
-
-        # GH #492
-        col = np.arange(10)
-        idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)]
-        data = np.random.randn(30).reshape((3, 10))
-        DF = DataFrame(data, index=idx, columns=col)
-
-        with catch_warnings(record=True):
-            simplefilter("ignore", pd.errors.PerformanceWarning)
-            self._check_roundtrip(DF, tm.assert_frame_equal, path=setup_path)
-
-    @xfail_non_writeable
-    @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
-    def test_index_types(self, setup_path):
-
-        with catch_warnings(record=True):
-            values = np.random.randn(2)
-
-            func = lambda l, r: tm.assert_series_equal(
-                l, r, check_dtype=True, check_index_type=True, check_series_type=True
-            )
-
-            with catch_warnings(record=True):
-                ser = Series(values, [0, "y"])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-            with catch_warnings(record=True):
-                ser = Series(values, [datetime.datetime.today(), 0])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-            with catch_warnings(record=True):
-                ser = Series(values, ["y", 0])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-            with catch_warnings(record=True):
-                ser = Series(values, [datetime.date.today(), "a"])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-            with catch_warnings(record=True):
-
-                ser = Series(values, [0, "y"])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-                ser = Series(values, [datetime.datetime.today(), 0])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-                ser = Series(values, ["y", 0])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-                ser = Series(values, [datetime.date.today(), "a"])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-                ser = Series(values, [1.23, "b"])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-                ser = Series(values, [1, 1.53])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-                ser = Series(values, [1, 5])
-                self._check_roundtrip(ser, func, path=setup_path)
-
-                ser = Series(
-                    values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)]
-                )
-                self._check_roundtrip(ser, func, path=setup_path)
-
-    def test_timeseries_preepoch(self, setup_path):
-
-        dr = bdate_range("1/1/1940", "1/1/1960")
-        ts = Series(np.random.randn(len(dr)), index=dr)
-        try:
-            self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path)
-        except OverflowError:
-            pytest.skip("known failure on some windows platforms")
-
-    @xfail_non_writeable
-    @pytest.mark.parametrize(
-        "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)]
-    )
-    def test_frame(self, compression, setup_path):
-
-        df = tm.makeDataFrame()
-
-        # put in some random NAs
-        df.values[0, 0] = np.nan
-        df.values[5, 3] = np.nan
-
-        self._check_roundtrip_table(
-            df, tm.assert_frame_equal, path=setup_path, compression=compression
-        )
-        self._check_roundtrip(
-            df, tm.assert_frame_equal, path=setup_path, compression=compression
-        )
-
-        tdf = tm.makeTimeDataFrame()
-        self._check_roundtrip(
-            tdf, tm.assert_frame_equal, path=setup_path, compression=compression
-        )
-
-        with ensure_clean_store(setup_path) as store:
-            # not consolidated
-            df["foo"] = np.random.randn(len(df))
-            store["df"] = df
-            recons = store["df"]
-            assert recons._data.is_consolidated()
-
-        # empty
-        self._check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path)
-
-    @xfail_non_writeable
-    def test_empty_series_frame(self, setup_path):
-        s0 = Series()
-        s1 = Series(name="myseries")
-        df0 = DataFrame()
-        df1 = DataFrame(index=["a", "b", "c"])
-        df2 = DataFrame(columns=["d", "e", "f"])
-
-        self._check_roundtrip(s0, tm.assert_series_equal, path=setup_path)
-        self._check_roundtrip(s1, tm.assert_series_equal, path=setup_path)
-        self._check_roundtrip(df0, tm.assert_frame_equal, path=setup_path)
-        self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path)
-        self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path)
-
-    @xfail_non_writeable
-    @pytest.mark.parametrize(
-        "dtype", [np.int64, np.float64, np.object, "m8[ns]", "M8[ns]"]
-    )
-    def test_empty_series(self, dtype, setup_path):
-        s = Series(dtype=dtype)
-        self._check_roundtrip(s, tm.assert_series_equal, path=setup_path)
-
-    def test_can_serialize_dates(self, setup_path):
-
-        rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")]
-        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
-
-        self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
-
-    def test_store_hierarchical(self, setup_path):
-        index = MultiIndex(
-            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
-            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
-            names=["foo", "bar"],
-        )
-        frame = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
-
-        self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
-        self._check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path)
-        self._check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path)
-
-        # check that the names are stored
-        with ensure_clean_store(setup_path) as store:
-            store["frame"] = frame
-            recons = store["frame"]
-            tm.assert_frame_equal(recons, frame)
-
-    def test_store_index_name(self, setup_path):
-        df = tm.makeDataFrame()
-        df.index.name = "foo"
-
-        with ensure_clean_store(setup_path) as store:
-            store["frame"] = df
-            recons = store["frame"]
-            tm.assert_frame_equal(recons, df)
-
-    def test_store_index_name_with_tz(self, setup_path):
-        # GH 13884
-        df = pd.DataFrame({"A": [1, 2]})
-        df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788])
-        df.index = df.index.tz_localize("UTC")
-        df.index.name = "foo"
-
-        with ensure_clean_store(setup_path) as store:
-            store.put("frame", df, format="table")
-            recons = store["frame"]
-            tm.assert_frame_equal(recons, df)
-
-    @pytest.mark.parametrize("table_format", ["table", "fixed"])
-    def test_store_index_name_numpy_str(self, table_format, setup_path):
-        # GH #13492
-        idx = pd.Index(
-            pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]),
-            name="cols\u05d2",
-        )
-        idx1 = pd.Index(
-            pd.to_datetime([datetime.date(2010, 1, 1), datetime.date(2010, 1, 2)]),
-            name="rows\u05d0",
-        )
-        df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1)
-
-        # This used to fail, returning numpy strings instead of python strings.
-        with ensure_clean_path(setup_path) as path:
-            df.to_hdf(path, "df", format=table_format)
-            df2 = read_hdf(path, "df")
-
-            assert_frame_equal(df, df2, check_names=True)
-
-            assert type(df2.index.name) == str
-            assert type(df2.columns.name) == str
-
-    def test_store_series_name(self, setup_path):
-        df = tm.makeDataFrame()
-        series = df["A"]
-
-        with ensure_clean_store(setup_path) as store:
-            store["series"] = series
-            recons = store["series"]
-            tm.assert_series_equal(recons, series)
-
-    @xfail_non_writeable
-    @pytest.mark.parametrize(
-        "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)]
-    )
-    def test_store_mixed(self, compression, setup_path):
-        def _make_one():
-            df = tm.makeDataFrame()
-            df["obj1"] = "foo"
-            df["obj2"] = "bar"
-            df["bool1"] = df["A"] > 0
-            df["bool2"] = df["B"] > 0
-            df["int1"] = 1
-            df["int2"] = 2
-            return df._consolidate()
-
-        df1 = _make_one()
-        df2 = _make_one()
-
-        self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path)
-        self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path)
-
-        with ensure_clean_store(setup_path) as store:
-            store["obj"] = df1
-            tm.assert_frame_equal(store["obj"], df1)
-            store["obj"] = df2
-            tm.assert_frame_equal(store["obj"], df2)
-
-        # check that can store Series of all of these types
-        self._check_roundtrip(
-            df1["obj1"],
-            tm.assert_series_equal,
-            path=setup_path,
-            compression=compression,
-        )
-        self._check_roundtrip(
-            df1["bool1"],
-            tm.assert_series_equal,
-            path=setup_path,
-            compression=compression,
-        )
-        self._check_roundtrip(
-            df1["int1"],
-            tm.assert_series_equal,
-            path=setup_path,
-            compression=compression,
-        )
-
-    @pytest.mark.filterwarnings(
-        "ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning"
-    )
-    def test_select_with_dups(self, setup_path):
-
-        # single dtypes
-        df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"])
-        df.index = date_range("20130101 9:30", periods=10, freq="T")
-
-        with ensure_clean_store(setup_path) as store:
-            store.append("df", df)
-
-            result = store.select("df")
-            expected = df
-            assert_frame_equal(result, expected, by_blocks=True)
-
-            result = store.select("df", columns=df.columns)
-            expected = df
-            assert_frame_equal(result, expected, by_blocks=True)
-
-            result = store.select("df", columns=["A"])
-            expected = df.loc[:, ["A"]]
-            assert_frame_equal(result, expected)
-
-        # dups across dtypes
-        df = concat(
-            [
-                DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
-                DataFrame(
-                    np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
-                ),
-            ],
-            axis=1,
-        )
-        df.index = date_range("20130101 9:30", periods=10, freq="T")
-
-        with ensure_clean_store(setup_path) as store:
-            store.append("df", df)
-
-            result = store.select("df")
-            expected = df
-            assert_frame_equal(result, expected, by_blocks=True)
-
-            result = store.select("df", columns=df.columns)
-            expected = df
-            assert_frame_equal(result, expected, by_blocks=True)
-
-            expected = df.loc[:, ["A"]]
-            result = store.select("df", columns=["A"])
-            assert_frame_equal(result, expected, by_blocks=True)
-
-            expected = df.loc[:, ["B", "A"]]
-            result = store.select("df", columns=["B", "A"])
-            assert_frame_equal(result, expected, by_blocks=True)
-
-        # duplicates on both index and columns
-        with ensure_clean_store(setup_path) as store:
-            store.append("df", df)
-            store.append("df", df)
-
-            expected = df.loc[:, ["B", "A"]]
-            expected = concat([expected, expected])
-            result = store.select("df", columns=["B", "A"])
-            assert_frame_equal(result, expected, by_blocks=True)
-
-    def test_overwrite_node(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-            store["a"] = tm.makeTimeDataFrame()
-            ts = tm.makeTimeSeries()
-            store["a"] = ts
-
-            tm.assert_series_equal(store["a"], ts)
-
-    def test_select(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-
-            with catch_warnings(record=True):
-
-                # select with columns=
-                df = tm.makeTimeDataFrame()
-                _maybe_remove(store, "df")
-                store.append("df", df)
-                result = store.select("df", columns=["A", "B"])
-                expected = df.reindex(columns=["A", "B"])
-                tm.assert_frame_equal(expected, result)
-
-                # equivalently
-                result = store.select("df", [("columns=['A', 'B']")])
-                expected = df.reindex(columns=["A", "B"])
-                tm.assert_frame_equal(expected, result)
-
-                # with a data column
-                _maybe_remove(store, "df")
-                store.append("df", df, data_columns=["A"])
-                result = store.select("df", ["A > 0"], columns=["A", "B"])
-                expected = df[df.A > 0].reindex(columns=["A", "B"])
-                tm.assert_frame_equal(expected, result)
-
-                # all as data columns
-                _maybe_remove(store, "df")
-                store.append("df", df, data_columns=True)
-                result = store.select("df", ["A > 0"], columns=["A", "B"])
-                expected = df[df.A > 0].reindex(columns=["A", "B"])
-                tm.assert_frame_equal(expected, result)
-
-                # with a data column, but different columns
-                _maybe_remove(store, "df")
-                store.append("df", df, data_columns=["A"])
-                result = store.select("df", ["A > 0"], columns=["C", "D"])
-                expected = df[df.A > 0].reindex(columns=["C", "D"])
-                tm.assert_frame_equal(expected, result)
-
-    def test_select_dtypes(self, setup_path):
-
-        with ensure_clean_store(setup_path) as store:
-            # with a Timestamp data column (GH #2637)
-            df = DataFrame(
-                dict(ts=bdate_range("2012-01-01", periods=300), A=np.random.randn(300))
-            )
-            _maybe_remove(store, "df")
-            store.append("df", df, data_columns=["ts", "A"])
-
-            result = store.select("df", "ts>=Timestamp('2012-02-01')")
-            expected = df[df.ts >= Timestamp("2012-02-01")]
-            tm.assert_frame_equal(expected, result)
-
-            # bool columns (GH #2849)
-            df = DataFrame(np.random.randn(5, 2), columns=["A", "B"])
-            df["object"] = "foo"
-            df.loc[4:5, "object"] = "bar"
-            df["boolv"] = df["A"] > 0
-            _maybe_remove(store, "df")
-            store.append("df", df, data_columns=True)
-
-            expected = df[df.boolv == True].reindex(columns=["A", "boolv"])  # noqa
-            for v in [True, "true", 1]:
-                result = store.select(
-                    "df", "boolv == {v!s}".format(v=v), columns=["A", "boolv"]
-                )
-                tm.assert_frame_equal(expected, result)
-
-            expected = df[df.boolv == False].reindex(columns=["A", "boolv"])  # noqa
-            for v in [False, "false", 0]:
-                result = store.select(
-                    "df", "boolv == {v!s}".format(v=v), columns=["A", "boolv"]
-                )
-                tm.assert_frame_equal(expected, result)
-
-            # integer index
-            df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
-            _maybe_remove(store, "df_int")
-            store.append("df_int", df)
-            result = store.select("df_int", "index<10 and columns=['A']")
-            expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
-            tm.assert_frame_equal(expected, result)
-
-            # float index
-            df = DataFrame(
-                dict(
-                    A=np.random.rand(20),
-                    B=np.random.rand(20),
-                    index=np.arange(20, dtype="f8"),
-                )
-            )
-            _maybe_remove(store, "df_float")
-            store.append("df_float", df)
-            result = store.select("df_float", "index<10.0 and columns=['A']")
-            expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
-            tm.assert_frame_equal(expected, result)
-
-        with ensure_clean_store(setup_path) as store:
-
-            # floats w/o NaN
-            df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64")
values=range(11)), dtype="float64") - df["cols"] = (df["cols"] + 10).apply(str) - - store.append("df1", df, data_columns=True) - result = store.select("df1", where="values>2.0") - expected = df[df["values"] > 2.0] - tm.assert_frame_equal(expected, result) - - # floats with NaN - df.iloc[0] = np.nan - expected = df[df["values"] > 2.0] - - store.append("df2", df, data_columns=True, index=False) - result = store.select("df2", where="values>2.0") - tm.assert_frame_equal(expected, result) - - # https://github.com/PyTables/PyTables/issues/282 - # bug in selection when 0th row has a np.nan and an index - # store.append('df3',df,data_columns=True) - # result = store.select( - # 'df3', where='values>2.0') - # tm.assert_frame_equal(expected, result) - - # not in first position float with NaN ok too - df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") - df["cols"] = (df["cols"] + 10).apply(str) - - df.iloc[1] = np.nan - expected = df[df["values"] > 2.0] - - store.append("df4", df, data_columns=True) - result = store.select("df4", where="values>2.0") - tm.assert_frame_equal(expected, result) - - # test selection with comparison against numpy scalar - # GH 11283 - with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() - - expected = df[df["A"] > 0] - - store.append("df", df, data_columns=True) - np_zero = np.float64(0) # noqa - result = store.select("df", where=["A>np_zero"]) - tm.assert_frame_equal(expected, result) - - def test_select_with_many_inputs(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - df = DataFrame( - dict( - ts=bdate_range("2012-01-01", periods=300), - A=np.random.randn(300), - B=range(300), - users=["a"] * 50 - + ["b"] * 50 - + ["c"] * 100 - + ["a{i:03d}".format(i=i) for i in range(100)], - ) - ) - _maybe_remove(store, "df") - store.append("df", df, data_columns=["ts", "A", "B", "users"]) - - # regular select - result = store.select("df", "ts>=Timestamp('2012-02-01')") - expected = df[df.ts >= Timestamp("2012-02-01")] - tm.assert_frame_equal(expected, result) - - # small selector - result = store.select( - "df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']" - ) - expected = df[ - (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"]) - ] - tm.assert_frame_equal(expected, result) - - # big selector along the columns - selector = ["a", "b", "c"] + ["a{i:03d}".format(i=i) for i in range(60)] - result = store.select( - "df", "ts>=Timestamp('2012-02-01') and users=selector" - ) - expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] - tm.assert_frame_equal(expected, result) - - selector = range(100, 200) - result = store.select("df", "B=selector") - expected = df[df.B.isin(selector)] - tm.assert_frame_equal(expected, result) - assert len(result) == 100 - - # big selector along the index - selector = Index(df.ts[0:100].values) - result = store.select("df", "ts=selector") - expected = df[df.ts.isin(selector.values)] - tm.assert_frame_equal(expected, result) - assert len(result) == 100 - - def test_select_iterator(self, setup_path): - - # single table - with ensure_clean_store(setup_path) as store: - - df = tm.makeTimeDataFrame(500) - _maybe_remove(store, "df") - store.append("df", df) - - expected = store.select("df") - - results = [s for s in store.select("df", iterator=True)] - result = concat(results) - tm.assert_frame_equal(expected, result) - - results = [s for s in store.select("df", chunksize=100)] - assert len(results) == 5 - result = concat(results) - 
-            tm.assert_frame_equal(expected, result)
-
-            results = [s for s in store.select("df", chunksize=150)]
-            result = concat(results)
-            tm.assert_frame_equal(result, expected)
-
-        with ensure_clean_path(setup_path) as path:
-
-            df = tm.makeTimeDataFrame(500)
-            df.to_hdf(path, "df_non_table")
-
-            with pytest.raises(TypeError):
-                read_hdf(path, "df_non_table", chunksize=100)
-
-            with pytest.raises(TypeError):
-                read_hdf(path, "df_non_table", iterator=True)
-
-        with ensure_clean_path(setup_path) as path:
-
-            df = tm.makeTimeDataFrame(500)
-            df.to_hdf(path, "df", format="table")
-
-            results = [s for s in read_hdf(path, "df", chunksize=100)]
-            result = concat(results)
-
-            assert len(results) == 5
-            tm.assert_frame_equal(result, df)
-            tm.assert_frame_equal(result, read_hdf(path, "df"))
-
-        # multiple
-
-        with ensure_clean_store(setup_path) as store:
-
-            df1 = tm.makeTimeDataFrame(500)
-            store.append("df1", df1, data_columns=True)
-            df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format)
-            df2["foo"] = "bar"
-            store.append("df2", df2)
-
-            df = concat([df1, df2], axis=1)
-
-            # full selection
-            expected = store.select_as_multiple(["df1", "df2"], selector="df1")
-            results = [
-                s
-                for s in store.select_as_multiple(
-                    ["df1", "df2"], selector="df1", chunksize=150
-                )
-            ]
-            result = concat(results)
-            tm.assert_frame_equal(expected, result)
-
-    def test_select_iterator_complete_8014(self, setup_path):
-
-        # GH 8014
-        # using iterator and where clause
-        chunksize = 1e4
-
-        # no iterator
-        with ensure_clean_store(setup_path) as store:
-
-            expected = tm.makeTimeDataFrame(100064, "S")
-            _maybe_remove(store, "df")
-            store.append("df", expected)
-
-            beg_dt = expected.index[0]
-            end_dt = expected.index[-1]
-
-            # select w/o iteration and no where clause works
-            result = store.select("df")
-            tm.assert_frame_equal(expected, result)
-
-            # select w/o iterator and where clause, single term, begin
-            # of range, works
-            where = "index >= '{beg_dt}'".format(beg_dt=beg_dt)
-            result = store.select("df", where=where)
-            tm.assert_frame_equal(expected, result)
-
-            # select w/o iterator and where clause, single term, end
-            # of range, works
-            where = "index <= '{end_dt}'".format(end_dt=end_dt)
-            result = store.select("df", where=where)
-            tm.assert_frame_equal(expected, result)
-
-            # select w/o iterator and where clause, inclusive range,
-            # works
-            where = "index >= '{beg_dt}' & index <= '{end_dt}'".format(
-                beg_dt=beg_dt, end_dt=end_dt
-            )
-            result = store.select("df", where=where)
-            tm.assert_frame_equal(expected, result)
-
-        # with iterator, full range
-        with ensure_clean_store(setup_path) as store:
-
-            expected = tm.makeTimeDataFrame(100064, "S")
-            _maybe_remove(store, "df")
-            store.append("df", expected)
-
-            beg_dt = expected.index[0]
-            end_dt = expected.index[-1]
-
-            # select w/iterator and no where clause works
-            results = [s for s in store.select("df", chunksize=chunksize)]
-            result = concat(results)
-            tm.assert_frame_equal(expected, result)
-
-            # select w/iterator and where clause, single term, begin of range
-            where = "index >= '{beg_dt}'".format(beg_dt=beg_dt)
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-            result = concat(results)
-            tm.assert_frame_equal(expected, result)
-
-            # select w/iterator and where clause, single term, end of range
-            where = "index <= '{end_dt}'".format(end_dt=end_dt)
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-            result = concat(results)
-            tm.assert_frame_equal(expected, result)
-
-            # select w/iterator and where clause, inclusive range
-            where = "index >= '{beg_dt}' & index <= '{end_dt}'".format(
-                beg_dt=beg_dt, end_dt=end_dt
-            )
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-            result = concat(results)
-            tm.assert_frame_equal(expected, result)
-
-    def test_select_iterator_non_complete_8014(self, setup_path):
-
-        # GH 8014
-        # using iterator and where clause
-        chunksize = 1e4
-
-        # with iterator, non complete range
-        with ensure_clean_store(setup_path) as store:
-
-            expected = tm.makeTimeDataFrame(100064, "S")
-            _maybe_remove(store, "df")
-            store.append("df", expected)
-
-            beg_dt = expected.index[1]
-            end_dt = expected.index[-2]
-
-            # select w/iterator and where clause, single term, begin of range
-            where = "index >= '{beg_dt}'".format(beg_dt=beg_dt)
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-            result = concat(results)
-            rexpected = expected[expected.index >= beg_dt]
-            tm.assert_frame_equal(rexpected, result)
-
-            # select w/iterator and where clause, single term, end of range
-            where = "index <= '{end_dt}'".format(end_dt=end_dt)
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-            result = concat(results)
-            rexpected = expected[expected.index <= end_dt]
-            tm.assert_frame_equal(rexpected, result)
-
-            # select w/iterator and where clause, inclusive range
-            where = "index >= '{beg_dt}' & index <= '{end_dt}'".format(
-                beg_dt=beg_dt, end_dt=end_dt
-            )
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-            result = concat(results)
-            rexpected = expected[
-                (expected.index >= beg_dt) & (expected.index <= end_dt)
-            ]
-            tm.assert_frame_equal(rexpected, result)
-
-        # with iterator, empty where
-        with ensure_clean_store(setup_path) as store:
-
-            expected = tm.makeTimeDataFrame(100064, "S")
-            _maybe_remove(store, "df")
-            store.append("df", expected)
-
-            end_dt = expected.index[-1]
-
-            # select w/iterator and where clause, single term, begin of range
-            where = "index > '{end_dt}'".format(end_dt=end_dt)
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-            assert 0 == len(results)
-
-    def test_select_iterator_many_empty_frames(self, setup_path):
-
-        # GH 8014
-        # using iterator and where clause can return many empty
-        # frames.
-        chunksize = int(1e4)
-
-        # with iterator, range limited to the first chunk
-        with ensure_clean_store(setup_path) as store:
-
-            expected = tm.makeTimeDataFrame(100000, "S")
-            _maybe_remove(store, "df")
-            store.append("df", expected)
-
-            beg_dt = expected.index[0]
-            end_dt = expected.index[chunksize - 1]
-
-            # select w/iterator and where clause, single term, begin of range
-            where = "index >= '{beg_dt}'".format(beg_dt=beg_dt)
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-            result = concat(results)
-            rexpected = expected[expected.index >= beg_dt]
-            tm.assert_frame_equal(rexpected, result)
-
-            # select w/iterator and where clause, single term, end of range
-            where = "index <= '{end_dt}'".format(end_dt=end_dt)
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-
-            assert len(results) == 1
-            result = concat(results)
-            rexpected = expected[expected.index <= end_dt]
-            tm.assert_frame_equal(rexpected, result)
-
-            # select w/iterator and where clause, inclusive range
-            where = "index >= '{beg_dt}' & index <= '{end_dt}'".format(
-                beg_dt=beg_dt, end_dt=end_dt
-            )
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-
-            # should be 1, is 10
-            assert len(results) == 1
-            result = concat(results)
-            rexpected = expected[
-                (expected.index >= beg_dt) & (expected.index <= end_dt)
-            ]
-            tm.assert_frame_equal(rexpected, result)
-
-            # select w/iterator and where clause which selects
-            # *nothing*.
-            #
-            # To be consistent with Python idiom I suggest this should
-            # return [] e.g. `for e in []: print True` never prints
-            # True.
-
-            where = "index <= '{beg_dt}' & index >= '{end_dt}'".format(
-                beg_dt=beg_dt, end_dt=end_dt
-            )
-            results = [s for s in store.select("df", where=where, chunksize=chunksize)]
-
-            # should be []
-            assert len(results) == 0
-
-    @pytest.mark.filterwarnings(
-        "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning"
-    )
-    def test_retain_index_attributes(self, setup_path):
-
-        # GH 3499, losing frequency info on index recreation
-        df = DataFrame(
-            dict(A=Series(range(3), index=date_range("2000-1-1", periods=3, freq="H")))
-        )
-
-        with ensure_clean_store(setup_path) as store:
-            _maybe_remove(store, "data")
-            store.put("data", df, format="table")
-
-            result = store.get("data")
-            tm.assert_frame_equal(df, result)
-
-            for attr in ["freq", "tz", "name"]:
-                for idx in ["index", "columns"]:
-                    assert getattr(getattr(df, idx), attr, None) == getattr(
-                        getattr(result, idx), attr, None
-                    )
-
-            # try to append a table with a different frequency
-            with catch_warnings(record=True):
-                df2 = DataFrame(
-                    dict(
-                        A=Series(
-                            range(3), index=date_range("2002-1-1", periods=3, freq="D")
-                        )
-                    )
-                )
-                store.append("data", df2)
-
-            assert store.get_storer("data").info["index"]["freq"] is None
-
-            # this is ok
-            _maybe_remove(store, "df2")
-            df2 = DataFrame(
-                dict(
-                    A=Series(
-                        range(3),
-                        index=[
-                            Timestamp("20010101"),
-                            Timestamp("20010102"),
-                            Timestamp("20020101"),
-                        ],
-                    )
-                )
-            )
-            store.append("df2", df2)
-            df3 = DataFrame(
-                dict(
-                    A=Series(
-                        range(3), index=date_range("2002-1-1", periods=3, freq="D")
-                    )
-                )
-            )
-            store.append("df2", df3)
-
-    @pytest.mark.filterwarnings(
-        "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning"
-    )
-    def test_retain_index_attributes2(self, setup_path):
-        with ensure_clean_path(setup_path) as path:
-
-            with catch_warnings(record=True):
-
-                df = DataFrame(
-                    dict(
-                        A=Series(
-                            range(3), index=date_range("2000-1-1", periods=3, freq="H")
-                        )
-                    )
-                )
-                df.to_hdf(path, "data", mode="w", append=True)
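-                # note: the appends below mix H and D frequencies and,
-                # later, conflicting index names; on conflict the stored
-                # attribute is dropped (AttributeConflictWarning) rather
-                # than raising, as the index.name assertions below show.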
"data", mode="w", append=True) - df2 = DataFrame( - dict( - A=Series( - range(3), index=date_range("2002-1-1", periods=3, freq="D") - ) - ) - ) - df2.to_hdf(path, "data", append=True) - - idx = date_range("2000-1-1", periods=3, freq="H") - idx.name = "foo" - df = DataFrame(dict(A=Series(range(3), index=idx))) - df.to_hdf(path, "data", mode="w", append=True) - - assert read_hdf(path, "data").index.name == "foo" - - with catch_warnings(record=True): - - idx2 = date_range("2001-1-1", periods=3, freq="H") - idx2.name = "bar" - df2 = DataFrame(dict(A=Series(range(3), index=idx2))) - df2.to_hdf(path, "data", append=True) - - assert read_hdf(path, "data").index.name is None - - def test_frame_select(self, setup_path): - - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - store.put("frame", df, format="table") - date = df.index[len(df) // 2] - - crit1 = Term("index>=date") - assert crit1.env.scope["date"] == date - - crit2 = "columns=['A', 'D']" - crit3 = "columns=A" - - result = store.select("frame", [crit1, crit2]) - expected = df.loc[date:, ["A", "D"]] - tm.assert_frame_equal(result, expected) - - result = store.select("frame", [crit3]) - expected = df.loc[:, ["A"]] - tm.assert_frame_equal(result, expected) - - # invalid terms - df = tm.makeTimeDataFrame() - store.append("df_time", df) - with pytest.raises(ValueError): - store.select("df_time", "index>0") - - # can't select if not written as table - # store['frame'] = df - # with pytest.raises(ValueError): - # store.select('frame', [crit1, crit2]) - - def test_frame_select_complex(self, setup_path): - # select via complex criteria - - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df.loc[df.index[0:4], "string"] = "bar" - - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table", data_columns=["string"]) - - # empty - result = store.select("df", 'index>df.index[3] & string="bar"') - expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")] - tm.assert_frame_equal(result, expected) - - result = store.select("df", 'index>df.index[3] & string="foo"') - expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")] - tm.assert_frame_equal(result, expected) - - # or - result = store.select("df", 'index>df.index[3] | string="bar"') - expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")] - tm.assert_frame_equal(result, expected) - - result = store.select( - "df", "(index>df.index[3] & " 'index<=df.index[6]) | string="bar"' - ) - expected = df.loc[ - ((df.index > df.index[3]) & (df.index <= df.index[6])) - | (df.string == "bar") - ] - tm.assert_frame_equal(result, expected) - - # invert - result = store.select("df", 'string!="bar"') - expected = df.loc[df.string != "bar"] - tm.assert_frame_equal(result, expected) - - # invert not implemented in numexpr :( - with pytest.raises(NotImplementedError): - store.select("df", '~(string="bar")') - - # invert ok for filters - result = store.select("df", "~(columns=['A','B'])") - expected = df.loc[:, df.columns.difference(["A", "B"])] - tm.assert_frame_equal(result, expected) - - # in - result = store.select("df", "index>df.index[3] & columns in ['A','B']") - expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - def test_frame_select_complex2(self, setup_path): - - with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths: - - pp, hh = paths - - # use non-trivial selection criteria - parms = DataFrame({"A": [1, 1, 2, 2, 3]}) - parms.to_hdf(pp, "df", mode="w", 
format="table", data_columns=["A"]) - - selection = read_hdf(pp, "df", where="A=[2,3]") - hist = DataFrame( - np.random.randn(25, 1), - columns=["data"], - index=MultiIndex.from_tuples( - [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"] - ), - ) - - hist.to_hdf(hh, "df", mode="w", format="table") - - expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") - - # scope with list like - l = selection.index.tolist() # noqa - store = HDFStore(hh) - result = store.select("df", where="l1=l") - assert_frame_equal(result, expected) - store.close() - - result = read_hdf(hh, "df", where="l1=l") - assert_frame_equal(result, expected) - - # index - index = selection.index # noqa - result = read_hdf(hh, "df", where="l1=index") - assert_frame_equal(result, expected) - - result = read_hdf(hh, "df", where="l1=selection.index") - assert_frame_equal(result, expected) - - result = read_hdf(hh, "df", where="l1=selection.index.tolist()") - assert_frame_equal(result, expected) - - result = read_hdf(hh, "df", where="l1=list(selection.index)") - assert_frame_equal(result, expected) - - # scope with index - store = HDFStore(hh) - - result = store.select("df", where="l1=index") - assert_frame_equal(result, expected) - - result = store.select("df", where="l1=selection.index") - assert_frame_equal(result, expected) - - result = store.select("df", where="l1=selection.index.tolist()") - assert_frame_equal(result, expected) - - result = store.select("df", where="l1=list(selection.index)") - assert_frame_equal(result, expected) - - store.close() - - def test_invalid_filtering(self, setup_path): - - # can't use more than one filter (atm) - - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table") - - # not implemented - with pytest.raises(NotImplementedError): - store.select("df", "columns=['A'] | columns=['B']") - - # in theory we could deal with this - with pytest.raises(NotImplementedError): - store.select("df", "columns=['A','B'] & columns=['C']") - - def test_string_select(self, setup_path): - # GH 2973 - with ensure_clean_store(setup_path) as store: - - df = tm.makeTimeDataFrame() - - # test string ==/!= - df["x"] = "none" - df.loc[2:7, "x"] = "" - - store.append("df", df, data_columns=["x"]) - - result = store.select("df", "x=none") - expected = df[df.x == "none"] - assert_frame_equal(result, expected) - - result = store.select("df", "x!=none") - expected = df[df.x != "none"] - assert_frame_equal(result, expected) - - df2 = df.copy() - df2.loc[df2.x == "", "x"] = np.nan - - store.append("df2", df2, data_columns=["x"]) - result = store.select("df2", "x!=none") - expected = df2[isna(df2.x)] - assert_frame_equal(result, expected) - - # int ==/!= - df["int"] = 1 - df.loc[2:7, "int"] = 2 - - store.append("df3", df, data_columns=["int"]) - - result = store.select("df3", "int=2") - expected = df[df.int == 2] - assert_frame_equal(result, expected) - - result = store.select("df3", "int!=2") - expected = df[df.int != 2] - assert_frame_equal(result, expected) - - def test_read_column(self, setup_path): - - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df") - - # GH 17912 - # HDFStore.select_column should raise a KeyError - # exception if the key is not a valid store - with pytest.raises(KeyError, match="No object named df in the file"): - store.select_column("df", "index") - - store.append("df", df) - # error - with pytest.raises( - KeyError, match=re.escape("'column [foo] not found in the table'") - 
): - store.select_column("df", "foo") - - with pytest.raises(Exception): - store.select_column("df", "index", where=["index>5"]) - - # valid - result = store.select_column("df", "index") - tm.assert_almost_equal(result.values, Series(df.index).values) - assert isinstance(result, Series) - - # not a data indexable column - with pytest.raises(ValueError): - store.select_column("df", "values_block_0") - - # a data column - df2 = df.copy() - df2["string"] = "foo" - store.append("df2", df2, data_columns=["string"]) - result = store.select_column("df2", "string") - tm.assert_almost_equal(result.values, df2["string"].values) - - # a data column with NaNs, result excludes the NaNs - df3 = df.copy() - df3["string"] = "foo" - df3.loc[4:6, "string"] = np.nan - store.append("df3", df3, data_columns=["string"]) - result = store.select_column("df3", "string") - tm.assert_almost_equal(result.values, df3["string"].values) - - # start/stop - result = store.select_column("df3", "string", start=2) - tm.assert_almost_equal(result.values, df3["string"].values[2:]) - - result = store.select_column("df3", "string", start=-2) - tm.assert_almost_equal(result.values, df3["string"].values[-2:]) - - result = store.select_column("df3", "string", stop=2) - tm.assert_almost_equal(result.values, df3["string"].values[:2]) - - result = store.select_column("df3", "string", stop=-2) - tm.assert_almost_equal(result.values, df3["string"].values[:-2]) - - result = store.select_column("df3", "string", start=2, stop=-2) - tm.assert_almost_equal(result.values, df3["string"].values[2:-2]) - - result = store.select_column("df3", "string", start=-2, stop=2) - tm.assert_almost_equal(result.values, df3["string"].values[-2:2]) - - # GH 10392 - make sure column name is preserved - df4 = DataFrame({"A": np.random.randn(10), "B": "foo"}) - store.append("df4", df4, data_columns=True) - expected = df4["B"] - result = store.select_column("df4", "B") - tm.assert_series_equal(result, expected) - - def test_coordinates(self, setup_path): - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - - _maybe_remove(store, "df") - store.append("df", df) - - # all - c = store.select_as_coordinates("df") - assert (c.values == np.arange(len(df.index))).all() - - # get coordinates back & test vs frame - _maybe_remove(store, "df") - - df = DataFrame(dict(A=range(5), B=range(5))) - store.append("df", df) - c = store.select_as_coordinates("df", ["index<3"]) - assert (c.values == np.arange(3)).all() - result = store.select("df", where=c) - expected = df.loc[0:2, :] - tm.assert_frame_equal(result, expected) - - c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) - assert (c.values == np.arange(2) + 3).all() - result = store.select("df", where=c) - expected = df.loc[3:4, :] - tm.assert_frame_equal(result, expected) - assert isinstance(c, Index) - - # multiple tables - _maybe_remove(store, "df1") - _maybe_remove(store, "df2") - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - store.append("df1", df1, data_columns=["A", "B"]) - store.append("df2", df2) - - c = store.select_as_coordinates("df1", ["A>0", "B>0"]) - df1_result = store.select("df1", c) - df2_result = store.select("df2", c) - result = concat([df1_result, df2_result], axis=1) - - expected = concat([df1, df2], axis=1) - expected = expected[(expected.A > 0) & (expected.B > 0)] - tm.assert_frame_equal(result, expected) - - # pass array/mask as the coordinates - with ensure_clean_store(setup_path) as store: - - df = DataFrame( - 
np.random.randn(1000, 2), index=date_range("20000101", periods=1000) - ) - store.append("df", df) - c = store.select_column("df", "index") - where = c[DatetimeIndex(c).month == 5].index - expected = df.iloc[where] - - # locations - result = store.select("df", where=where) - tm.assert_frame_equal(result, expected) - - # boolean - result = store.select("df", where=where) - tm.assert_frame_equal(result, expected) - - # invalid - with pytest.raises(ValueError): - store.select("df", where=np.arange(len(df), dtype="float64")) - - with pytest.raises(ValueError): - store.select("df", where=np.arange(len(df) + 1)) - - with pytest.raises(ValueError): - store.select("df", where=np.arange(len(df)), start=5) - - with pytest.raises(ValueError): - store.select("df", where=np.arange(len(df)), start=5, stop=10) - - # selection with filter - selection = date_range("20000101", periods=500) - result = store.select("df", where="index in selection") - expected = df[df.index.isin(selection)] - tm.assert_frame_equal(result, expected) - - # list - df = DataFrame(np.random.randn(10, 2)) - store.append("df2", df) - result = store.select("df2", where=[0, 3, 5]) - expected = df.iloc[[0, 3, 5]] - tm.assert_frame_equal(result, expected) - - # boolean - where = [True] * 10 - where[-2] = False - result = store.select("df2", where=where) - expected = df.loc[where] - tm.assert_frame_equal(result, expected) - - # start/stop - result = store.select("df2", start=5, stop=10) - expected = df[5:10] - tm.assert_frame_equal(result, expected) - - def test_append_to_multiple(self, setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df2["foo"] = "bar" - df = concat([df1, df2], axis=1) - - with ensure_clean_store(setup_path) as store: - - # exceptions - with pytest.raises(ValueError): - store.append_to_multiple( - {"df1": ["A", "B"], "df2": None}, df, selector="df3" - ) - - with pytest.raises(ValueError): - store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3") - - with pytest.raises(ValueError): - store.append_to_multiple("df1", df, "df1") - - # regular operation - store.append_to_multiple( - {"df1": ["A", "B"], "df2": None}, df, selector="df1" - ) - result = store.select_as_multiple( - ["df1", "df2"], where=["A>0", "B>0"], selector="df1" - ) - expected = df[(df.A > 0) & (df.B > 0)] - tm.assert_frame_equal(result, expected) - - def test_append_to_multiple_dropna(self, setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan - df = concat([df1, df2], axis=1) - - with ensure_clean_store(setup_path) as store: - - # dropna=True should guarantee rows are synchronized - store.append_to_multiple( - {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True - ) - result = store.select_as_multiple(["df1", "df2"]) - expected = df.dropna() - tm.assert_frame_equal(result, expected) - tm.assert_index_equal(store.select("df1").index, store.select("df2").index) - - @pytest.mark.xfail( - run=False, reason="append_to_multiple_dropna_false is not raising as failed" - ) - def test_append_to_multiple_dropna_false(self, setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan - df = concat([df1, df2], axis=1) - - with ensure_clean_store(setup_path) as store: - - # dropna=False shouldn't synchronize row indexes - store.append_to_multiple( - {"df1a": ["A", "B"], 
"df2a": None}, df, selector="df1a", dropna=False - ) - - with pytest.raises(ValueError): - store.select_as_multiple(["df1a", "df2a"]) - - assert not store.select("df1a").index.equals(store.select("df2a").index) - - def test_select_as_multiple(self, setup_path): - - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df2["foo"] = "bar" - - with ensure_clean_store(setup_path) as store: - - # no tables stored - with pytest.raises(Exception): - store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") - - store.append("df1", df1, data_columns=["A", "B"]) - store.append("df2", df2) - - # exceptions - with pytest.raises(Exception): - store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") - - with pytest.raises(Exception): - store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") - - msg = "'No object named df3 in the file'" - with pytest.raises(KeyError, match=msg): - store.select_as_multiple( - ["df1", "df3"], where=["A>0", "B>0"], selector="df1" - ) - - with pytest.raises(KeyError, match=msg): - store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") - - with pytest.raises(KeyError, match="'No object named df4 in the file'"): - store.select_as_multiple( - ["df1", "df2"], where=["A>0", "B>0"], selector="df4" - ) - - # default select - result = store.select("df1", ["A>0", "B>0"]) - expected = store.select_as_multiple( - ["df1"], where=["A>0", "B>0"], selector="df1" - ) - tm.assert_frame_equal(result, expected) - expected = store.select_as_multiple( - "df1", where=["A>0", "B>0"], selector="df1" - ) - tm.assert_frame_equal(result, expected) - - # multiple - result = store.select_as_multiple( - ["df1", "df2"], where=["A>0", "B>0"], selector="df1" - ) - expected = concat([df1, df2], axis=1) - expected = expected[(expected.A > 0) & (expected.B > 0)] - tm.assert_frame_equal(result, expected) - - # multiple (diff selector) - result = store.select_as_multiple( - ["df1", "df2"], where="index>df2.index[4]", selector="df2" - ) - expected = concat([df1, df2], axis=1) - expected = expected[5:] - tm.assert_frame_equal(result, expected) - - # test exception for diff rows - store.append("df3", tm.makeTimeDataFrame(nper=50)) - with pytest.raises(ValueError): - store.select_as_multiple( - ["df1", "df3"], where=["A>0", "B>0"], selector="df1" - ) - - @pytest.mark.skipif( - LooseVersion(tables.__version__) < LooseVersion("3.1.0"), - reason=("tables version does not support fix for nan selection bug: GH 4858"), - ) - def test_nan_selection_bug_4858(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - df = DataFrame(dict(cols=range(6), values=range(6)), dtype="float64") - df["cols"] = (df["cols"] + 10).apply(str) - df.iloc[0] = np.nan - - expected = DataFrame( - dict(cols=["13.0", "14.0", "15.0"], values=[3.0, 4.0, 5.0]), - index=[3, 4, 5], - ) - - # write w/o the index on that particular column - store.append("df", df, data_columns=True, index=["cols"]) - result = store.select("df", where="values>2.0") - assert_frame_equal(result, expected) - - def test_start_stop_table(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - # table - df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) - store.append("df", df) - - result = store.select("df", "columns=['A']", start=0, stop=5) - expected = df.loc[0:4, ["A"]] - tm.assert_frame_equal(result, expected) - - # out of range - result = store.select("df", "columns=['A']", start=30, stop=40) - assert len(result) == 0 - expected = 
df.loc[30:40, ["A"]] - tm.assert_frame_equal(result, expected) - - def test_start_stop_multiple(self, setup_path): - - # GH 16209 - with ensure_clean_store(setup_path) as store: - - df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) - - store.append_to_multiple( - {"selector": ["foo"], "data": None}, df, selector="selector" - ) - result = store.select_as_multiple( - ["selector", "data"], selector="selector", start=0, stop=1 - ) - expected = df.loc[[0], ["foo", "bar"]] - tm.assert_frame_equal(result, expected) - - def test_start_stop_fixed(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - # fixed, GH 8287 - df = DataFrame( - dict(A=np.random.rand(20), B=np.random.rand(20)), - index=pd.date_range("20130101", periods=20), - ) - store.put("df", df) - - result = store.select("df", start=0, stop=5) - expected = df.iloc[0:5, :] - tm.assert_frame_equal(result, expected) - - result = store.select("df", start=5, stop=10) - expected = df.iloc[5:10, :] - tm.assert_frame_equal(result, expected) - - # out of range - result = store.select("df", start=30, stop=40) - expected = df.iloc[30:40, :] - tm.assert_frame_equal(result, expected) - - # series - s = df.A - store.put("s", s) - result = store.select("s", start=0, stop=5) - expected = s.iloc[0:5] - tm.assert_series_equal(result, expected) - - result = store.select("s", start=5, stop=10) - expected = s.iloc[5:10] - tm.assert_series_equal(result, expected) - - # sparse; not implemented - df = tm.makeDataFrame() - df.iloc[3:5, 1:3] = np.nan - df.iloc[8:10, -2] = np.nan - - def test_select_filter_corner(self, setup_path): - - df = DataFrame(np.random.randn(50, 100)) - df.index = ["{c:3d}".format(c=c) for c in df.index] - df.columns = ["{c:3d}".format(c=c) for c in df.columns] - - with ensure_clean_store(setup_path) as store: - store.put("frame", df, format="table") - - crit = "columns=df.columns[:75]" - result = store.select("frame", [crit]) - tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) - - crit = "columns=df.columns[:75:2]" - result = store.select("frame", [crit]) - tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) - - def test_path_pathlib(self, setup_path): - df = tm.makeDataFrame() - - result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") - ) - tm.assert_frame_equal(df, result) - - @pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)]) - def test_contiguous_mixed_data_table(self, start, stop, setup_path): - # GH 17021 - # ValueError when reading a contiguous mixed-data table ft. 
VLArray - df = DataFrame( - { - "a": Series([20111010, 20111011, 20111012]), - "b": Series(["ab", "cd", "ab"]), - } - ) - - with ensure_clean_store(setup_path) as store: - store.append("test_dataset", df) - - result = store.select("test_dataset", start=start, stop=stop) - assert_frame_equal(df[start:stop], result) - - def test_path_pathlib_hdfstore(self, setup_path): - df = tm.makeDataFrame() - - def writer(path): - with pd.HDFStore(path) as store: - df.to_hdf(store, "df") - - def reader(path): - with pd.HDFStore(path) as store: - return pd.read_hdf(store, "df") - - result = tm.round_trip_pathlib(writer, reader) - tm.assert_frame_equal(df, result) - - def test_pickle_path_localpath(self, setup_path): - df = tm.makeDataFrame() - result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") - ) - tm.assert_frame_equal(df, result) - - def test_path_localpath_hdfstore(self, setup_path): - df = tm.makeDataFrame() - - def writer(path): - with pd.HDFStore(path) as store: - df.to_hdf(store, "df") - - def reader(path): - with pd.HDFStore(path) as store: - return pd.read_hdf(store, "df") - - result = tm.round_trip_localpath(writer, reader) - tm.assert_frame_equal(df, result) - - def _check_roundtrip(self, obj, comparator, path, compression=False, **kwargs): - - options = {} - if compression: - options["complib"] = _default_compressor - - with ensure_clean_store(path, "w", **options) as store: - store["obj"] = obj - retrieved = store["obj"] - comparator(retrieved, obj, **kwargs) - - def _check_double_roundtrip( - self, obj, comparator, path, compression=False, **kwargs - ): - options = {} - if compression: - options["complib"] = compression or _default_compressor - - with ensure_clean_store(path, "w", **options) as store: - store["obj"] = obj - retrieved = store["obj"] - comparator(retrieved, obj, **kwargs) - store["obj"] = retrieved - again = store["obj"] - comparator(again, obj, **kwargs) - - def _check_roundtrip_table(self, obj, comparator, path, compression=False): - options = {} - if compression: - options["complib"] = _default_compressor - - with ensure_clean_store(path, "w", **options) as store: - store.put("obj", obj, format="table") - retrieved = store["obj"] - - comparator(retrieved, obj) - - def test_multiple_open_close(self, setup_path): - # gh-4409: open & close multiple times - - with ensure_clean_path(setup_path) as path: - - df = tm.makeDataFrame() - df.to_hdf(path, "df", mode="w", format="table") - - # single - store = HDFStore(path) - assert "CLOSED" not in store.info() - assert store.is_open - - store.close() - assert "CLOSED" in store.info() - assert not store.is_open - - with ensure_clean_path(setup_path) as path: - - if pytables._table_file_open_policy_is_strict: - - # multiples - store1 = HDFStore(path) - - with pytest.raises(ValueError): - HDFStore(path) - - store1.close() - else: - - # multiples - store1 = HDFStore(path) - store2 = HDFStore(path) - - assert "CLOSED" not in store1.info() - assert "CLOSED" not in store2.info() - assert store1.is_open - assert store2.is_open - - store1.close() - assert "CLOSED" in store1.info() - assert not store1.is_open - assert "CLOSED" not in store2.info() - assert store2.is_open - - store2.close() - assert "CLOSED" in store1.info() - assert "CLOSED" in store2.info() - assert not store1.is_open - assert not store2.is_open - - # nested close - store = HDFStore(path, mode="w") - store.append("df", df) - - store2 = HDFStore(path) - store2.append("df2", df) - store2.close() - assert "CLOSED" in store2.info() - 
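A condensed sketch of the open/close semantics these assertions walk through (the path is illustrative; whether two handles may be open on one file at once depends on PyTables' file-open policy, as the test branches on above):

    from pandas import HDFStore

    store = HDFStore("tmp.h5", mode="a")
    assert store.is_open            # info() does not yet report "CLOSED"
    store.close()
    assert not store.is_open        # info() now reports "CLOSED"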
assert not store2.is_open - - store.close() - assert "CLOSED" in store.info() - assert not store.is_open - - # double closing - store = HDFStore(path, mode="w") - store.append("df", df) - - store2 = HDFStore(path) - store.close() - assert "CLOSED" in store.info() - assert not store.is_open - - store2.close() - assert "CLOSED" in store2.info() - assert not store2.is_open - - # ops on a closed store - with ensure_clean_path(setup_path) as path: - - df = tm.makeDataFrame() - df.to_hdf(path, "df", mode="w", format="table") - - store = HDFStore(path) - store.close() - - with pytest.raises(ClosedFileError): - store.keys() - - with pytest.raises(ClosedFileError): - "df" in store - - with pytest.raises(ClosedFileError): - len(store) - - with pytest.raises(ClosedFileError): - store["df"] - - with pytest.raises(AttributeError): - store.df - - with pytest.raises(ClosedFileError): - store.select("df") - - with pytest.raises(ClosedFileError): - store.get("df") - - with pytest.raises(ClosedFileError): - store.append("df2", df) - - with pytest.raises(ClosedFileError): - store.put("df3", df) - - with pytest.raises(ClosedFileError): - store.get_storer("df2") - - with pytest.raises(ClosedFileError): - store.remove("df2") - - with pytest.raises(ClosedFileError, match="file is not open"): - store.select("df") - - def test_pytables_native_read(self, datapath, setup_path): - with ensure_clean_store( - datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" - ) as store: - d2 = store["detector/readout"] - assert isinstance(d2, DataFrame) - - @pytest.mark.skipif( - is_platform_windows(), reason="native2 read fails oddly on windows" - ) - def test_pytables_native2_read(self, datapath, setup_path): - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r" - ) as store: - str(store) - d1 = store["detector"] - assert isinstance(d1, DataFrame) - - @xfail_non_writeable - def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path): - # GH 24510 - # legacy table with fixed format written in Python 2 - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" - ) as store: - result = store.select("df") - expected = pd.DataFrame( - [[1, 2, 3, "D"]], - columns=["A", "B", "C", "D"], - index=pd.Index(["ABC"], name="INDEX_NAME"), - ) - assert_frame_equal(expected, result) - - def test_legacy_table_read_py2(self, datapath, setup_path): - # issue: 24925 - # legacy table written in Python 2 - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r" - ) as store: - result = store.select("table") - - expected = pd.DataFrame({"a": ["a", "b"], "b": [2, 3]}) - assert_frame_equal(expected, result) - - def test_copy(self, setup_path): - - with catch_warnings(record=True): - - def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): - try: - store = HDFStore(f, "r") - - if new_f is None: - import tempfile - - fd, new_f = tempfile.mkstemp() - - tstore = store.copy( - new_f, keys=keys, propindexes=propindexes, **kwargs - ) - - # check keys - if keys is None: - keys = store.keys() - assert set(keys) == set(tstore.keys()) - - # check indices & nrows - for k in tstore.keys(): - if tstore.get_storer(k).is_table: - new_t = tstore.get_storer(k) - orig_t = store.get_storer(k) - - assert orig_t.nrows == new_t.nrows - - # check propindixes - if propindexes: - for a in orig_t.axes: - if a.is_indexed: - assert new_t[a.name].is_indexed - - finally: - safe_close(store) - safe_close(tstore) - try: 
- os.close(fd) - except (OSError, ValueError): - pass - safe_remove(new_f) - - # new table - df = tm.makeDataFrame() - - try: - path = create_tempfile(setup_path) - st = HDFStore(path) - st.append("df", df, data_columns=["A"]) - st.close() - do_copy(f=path) - do_copy(f=path, propindexes=False) - finally: - safe_remove(path) - - def test_store_datetime_fractional_secs(self, setup_path): - - with ensure_clean_store(setup_path) as store: - dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) - series = Series([0], [dt]) - store["a"] = series - assert store["a"].index[0] == dt - - def test_tseries_indices_series(self, setup_path): - - with ensure_clean_store(setup_path) as store: - idx = tm.makeDateIndex(10) - ser = Series(np.random.randn(len(idx)), idx) - store["a"] = ser - result = store["a"] - - tm.assert_series_equal(result, ser) - assert result.index.freq == ser.index.freq - tm.assert_class_equal(result.index, ser.index, obj="series index") - - idx = tm.makePeriodIndex(10) - ser = Series(np.random.randn(len(idx)), idx) - store["a"] = ser - result = store["a"] - - tm.assert_series_equal(result, ser) - assert result.index.freq == ser.index.freq - tm.assert_class_equal(result.index, ser.index, obj="series index") - - def test_tseries_indices_frame(self, setup_path): - - with ensure_clean_store(setup_path) as store: - idx = tm.makeDateIndex(10) - df = DataFrame(np.random.randn(len(idx), 3), index=idx) - store["a"] = df - result = store["a"] - - assert_frame_equal(result, df) - assert result.index.freq == df.index.freq - tm.assert_class_equal(result.index, df.index, obj="dataframe index") - - idx = tm.makePeriodIndex(10) - df = DataFrame(np.random.randn(len(idx), 3), idx) - store["a"] = df - result = store["a"] - - assert_frame_equal(result, df) - assert result.index.freq == df.index.freq - tm.assert_class_equal(result.index, df.index, obj="dataframe index") - - def test_unicode_index(self, setup_path): - - unicode_values = ["\u03c3", "\u03c3\u03c3"] - - # PerformanceWarning - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - s = Series(np.random.randn(len(unicode_values)), unicode_values) - self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) - - def test_unicode_longer_encoded(self, setup_path): - # GH 11234 - char = "\u0394" - df = pd.DataFrame({"A": [char]}) - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table", encoding="utf-8") - result = store.get("df") - tm.assert_frame_equal(result, df) - - df = pd.DataFrame({"A": ["a", char], "B": ["b", "b"]}) - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table", encoding="utf-8") - result = store.get("df") - tm.assert_frame_equal(result, df) - - @xfail_non_writeable - def test_store_datetime_mixed(self, setup_path): - - df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) - ts = tm.makeTimeSeries() - df["d"] = ts.index[:3] - self._check_roundtrip(df, tm.assert_frame_equal, path=setup_path) - - # FIXME: don't leave commented-out code - # def test_cant_write_multiindex_table(self): - # # for now, #1848 - # df = DataFrame(np.random.randn(10, 4), - # index=[np.arange(5).repeat(2), - # np.tile(np.arange(2), 5)]) - # - # with pytest.raises(Exception): - # store.put('foo', df, format='table') - - def test_append_with_diff_col_name_types_raises_value_error(self, setup_path): - df = DataFrame(np.random.randn(10, 1)) - df2 = DataFrame({"a": np.random.randn(10)}) - df3 = DataFrame({(1, 2): np.random.randn(10)}) - df4 = 
DataFrame({("1", 2): np.random.randn(10)}) - df5 = DataFrame({("1", 2, object): np.random.randn(10)}) - - with ensure_clean_store(setup_path) as store: - name = "df_{}".format(tm.rands(10)) - store.append(name, df) - - for d in (df2, df3, df4, df5): - with pytest.raises(ValueError): - store.append(name, d) - - def test_query_with_nested_special_character(self, setup_path): - df = DataFrame( - { - "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"], - "b": [1, 2, 3, 4, 5, 6, 7, 8], - } - ) - expected = df[df.a == "test & test"] - with ensure_clean_store(setup_path) as store: - store.append("test", df, format="table", data_columns=True) - result = store.select("test", 'a = "test & test"') - tm.assert_frame_equal(expected, result) - - def test_categorical(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - # Basic - _maybe_remove(store, "s") - s = Series( - Categorical( - ["a", "b", "b", "a", "a", "c"], - categories=["a", "b", "c", "d"], - ordered=False, - ) - ) - store.append("s", s, format="table") - result = store.select("s") - tm.assert_series_equal(s, result) - - _maybe_remove(store, "s_ordered") - s = Series( - Categorical( - ["a", "b", "b", "a", "a", "c"], - categories=["a", "b", "c", "d"], - ordered=True, - ) - ) - store.append("s_ordered", s, format="table") - result = store.select("s_ordered") - tm.assert_series_equal(s, result) - - _maybe_remove(store, "df") - df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) - store.append("df", df, format="table") - result = store.select("df") - tm.assert_frame_equal(result, df) - - # Dtypes - _maybe_remove(store, "si") - s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category") - store.append("si", s) - result = store.select("si") - tm.assert_series_equal(result, s) - - _maybe_remove(store, "si2") - s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category") - store.append("si2", s) - result = store.select("si2") - tm.assert_series_equal(result, s) - - # Multiple - _maybe_remove(store, "df2") - df2 = df.copy() - df2["s2"] = Series(list("abcdefg")).astype("category") - store.append("df2", df2) - result = store.select("df2") - tm.assert_frame_equal(result, df2) - - # Make sure the metadata is OK - info = store.info() - assert "/df2 " in info - # assert '/df2/meta/values_block_0/meta' in info - assert "/df2/meta/values_block_1/meta" in info - - # unordered - _maybe_remove(store, "s2") - s = Series( - Categorical( - ["a", "b", "b", "a", "a", "c"], - categories=["a", "b", "c", "d"], - ordered=False, - ) - ) - store.append("s2", s, format="table") - result = store.select("s2") - tm.assert_series_equal(result, s) - - # Query - _maybe_remove(store, "df3") - store.append("df3", df, data_columns=["s"]) - expected = df[df.s.isin(["b", "c"])] - result = store.select("df3", where=['s in ["b","c"]']) - tm.assert_frame_equal(result, expected) - - expected = df[df.s.isin(["b", "c"])] - result = store.select("df3", where=['s = ["b","c"]']) - tm.assert_frame_equal(result, expected) - - expected = df[df.s.isin(["d"])] - result = store.select("df3", where=['s in ["d"]']) - tm.assert_frame_equal(result, expected) - - expected = df[df.s.isin(["f"])] - result = store.select("df3", where=['s in ["f"]']) - tm.assert_frame_equal(result, expected) - - # Appending with same categories is ok - store.append("df3", df) - - df = concat([df, df]) - expected = df[df.s.isin(["b", "c"])] - result = store.select("df3", where=['s in ["b","c"]']) - tm.assert_frame_equal(result, expected) - - # Appending must have the same categories - df3 = df.copy() - 
df3["s"].cat.remove_unused_categories(inplace=True) - - with pytest.raises(ValueError): - store.append("df3", df3) - - # Remove, and make sure meta data is removed (its a recursive - # removal so should be). - result = store.select("df3/meta/s/meta") - assert result is not None - store.remove("df3") - - with pytest.raises( - KeyError, match="'No object named df3/meta/s/meta in the file'" - ): - store.select("df3/meta/s/meta") - - def test_categorical_conversion(self, setup_path): - - # GH13322 - # Check that read_hdf with categorical columns doesn't return rows if - # where criteria isn't met. - obsids = ["ESP_012345_6789", "ESP_987654_3210"] - imgids = ["APF00006np", "APF0001imm"] - data = [4.3, 9.8] - - # Test without categories - df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data)) - - # We are expecting an empty DataFrame matching types of df - expected = df.iloc[[], :] - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df", where="obsids=B") - tm.assert_frame_equal(result, expected) - - # Test with categories - df.obsids = df.obsids.astype("category") - df.imgids = df.imgids.astype("category") - - # We are expecting an empty DataFrame matching types of df - expected = df.iloc[[], :] - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df", where="obsids=B") - tm.assert_frame_equal(result, expected) - - def test_categorical_nan_only_columns(self, setup_path): - # GH18413 - # Check that read_hdf with categorical columns with NaN-only values can - # be read back. - df = pd.DataFrame( - { - "a": ["a", "b", "c", np.nan], - "b": [np.nan, np.nan, np.nan, np.nan], - "c": [1, 2, 3, 4], - "d": pd.Series([None] * 4, dtype=object), - } - ) - df["a"] = df.a.astype("category") - df["b"] = df.b.astype("category") - df["d"] = df.b.astype("category") - expected = df - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df") - tm.assert_frame_equal(result, expected) - - def test_duplicate_column_name(self, setup_path): - df = DataFrame(columns=["a", "a"], data=[[0, 0]]) - - with ensure_clean_path(setup_path) as path: - with pytest.raises(ValueError): - df.to_hdf(path, "df", format="fixed") - - df.to_hdf(path, "df", format="table") - other = read_hdf(path, "df") - - tm.assert_frame_equal(df, other) - assert df.equals(other) - assert other.equals(df) - - def test_round_trip_equals(self, setup_path): - # GH 9330 - df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table") - other = read_hdf(path, "df") - tm.assert_frame_equal(df, other) - assert df.equals(other) - assert other.equals(df) - - def test_preserve_timedeltaindex_type(self, setup_path): - # GH9635 - # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve - # the type of the index. - df = DataFrame(np.random.normal(size=(10, 5))) - df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example") - - with ensure_clean_store(setup_path) as store: - - store["df"] = df - assert_frame_equal(store["df"], df) - - def test_columns_multiindex_modified(self, setup_path): - # BUG: 7212 - # read_hdf store.select modified the passed columns parameters - # when multi-indexed. 
- - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - df.index.name = "letters" - df = df.set_index(keys="E", append=True) - - data_columns = df.index.names + df.columns.tolist() - with ensure_clean_path(setup_path) as path: - df.to_hdf( - path, - "df", - mode="a", - append=True, - data_columns=data_columns, - index=False, - ) - cols2load = list("BCD") - cols2load_original = list(cols2load) - df_loaded = read_hdf(path, "df", columns=cols2load) # noqa - assert cols2load_original == cols2load - - @ignore_natural_naming_warning - def test_to_hdf_with_object_column_names(self, setup_path): - # GH9057 - # Writing HDF5 table format should only work for string-like - # column types - - types_should_fail = [ - tm.makeIntIndex, - tm.makeFloatIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, - ] - types_should_run = [ - tm.makeStringIndex, - tm.makeCategoricalIndex, - tm.makeUnicodeIndex, - ] - - for index in types_should_fail: - df = DataFrame(np.random.randn(10, 2), columns=index(2)) - with ensure_clean_path(setup_path) as path: - with catch_warnings(record=True): - msg = "cannot have non-object label DataIndexableCol" - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", format="table", data_columns=True) - - for index in types_should_run: - df = DataFrame(np.random.randn(10, 2), columns=index(2)) - with ensure_clean_path(setup_path) as path: - with catch_warnings(record=True): - df.to_hdf(path, "df", format="table", data_columns=True) - result = pd.read_hdf( - path, "df", where="index = [{0}]".format(df.index[0]) - ) - assert len(result) - - def test_read_hdf_open_store(self, setup_path): - # GH10330 - # No check for non-string path_or-buf, and no test of open store - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - df.index.name = "letters" - df = df.set_index(keys="E", append=True) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", mode="w") - direct = read_hdf(path, "df") - store = HDFStore(path, mode="r") - indirect = read_hdf(store, "df") - tm.assert_frame_equal(direct, indirect) - assert store.is_open - store.close() - - def test_read_hdf_iterator(self, setup_path): - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - df.index.name = "letters" - df = df.set_index(keys="E", append=True) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", mode="w", format="t") - direct = read_hdf(path, "df") - iterator = read_hdf(path, "df", iterator=True) - assert isinstance(iterator, TableIterator) - indirect = next(iterator.__iter__()) - tm.assert_frame_equal(direct, indirect) - iterator.store.close() - - def test_read_hdf_errors(self, setup_path): - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - - with ensure_clean_path(setup_path) as path: - with pytest.raises(IOError): - read_hdf(path, "key") - - df.to_hdf(path, "df") - store = HDFStore(path, mode="r") - store.close() - - with pytest.raises(IOError): - read_hdf(store, "df") - - def test_read_hdf_generic_buffer_errors(self): - with pytest.raises(NotImplementedError): - read_hdf(BytesIO(b""), "df") - - def test_invalid_complib(self, setup_path): - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - with ensure_clean_path(setup_path) as path: - with pytest.raises(ValueError): - df.to_hdf(path, "df", complib="foolib") - - # GH10443 - - def test_read_nokey(self, setup_path): - df = DataFrame(np.random.rand(4, 5), 
index=list("abcd"), columns=list("ABCDE")) - - # Categorical dtype not supported for "fixed" format. So no need - # to test with that dtype in the dataframe here. - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", mode="a") - reread = read_hdf(path) - assert_frame_equal(df, reread) - df.to_hdf(path, "df2", mode="a") - - with pytest.raises(ValueError): - read_hdf(path) - - def test_read_nokey_table(self, setup_path): - # GH13231 - df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")}) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", mode="a", format="table") - reread = read_hdf(path) - assert_frame_equal(df, reread) - df.to_hdf(path, "df2", mode="a", format="table") - - with pytest.raises(ValueError): - read_hdf(path) - - def test_read_nokey_empty(self, setup_path): - with ensure_clean_path(setup_path) as path: - store = HDFStore(path) - store.close() - - with pytest.raises(ValueError): - read_hdf(path) - - @td.skip_if_no("pathlib") - def test_read_from_pathlib_path(self, setup_path): - - # GH11773 - from pathlib import Path - - expected = DataFrame( - np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") - ) - with ensure_clean_path(setup_path) as filename: - path_obj = Path(filename) - - expected.to_hdf(path_obj, "df", mode="a") - actual = read_hdf(path_obj, "df") - - tm.assert_frame_equal(expected, actual) - - @td.skip_if_no("py.path") - def test_read_from_py_localpath(self, setup_path): - - # GH11773 - from py.path import local as LocalPath - - expected = DataFrame( - np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") - ) - with ensure_clean_path(setup_path) as filename: - path_obj = LocalPath(filename) - - expected.to_hdf(path_obj, "df", mode="a") - actual = read_hdf(path_obj, "df") - - tm.assert_frame_equal(expected, actual) - - def test_query_long_float_literal(self, setup_path): - # GH 14241 - df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) - - with ensure_clean_store(setup_path) as store: - store.append("test", df, format="table", data_columns=True) - - cutoff = 1000000000.0006 - result = store.select("test", "A < {cutoff:.4f}".format(cutoff=cutoff)) - assert result.empty - - cutoff = 1000000000.0010 - result = store.select("test", "A > {cutoff:.4f}".format(cutoff=cutoff)) - expected = df.loc[[1, 2], :] - tm.assert_frame_equal(expected, result) - - exact = 1000000000.0011 - result = store.select("test", "A == {exact:.4f}".format(exact=exact)) - expected = df.loc[[1], :] - tm.assert_frame_equal(expected, result) - - def test_query_compare_column_type(self, setup_path): - # GH 15492 - df = pd.DataFrame( - { - "date": ["2014-01-01", "2014-01-02"], - "real_date": date_range("2014-01-01", periods=2), - "float": [1.1, 1.2], - "int": [1, 2], - }, - columns=["date", "real_date", "float", "int"], - ) - - with ensure_clean_store(setup_path) as store: - store.append("test", df, format="table", data_columns=True) - - ts = pd.Timestamp("2014-01-01") # noqa - result = store.select("test", where="real_date > ts") - expected = df.loc[[1], :] - tm.assert_frame_equal(expected, result) - - for op in ["<", ">", "=="]: - # non strings to string column always fail - for v in [2.1, True, pd.Timestamp("2014-01-01"), pd.Timedelta(1, "s")]: - query = "date {op} v".format(op=op) - with pytest.raises(TypeError): - store.select("test", where=query) - - # strings to other columns must be convertible to type - v = "a" - for col in ["int", "float", "real_date"]: - query = "{col} {op} 
v".format(op=op, col=col) - with pytest.raises(ValueError): - store.select("test", where=query) - - for v, col in zip( - ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"] - ): - query = "{col} {op} v".format(op=op, col=col) - result = store.select("test", where=query) - - if op == "==": - expected = df.loc[[0], :] - elif op == ">": - expected = df.loc[[1], :] - else: - expected = df.loc[[], :] - tm.assert_frame_equal(expected, result) - - @pytest.mark.parametrize("format", ["fixed", "table"]) - def test_read_hdf_series_mode_r(self, format, setup_path): - # GH 16583 - # Tests that reading a Series saved to an HDF file - # still works if a mode='r' argument is supplied - series = tm.makeFloatSeries() - with ensure_clean_path(setup_path) as path: - series.to_hdf(path, key="data", format=format) - result = pd.read_hdf(path, key="data", mode="r") - tm.assert_series_equal(result, series) - - @pytest.mark.skipif(not PY36, reason="Need python 3.6") - def test_fspath(self): - with tm.ensure_clean("foo.h5") as path: - with pd.HDFStore(path) as store: - assert os.fspath(store) == str(path) - - def test_read_py2_hdf_file_in_py3(self, datapath): - # GH 16781 - - # tests reading a PeriodIndex DataFrame written in Python2 in Python3 - - # the file was generated in Python 2.7 like so: - # - # df = pd.DataFrame([1.,2,3], index=pd.PeriodIndex( - # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) - # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') - - expected = pd.DataFrame( - [1.0, 2, 3], - index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), - ) - - with ensure_clean_store( - datapath( - "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" - ), - mode="r", - ) as store: - result = store["p"] - assert_frame_equal(result, expected) - - @pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) - def test_select_empty_where(self, where): - # GH26610 - - # Using keyword `where` as '' or (), or [None], etc - # while reading from HDF store raises - # "SyntaxError: only a single expression is allowed" - - df = pd.DataFrame([1, 2, 3]) - with ensure_clean_path("empty_where.h5") as path: - with pd.HDFStore(path) as store: - store.put("df", df, "t") - result = pd.read_hdf(store, "df", where=where) - assert_frame_equal(result, df) - - @pytest.mark.parametrize( - "idx", - [ - date_range("2019", freq="D", periods=3, tz="UTC"), - CategoricalIndex(list("abc")), - ], - ) - def test_to_hdf_multiindex_extension_dtype(self, idx, setup_path): - # GH 7775 - mi = MultiIndex.from_arrays([idx, idx]) - df = pd.DataFrame(0, index=mi, columns=["a"]) - with ensure_clean_path(setup_path) as path: - with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): - df.to_hdf(path, "df") - - -class TestHDFComplexValues: - # GH10447 - - def test_complex_fixed(self, setup_path): - df = DataFrame( - np.random.rand(4, 5).astype(np.complex64), - index=list("abcd"), - columns=list("ABCDE"), - ) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - df = DataFrame( - np.random.rand(4, 5).astype(np.complex128), - index=list("abcd"), - columns=list("ABCDE"), - ) - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - def test_complex_table(self, setup_path): - df = DataFrame( - np.random.rand(4, 5).astype(np.complex64), - index=list("abcd"), - columns=list("ABCDE"), - ) - - with 
ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - df = DataFrame( - np.random.rand(4, 5).astype(np.complex128), - index=list("abcd"), - columns=list("ABCDE"), - ) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table", mode="w") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - @xfail_non_writeable - def test_complex_mixed_fixed(self, setup_path): - complex64 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 - ) - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 - ) - df = DataFrame( - { - "A": [1, 2, 3, 4], - "B": ["a", "b", "c", "d"], - "C": complex64, - "D": complex128, - "E": [1.0, 2.0, 3.0, 4.0], - }, - index=list("abcd"), - ) - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - def test_complex_mixed_table(self, setup_path): - complex64 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 - ) - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 - ) - df = DataFrame( - { - "A": [1, 2, 3, 4], - "B": ["a", "b", "c", "d"], - "C": complex64, - "D": complex128, - "E": [1.0, 2.0, 3.0, 4.0], - }, - index=list("abcd"), - ) - - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=["A", "B"]) - result = store.select("df", where="A>2") - assert_frame_equal(df.loc[df.A > 2], result) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - def test_complex_across_dimensions_fixed(self, setup_path): - with catch_warnings(record=True): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list("abcd")) - df = DataFrame({"A": s, "B": s}) - - objs = [s, df] - comps = [tm.assert_series_equal, tm.assert_frame_equal] - for obj, comp in zip(objs, comps): - with ensure_clean_path(setup_path) as path: - obj.to_hdf(path, "obj", format="fixed") - reread = read_hdf(path, "obj") - comp(obj, reread) - - def test_complex_across_dimensions(self, setup_path): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list("abcd")) - df = DataFrame({"A": s, "B": s}) - - with catch_warnings(record=True): - - objs = [df] - comps = [tm.assert_frame_equal] - for obj, comp in zip(objs, comps): - with ensure_clean_path(setup_path) as path: - obj.to_hdf(path, "obj", format="table") - reread = read_hdf(path, "obj") - comp(obj, reread) - - def test_complex_indexing_error(self, setup_path): - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 - ) - df = DataFrame( - {"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128}, - index=list("abcd"), - ) - with ensure_clean_store(setup_path) as store: - with pytest.raises(TypeError): - store.append("df", df, data_columns=["C"]) - - def test_complex_series_error(self, setup_path): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list("abcd")) - - with ensure_clean_path(setup_path) as path: - with pytest.raises(TypeError): - s.to_hdf(path, "obj", format="t") - - with ensure_clean_path(setup_path) as path: - s.to_hdf(path, "obj", format="t", index=False) - reread = 
read_hdf(path, "obj") - tm.assert_series_equal(s, reread) - - def test_complex_append(self, setup_path): - df = DataFrame( - {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)} - ) - - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=["b"]) - store.append("df", df) - result = store.select("df") - assert_frame_equal(pd.concat([df, df], 0), result) - - -# @pytest.mark.usefixtures("setup_path") -class TestTimezones: - def _compare_with_tz(self, a, b): - tm.assert_frame_equal(a, b) - - # compare the zones on each element - for c in a.columns: - for i in a.index: - a_e = a.loc[i, c] - b_e = b.loc[i, c] - if not (a_e == b_e and a_e.tz == b_e.tz): - raise AssertionError( - "invalid tz comparison [{a_e}] [{b_e}]".format(a_e=a_e, b_e=b_e) - ) - - def test_append_with_timezones_dateutil(self, setup_path): - - from datetime import timedelta - - # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows - # filename issues. - from pandas._libs.tslibs.timezones import maybe_get_tz - - gettz = lambda x: maybe_get_tz("dateutil/" + x) - - # as columns - with ensure_clean_store(setup_path) as store: - - _maybe_remove(store, "df_tz") - df = DataFrame( - dict( - A=[ - Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) - + timedelta(hours=1) * i - for i in range(5) - ] - ) - ) - - store.append("df_tz", df, data_columns=["A"]) - result = store["df_tz"] - self._compare_with_tz(result, df) - assert_frame_equal(result, df) - - # select with tz aware - expected = df[df.A >= df.A[3]] - result = store.select("df_tz", where="A>=df.A[3]") - self._compare_with_tz(result, expected) - - # ensure we include dates in DST and STD time here. - _maybe_remove(store, "df_tz") - df = DataFrame( - dict( - A=Timestamp("20130102", tz=gettz("US/Eastern")), - B=Timestamp("20130603", tz=gettz("US/Eastern")), - ), - index=range(5), - ) - store.append("df_tz", df) - result = store["df_tz"] - self._compare_with_tz(result, df) - assert_frame_equal(result, df) - - df = DataFrame( - dict( - A=Timestamp("20130102", tz=gettz("US/Eastern")), - B=Timestamp("20130102", tz=gettz("EET")), - ), - index=range(5), - ) - with pytest.raises(ValueError): - store.append("df_tz", df) - - # this is ok - _maybe_remove(store, "df_tz") - store.append("df_tz", df, data_columns=["A", "B"]) - result = store["df_tz"] - self._compare_with_tz(result, df) - assert_frame_equal(result, df) - - # can't append with diff timezone - df = DataFrame( - dict( - A=Timestamp("20130102", tz=gettz("US/Eastern")), - B=Timestamp("20130102", tz=gettz("CET")), - ), - index=range(5), - ) - with pytest.raises(ValueError): - store.append("df_tz", df) - - # as index - with ensure_clean_store(setup_path) as store: - - # GH 4098 example - df = DataFrame( - dict( - A=Series( - range(3), - index=date_range( - "2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern") - ), - ) - ) - ) - - _maybe_remove(store, "df") - store.put("df", df) - result = store.select("df") - assert_frame_equal(result, df) - - _maybe_remove(store, "df") - store.append("df", df) - result = store.select("df") - assert_frame_equal(result, df) - - def test_append_with_timezones_pytz(self, setup_path): - - from datetime import timedelta - - # as columns - with ensure_clean_store(setup_path) as store: - - _maybe_remove(store, "df_tz") - df = DataFrame( - dict( - A=[ - Timestamp("20130102 2:00:00", tz="US/Eastern") - + timedelta(hours=1) * i - for i in range(5) - ] - ) - ) - store.append("df_tz", df, data_columns=["A"]) - result = store["df_tz"] - 
self._compare_with_tz(result, df)
+            assert_frame_equal(result, df)
+
+            # select with tz aware
+            self._compare_with_tz(
+                store.select("df_tz", where="A>=df.A[3]"), df[df.A >= df.A[3]]
+            )
+
+            _maybe_remove(store, "df_tz")
+            # ensure we include dates in DST and STD time here.
+            df = DataFrame(
+                dict(
+                    A=Timestamp("20130102", tz="US/Eastern"),
+                    B=Timestamp("20130603", tz="US/Eastern"),
+                ),
+                index=range(5),
+            )
+            store.append("df_tz", df)
+            result = store["df_tz"]
+            self._compare_with_tz(result, df)
+            assert_frame_equal(result, df)
+
+            df = DataFrame(
+                dict(
+                    A=Timestamp("20130102", tz="US/Eastern"),
+                    B=Timestamp("20130102", tz="EET"),
+                ),
+                index=range(5),
+            )
+            with pytest.raises(ValueError):
+                store.append("df_tz", df)
+
+            # this is ok
+            _maybe_remove(store, "df_tz")
+            store.append("df_tz", df, data_columns=["A", "B"])
+            result = store["df_tz"]
+            self._compare_with_tz(result, df)
+            assert_frame_equal(result, df)
+
+            # can't append with diff timezone
+            df = DataFrame(
+                dict(
+                    A=Timestamp("20130102", tz="US/Eastern"),
+                    B=Timestamp("20130102", tz="CET"),
+                ),
+                index=range(5),
+            )
+            with pytest.raises(ValueError):
+                store.append("df_tz", df)
+
+        # as index
+        with ensure_clean_store(setup_path) as store:
+
+            # GH 4098 example
+            df = DataFrame(
+                dict(
+                    A=Series(
+                        range(3),
+                        index=date_range(
+                            "2000-1-1", periods=3, freq="H", tz="US/Eastern"
+                        ),
+                    )
+                )
+            )
+
+            _maybe_remove(store, "df")
+            store.put("df", df)
+            result = store.select("df")
+            assert_frame_equal(result, df)
+
+            _maybe_remove(store, "df")
+            store.append("df", df)
+            result = store.select("df")
+            assert_frame_equal(result, df)
+
+    def test_tseries_select_index_column(self, setup_path):
+        # GH7777
+        # selecting a UTC datetimeindex column did
+        # not preserve UTC tzinfo set before storing
+
+        # check that no tz still works
+        rng = date_range("1/1/2000", "1/30/2000")
+        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
+
+        with ensure_clean_store(setup_path) as store:
+            store.append("frame", frame)
+            result = store.select_column("frame", "index")
+            assert rng.tz == DatetimeIndex(result.values).tz
+
+        # check utc
+        rng = date_range("1/1/2000", "1/30/2000", tz="UTC")
+        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
+
+        with ensure_clean_store(setup_path) as store:
+            store.append("frame", frame)
+            result = store.select_column("frame", "index")
+            assert rng.tz == result.dt.tz
+
+        # double check non-utc
+        rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern")
+        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
+
+        with ensure_clean_store(setup_path) as store:
+            store.append("frame", frame)
+            result = store.select_column("frame", "index")
+            assert rng.tz == result.dt.tz
+
+    def test_timezones_fixed(self, setup_path):
+        with ensure_clean_store(setup_path) as store:
+
+            # index
+            rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern")
+            df = DataFrame(np.random.randn(len(rng), 4), index=rng)
+            store["df"] = df
+            result = store["df"]
+            assert_frame_equal(result, df)
+
+            # as data
+            # GH11411
+            _maybe_remove(store, "df")
+            df = DataFrame(
+                {
+                    "A": rng,
+                    "B": rng.tz_convert("UTC").tz_localize(None),
+                    "C": rng.tz_convert("CET"),
+                    "D": range(len(rng)),
+                },
+                index=rng,
+            )
+            store["df"] = df
+            result = store["df"]
+            assert_frame_equal(result, df)
+
+    def test_fixed_offset_tz(self, setup_path):
+        rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00")
+        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
+
+        with ensure_clean_store(setup_path) as store:
+            store["frame"] = frame
+            recons = store["frame"]
+            tm.assert_index_equal(recons.index, rng)
+            assert rng.tz == recons.index.tz
+
+    @td.skip_if_windows
+    def test_store_timezone(self, setup_path):
+        # GH2852
+        # issue storing datetime.date with a timezone as it resets when read
+        # back in a new timezone
+
+        # original method
+        with ensure_clean_store(setup_path) as store:
+
+            today = datetime.date(2013, 9, 10)
+            df = DataFrame([1, 2, 3], index=[today, today, today])
+            store["obj1"] = df
+            result = store["obj1"]
+            assert_frame_equal(result, df)
+
+        # with tz setting
+        with ensure_clean_store(setup_path) as store:
+
+            with set_timezone("EST5EDT"):
+                today = datetime.date(2013, 9, 10)
+                df = DataFrame([1, 2, 3], index=[today, today, today])
+                store["obj1"] = df
+
+            with set_timezone("CST6CDT"):
+                result = store["obj1"]
+
+            assert_frame_equal(result, df)
+
+    def test_legacy_datetimetz_object(self, datapath, setup_path):
+        # legacy from < 0.17.0
+        # 8260
+        expected = DataFrame(
+            dict(
+                A=Timestamp("20130102", tz="US/Eastern"),
+                B=Timestamp("20130603", tz="CET"),
+            ),
+            index=range(5),
+        )
+        with ensure_clean_store(
+            datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r"
+        ) as store:
+            result = store["df"]
+            assert_frame_equal(result, expected)
+
+    def test_dst_transitions(self, setup_path):
+        # make sure we are not failing on transitions
+        with ensure_clean_store(setup_path) as store:
+            times = pd.date_range(
+                "2013-10-26 23:00",
+                "2013-10-27 01:00",
+                tz="Europe/London",
+                freq="H",
+                ambiguous="infer",
+            )
+
+            for i in [times, times + pd.Timedelta("10min")]:
+                _maybe_remove(store, "df")
+                df = DataFrame({"A": range(len(i)), "B": i}, index=i)
+                store.append("df", df)
+                result = store.select("df")
+                assert_frame_equal(result, df)
+
+    def test_read_with_where_tz_aware_index(self, setup_path):
+        # GH 11926
+        periods = 10
+        dts = pd.date_range("20151201", periods=periods, freq="D", tz="UTC")
+        mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"])
+        expected = pd.DataFrame({"MYCOL": 0}, index=mi)
+
+        key = "mykey"
+        with ensure_clean_path(setup_path) as path:
+            with pd.HDFStore(path) as store:
+                store.append(key, expected, format="table", append=True)
+            result = pd.read_hdf(path, key, where="DATE > 20151130")
+            assert_frame_equal(result, expected)
+
+    def test_py2_created_with_datetimez(self, datapath, setup_path):
+        # The test HDF5 file was created in Python 2, but could not be read in
+        # Python 3.
+        #
+        # GH26443
+        index = [pd.Timestamp("2019-01-01T18:00").tz_localize("America/New_York")]
+        expected = DataFrame({"data": 123}, index=index)
+        with ensure_clean_store(
+            datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r"
+        ) as store:
+            result = store["key"]
+            assert_frame_equal(result, expected)

From 54483fb4f3821b1dcbc10f0551f8f52bbff53286 Mon Sep 17 00:00:00 2001
From: Tola Alade
Date: Wed, 2 Oct 2019 10:36:03 +0100
Subject: [PATCH 10/28] created test_hdf_store module to move class TestHDFStore

---
 pandas/tests/io/pytables/test_hdf_store.py | 4758 ++++++++++++++++++++
 1 file changed, 4758 insertions(+)

diff --git a/pandas/tests/io/pytables/test_hdf_store.py b/pandas/tests/io/pytables/test_hdf_store.py
index e69de29bb2d1d..a5a8b0591d048 100644
--- a/pandas/tests/io/pytables/test_hdf_store.py
+++ b/pandas/tests/io/pytables/test_hdf_store.py
@@ -0,0 +1,4758 @@
+import datetime
+from datetime import timedelta
+from distutils.version import LooseVersion
+from io import BytesIO
+import os
+import re
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY36, is_platform_little_endian, is_platform_windows
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.common import is_categorical_dtype
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    CategoricalIndex,
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    Int64Index,
+    MultiIndex,
+    RangeIndex,
+    Series,
+    Timestamp,
+    bdate_range,
+    concat,
+    date_range,
+    isna,
+    timedelta_range,
+)
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+from pandas.io.pytables import (
+    ClosedFileError,
+    HDFStore,
+    PossibleDataLossError,
+    Term,
+    read_hdf,
+)
+from pandas.tests.io.pytables.common import (xfail_non_writeable,
+                                             tables,
+                                             ensure_clean_path,
+                                             ensure_clean_store,
+                                             create_tempfile,
+                                             safe_close,
+                                             safe_remove,
+                                             _maybe_remove)
+
+from pandas.io import pytables as pytables  # noqa: E402 isort:skip
+from pandas.io.pytables import TableIterator  # noqa: E402 isort:skip
+
+
+_default_compressor = "blosc"
+ignore_natural_naming_warning = pytest.mark.filterwarnings(
+    "ignore:object name:tables.exceptions.NaturalNameWarning"
+)
+
+@pytest.mark.single
+class TestHDFStore:
+    def test_format_kwarg_in_constructor(self, setup_path):
+        # GH 13291
+        with ensure_clean_path(setup_path) as path:
+            with pytest.raises(ValueError):
+                HDFStore(path, format="table")
+
+    def test_context(self, setup_path):
+        path = create_tempfile(setup_path)
+        try:
+            with HDFStore(path) as tbl:
+                raise ValueError("blah")
+        except ValueError:
+            pass
+        finally:
+            safe_remove(path)
+
+        try:
+            with HDFStore(path) as tbl:
+                tbl["a"] = tm.makeDataFrame()
+
+            with HDFStore(path) as tbl:
+                assert len(tbl) == 1
+                assert type(tbl["a"]) == DataFrame
+        finally:
+            safe_remove(path)
+
+    def test_conv_read_write(self, setup_path):
+        path = create_tempfile(setup_path)
+        try:
+
+            def roundtrip(key, obj, **kwargs):
+                obj.to_hdf(path, key, **kwargs)
+                return read_hdf(path, key)
+
+            o = tm.makeTimeSeries()
+            assert_series_equal(o, roundtrip("series", o))
+
+            o = tm.makeStringSeries()
+            assert_series_equal(o, roundtrip("string_series", o))
+
+            o = tm.makeDataFrame()
+            assert_frame_equal(o, roundtrip("frame", o))
+
+            # table
+            df = DataFrame(dict(A=range(5), B=range(5)))
+            df.to_hdf(path, "table", append=True)
+            result = read_hdf(path, "table", where=["index>2"])
+            assert_frame_equal(df[df.index > 2], result)
+
+        finally:
+            safe_remove(path)
+
+    def
test_long_strings(self, setup_path): + + # GH6166 + df = DataFrame( + {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["a"]) + + result = store.select("df") + assert_frame_equal(df, result) + + def test_api(self, setup_path): + + # GH4584 + # API issue when to_hdf doesn't accept append AND format args + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.iloc[:10].to_hdf(path, "df", append=True, format="table") + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + assert_frame_equal(read_hdf(path, "df"), df) + + # append to False + df.iloc[:10].to_hdf(path, "df", append=False, format="table") + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + assert_frame_equal(read_hdf(path, "df"), df) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.iloc[:10].to_hdf(path, "df", append=True) + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + assert_frame_equal(read_hdf(path, "df"), df) + + # append to False + df.iloc[:10].to_hdf(path, "df", append=False, format="table") + df.iloc[10:].to_hdf(path, "df", append=True) + assert_frame_equal(read_hdf(path, "df"), df) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path, "df", append=False, format="fixed") + assert_frame_equal(read_hdf(path, "df"), df) + + df.to_hdf(path, "df", append=False, format="f") + assert_frame_equal(read_hdf(path, "df"), df) + + df.to_hdf(path, "df", append=False) + assert_frame_equal(read_hdf(path, "df"), df) + + df.to_hdf(path, "df") + assert_frame_equal(read_hdf(path, "df"), df) + + with ensure_clean_store(setup_path) as store: + + path = store._path + df = tm.makeDataFrame() + + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=True, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + assert_frame_equal(store.select("df"), df) + + # append to False + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + assert_frame_equal(store.select("df"), df) + + # formats + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + assert_frame_equal(store.select("df"), df) + + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format=None) + assert_frame_equal(store.select("df"), df) + + with ensure_clean_path(setup_path) as path: + # Invalid. 
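Aside on the test_api cases above: they reduce to a simple contract — with format="table", append=True accumulates rows under a key across writes, while append=False (or the fixed format) replaces what is stored. A minimal sketch of that round-trip, assuming only pandas with PyTables installed; the file name is illustrative:

    import os
    import numpy as np
    import pandas as pd

    path = "api_demo.h5"  # illustrative scratch file
    df = pd.DataFrame(np.random.randn(20, 4), columns=list("ABCD"))

    # format="table" with append=True builds the frame up across writes
    df.iloc[:10].to_hdf(path, "df", format="table", append=True)
    df.iloc[10:].to_hdf(path, "df", format="table", append=True)
    assert pd.read_hdf(path, "df").equals(df)

    # append=False overwrites whatever is already stored under the key
    df.iloc[:10].to_hdf(path, "df", format="table", append=False)
    assert len(pd.read_hdf(path, "df")) == 10
    os.remove(path)
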
+ df = tm.makeDataFrame() + + with pytest.raises(ValueError): + df.to_hdf(path, "df", append=True, format="f") + + with pytest.raises(ValueError): + df.to_hdf(path, "df", append=True, format="fixed") + + with pytest.raises(TypeError): + df.to_hdf(path, "df", append=True, format="foo") + + with pytest.raises(TypeError): + df.to_hdf(path, "df", append=False, format="bar") + + # File path doesn't exist + path = "" + with pytest.raises(FileNotFoundError): + read_hdf(path, "df") + + def test_api_default_format(self, setup_path): + + # default_format option + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + + pd.set_option("io.hdf.default_format", "fixed") + _maybe_remove(store, "df") + store.put("df", df) + assert not store.get_storer("df").is_table + with pytest.raises(ValueError): + store.append("df2", df) + + pd.set_option("io.hdf.default_format", "table") + _maybe_remove(store, "df") + store.put("df", df) + assert store.get_storer("df").is_table + _maybe_remove(store, "df2") + store.append("df2", df) + assert store.get_storer("df").is_table + + pd.set_option("io.hdf.default_format", None) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + + pd.set_option("io.hdf.default_format", "fixed") + df.to_hdf(path, "df") + with HDFStore(path) as store: + assert not store.get_storer("df").is_table + with pytest.raises(ValueError): + df.to_hdf(path, "df2", append=True) + + pd.set_option("io.hdf.default_format", "table") + df.to_hdf(path, "df3") + with HDFStore(path) as store: + assert store.get_storer("df3").is_table + df.to_hdf(path, "df4", append=True) + with HDFStore(path) as store: + assert store.get_storer("df4").is_table + + pd.set_option("io.hdf.default_format", None) + + def test_keys(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() + + assert len(store) == 3 + expected = {"/a", "/b", "/c"} + assert set(store.keys()) == expected + assert set(store) == expected + + def test_keys_ignore_hdf_softlink(self, setup_path): + + # GH 20523 + # Puts a softlink into HDF file and rereads + + with ensure_clean_store(setup_path) as store: + + df = DataFrame(dict(A=range(5), B=range(5))) + store.put("df", df) + + assert store.keys() == ["/df"] + + store._handle.create_soft_link(store._handle.root, "symlink", "df") + + # Should ignore the softlink + assert store.keys() == ["/df"] + + def test_iter_empty(self, setup_path): + + with ensure_clean_store(setup_path) as store: + # GH 12221 + assert list(store) == [] + + def test_repr(self, setup_path): + + with ensure_clean_store(setup_path) as store: + repr(store) + store.info() + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() + + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[3:6, ["obj1"]] = np.nan + df = df._consolidate()._convert(datetime=True) + + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + store["df"] = df + + # make a random group in hdf space + store._handle.create_group(store._handle.root, "bah") + + assert store.filename in repr(store) + assert 
store.filename in str(store) + store.info() + + # storers + with ensure_clean_store(setup_path) as store: + + df = tm.makeDataFrame() + store.append("df", df) + + s = store.get_storer("df") + repr(s) + str(s) + + @ignore_natural_naming_warning + def test_contains(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() + store["foo/bar"] = tm.makeDataFrame() + assert "a" in store + assert "b" in store + assert "c" not in store + assert "foo/bar" in store + assert "/foo/bar" in store + assert "/foo/b" not in store + assert "bar" not in store + + # gh-2694: tables.NaturalNameWarning + with catch_warnings(record=True): + store["node())"] = tm.makeDataFrame() + assert "node())" in store + + def test_versioning(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() + df = tm.makeTimeDataFrame() + _maybe_remove(store, "df1") + store.append("df1", df[:10]) + store.append("df1", df[10:]) + assert store.root.a._v_attrs.pandas_version == "0.15.2" + assert store.root.b._v_attrs.pandas_version == "0.15.2" + assert store.root.df1._v_attrs.pandas_version == "0.15.2" + + # write a file and wipe its versioning + _maybe_remove(store, "df2") + store.append("df2", df) + + # this is an error because its table_type is appendable, but no + # version info + store.get_node("df2")._v_attrs.pandas_version = None + with pytest.raises(Exception): + store.select("df2") + + def test_mode(self, setup_path): + + df = tm.makeTimeDataFrame() + + def check(mode): + + with ensure_clean_path(setup_path) as path: + + # constructor + if mode in ["r", "r+"]: + with pytest.raises(IOError): + HDFStore(path, mode=mode) + + else: + store = HDFStore(path, mode=mode) + assert store._handle.mode == mode + store.close() + + with ensure_clean_path(setup_path) as path: + + # context + if mode in ["r", "r+"]: + with pytest.raises(IOError): + with HDFStore(path, mode=mode) as store: # noqa + pass + else: + with HDFStore(path, mode=mode) as store: + assert store._handle.mode == mode + + with ensure_clean_path(setup_path) as path: + + # conv write + if mode in ["r", "r+"]: + with pytest.raises(IOError): + df.to_hdf(path, "df", mode=mode) + df.to_hdf(path, "df", mode="w") + else: + df.to_hdf(path, "df", mode=mode) + + # conv read + if mode in ["w"]: + with pytest.raises(ValueError): + read_hdf(path, "df", mode=mode) + else: + result = read_hdf(path, "df", mode=mode) + assert_frame_equal(result, df) + + def check_default_mode(): + + # read_hdf uses default mode + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="w") + result = read_hdf(path, "df") + assert_frame_equal(result, df) + + check("r") + check("r+") + check("a") + check("w") + check_default_mode() + + def test_reopen_handle(self, setup_path): + + with ensure_clean_path(setup_path) as path: + + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() + + # invalid mode change + with pytest.raises(PossibleDataLossError): + store.open("w") + + store.close() + assert not store.is_open + + # truncation ok here + store.open("w") + assert store.is_open + assert len(store) == 0 + store.close() + assert not store.is_open + + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() + + # reopen as read + store.open("r") + assert store.is_open + assert len(store) == 1 + assert store._mode == "r" + store.close() + assert not store.is_open + + # reopen as append + store.open("a") + assert 
store.is_open + assert len(store) == 1 + assert store._mode == "a" + store.close() + assert not store.is_open + + # reopen as append (again) + store.open("a") + assert store.is_open + assert len(store) == 1 + assert store._mode == "a" + store.close() + assert not store.is_open + + def test_open_args(self, setup_path): + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + + # create an in memory store + store = HDFStore( + path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 + ) + store["df"] = df + store.append("df2", df) + + tm.assert_frame_equal(store["df"], df) + tm.assert_frame_equal(store["df2"], df) + + store.close() + + # the file should not have actually been written + assert not os.path.exists(path) + + def test_flush(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store.flush() + store.flush(fsync=True) + + def test_get(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + left = store.get("a") + right = store["a"] + tm.assert_series_equal(left, right) + + left = store.get("/a") + right = store["/a"] + tm.assert_series_equal(left, right) + + with pytest.raises(KeyError, match="'No object named b in the file'"): + store.get("b") + + @pytest.mark.parametrize( + "where, expected", + [ + ( + "/", + { + "": ({"first_group", "second_group"}, set()), + "/first_group": (set(), {"df1", "df2"}), + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ( + "/second_group", + { + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ], + ) + def test_walk(self, where, expected, setup_path): + # GH10143 + objs = { + "df1": pd.DataFrame([1, 2, 3]), + "df2": pd.DataFrame([4, 5, 6]), + "df3": pd.DataFrame([6, 7, 8]), + "df4": pd.DataFrame([9, 10, 11]), + "s1": pd.Series([10, 9, 8]), + # Next 3 items aren't pandas objects and should be ignored + "a1": np.array([[1, 2, 3], [4, 5, 6]]), + "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), + "tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"), + } + + with ensure_clean_store("walk_groups.hdf", mode="w") as store: + store.put("/first_group/df1", objs["df1"]) + store.put("/first_group/df2", objs["df2"]) + store.put("/second_group/df3", objs["df3"]) + store.put("/second_group/s1", objs["s1"]) + store.put("/second_group/third_group/df4", objs["df4"]) + # Create non-pandas objects + store._handle.create_array("/first_group", "a1", objs["a1"]) + store._handle.create_table("/first_group", "tb1", obj=objs["tb1"]) + store._handle.create_table("/second_group", "tb2", obj=objs["tb2"]) + + assert len(list(store.walk(where=where))) == len(expected) + for path, groups, leaves in store.walk(where=where): + assert path in expected + expected_groups, expected_frames = expected[path] + assert expected_groups == set(groups) + assert expected_frames == set(leaves) + for leaf in leaves: + frame_path = "/".join([path, leaf]) + obj = store.get(frame_path) + if "df" in leaf: + tm.assert_frame_equal(obj, objs[leaf]) + else: + tm.assert_series_equal(obj, objs[leaf]) + + def test_getattr(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + s = tm.makeTimeSeries() + store["a"] = s + + # test attribute access + result = store.a + tm.assert_series_equal(result, s) + result = getattr(store, "a") + tm.assert_series_equal(result, s) + + df = tm.makeTimeDataFrame() + store["df"] = df + result = 
store.df + tm.assert_frame_equal(result, df) + + # errors + for x in ["d", "mode", "path", "handle", "complib"]: + with pytest.raises(AttributeError): + getattr(store, x) + + # not stores + for x in ["mode", "path", "handle", "complib"]: + getattr(store, "_{x}".format(x=x)) + + def test_put(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + ts = tm.makeTimeSeries() + df = tm.makeTimeDataFrame() + store["a"] = ts + store["b"] = df[:10] + store["foo/bar/bah"] = df[:10] + store["foo"] = df[:10] + store["/foo"] = df[:10] + store.put("c", df[:10], format="table") + + # not OK, not a table + with pytest.raises(ValueError): + store.put("b", df[10:], append=True) + + # node does not currently exist, test _is_table_type returns False + # in this case + _maybe_remove(store, "f") + with pytest.raises(ValueError): + store.put("f", df[10:], append=True) + + # can't put to a table (use append instead) + with pytest.raises(ValueError): + store.put("c", df[10:], append=True) + + # overwrite table + store.put("c", df[:10], format="table", append=False) + tm.assert_frame_equal(df[:10], store["c"]) + + def test_put_string_index(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + index = Index( + ["I am a very long string index: {i}".format(i=i) for i in range(20)] + ) + s = Series(np.arange(20), index=index) + df = DataFrame({"A": s, "B": s}) + + store["a"] = s + tm.assert_series_equal(store["a"], s) + + store["b"] = df + tm.assert_frame_equal(store["b"], df) + + # mixed length + index = Index( + ["abcdefghijklmnopqrstuvwxyz1234567890"] + + ["I am a very long string index: {i}".format(i=i) for i in range(20)] + ) + s = Series(np.arange(21), index=index) + df = DataFrame({"A": s, "B": s}) + store["a"] = s + tm.assert_series_equal(store["a"], s) + + store["b"] = df + tm.assert_frame_equal(store["b"], df) + + def test_put_compression(self, setup_path): + + with ensure_clean_store(setup_path) as store: + df = tm.makeTimeDataFrame() + + store.put("c", df, format="table", complib="zlib") + tm.assert_frame_equal(store["c"], df) + + # can't compress if format='fixed' + with pytest.raises(ValueError): + store.put("b", df, format="fixed", complib="zlib") + + @td.skip_if_windows_python_3 + def test_put_compression_blosc(self, setup_path): + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + + # can't compress if format='fixed' + with pytest.raises(ValueError): + store.put("b", df, format="fixed", complib="blosc") + + store.put("c", df, format="table", complib="blosc") + tm.assert_frame_equal(store["c"], df) + + def test_complibs_default_settings(self, setup_path): + # GH15943 + df = tm.makeDataFrame() + + # Set complevel and check if complib is automatically set to + # default value + with ensure_clean_path(setup_path) as tmpfile: + df.to_hdf(tmpfile, "df", complevel=9) + result = pd.read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 9 + assert node.filters.complib == "zlib" + + # Set complib and check to see if compression is disabled + with ensure_clean_path(setup_path) as tmpfile: + df.to_hdf(tmpfile, "df", complib="zlib") + result = pd.read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 0 + assert node.filters.complib is 
None
+
+        # Check if not setting complib or complevel results in no compression
+        with ensure_clean_path(setup_path) as tmpfile:
+            df.to_hdf(tmpfile, "df")
+            result = pd.read_hdf(tmpfile, "df")
+            tm.assert_frame_equal(result, df)
+
+            with tables.open_file(tmpfile, mode="r") as h5file:
+                for node in h5file.walk_nodes(where="/df", classname="Leaf"):
+                    assert node.filters.complevel == 0
+                    assert node.filters.complib is None
+
+        # Check if file-defaults can be overridden on a per table basis
+        with ensure_clean_path(setup_path) as tmpfile:
+            store = pd.HDFStore(tmpfile)
+            store.append("dfc", df, complevel=9, complib="blosc")
+            store.append("df", df)
+            store.close()
+
+            with tables.open_file(tmpfile, mode="r") as h5file:
+                for node in h5file.walk_nodes(where="/df", classname="Leaf"):
+                    assert node.filters.complevel == 0
+                    assert node.filters.complib is None
+                for node in h5file.walk_nodes(where="/dfc", classname="Leaf"):
+                    assert node.filters.complevel == 9
+                    assert node.filters.complib == "blosc"
+
+    def test_complibs(self, setup_path):
+        # GH14478
+        df = tm.makeDataFrame()
+
+        # Building list of all complibs and complevels tuples
+        all_complibs = tables.filters.all_complibs
+        # Remove lzo if it's not available on this platform
+        if not tables.which_lib_version("lzo"):
+            all_complibs.remove("lzo")
+        # Remove bzip2 if it's not available on this platform
+        if not tables.which_lib_version("bzip2"):
+            all_complibs.remove("bzip2")
+
+        all_levels = range(0, 10)
+        all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels]
+
+        for (lib, lvl) in all_tests:
+            with ensure_clean_path(setup_path) as tmpfile:
+                gname = "foo"
+
+                # Write and read file to see if data is consistent
+                df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl)
+                result = pd.read_hdf(tmpfile, gname)
+                tm.assert_frame_equal(result, df)
+
+                # Open file and check metadata
+                # for correct amount of compression
+                h5table = tables.open_file(tmpfile, mode="r")
+                for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"):
+                    assert node.filters.complevel == lvl
+                    if lvl == 0:
+                        assert node.filters.complib is None
+                    else:
+                        assert node.filters.complib == lib
+                h5table.close()
+
+    def test_put_integer(self, setup_path):
+        # non-date, non-string index
+        df = DataFrame(np.random.randn(50, 100))
+        self._check_roundtrip(df, tm.assert_frame_equal, setup_path)
+
+    @xfail_non_writeable
+    def test_put_mixed_type(self, setup_path):
+        df = tm.makeTimeDataFrame()
+        df["obj1"] = "foo"
+        df["obj2"] = "bar"
+        df["bool1"] = df["A"] > 0
+        df["bool2"] = df["B"] > 0
+        df["bool3"] = True
+        df["int1"] = 1
+        df["int2"] = 2
+        df["timestamp1"] = Timestamp("20010102")
+        df["timestamp2"] = Timestamp("20010103")
+        df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0)
+        df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0)
+        df.loc[3:6, ["obj1"]] = np.nan
+        df = df._consolidate()._convert(datetime=True)
+
+        with ensure_clean_store(setup_path) as store:
+            _maybe_remove(store, "df")
+
+            # PerformanceWarning
+            with catch_warnings(record=True):
+                simplefilter("ignore", pd.errors.PerformanceWarning)
+                store.put("df", df)
+
+            expected = store.get("df")
+            tm.assert_frame_equal(expected, df)
+
+    @pytest.mark.filterwarnings(
+        "ignore:object name:tables.exceptions.NaturalNameWarning"
+    )
+    def test_append(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+
+            # this is allowed but almost always don't want to do it
+            # tables.NaturalNameWarning):
+            with catch_warnings(record=True):
+
+                df = tm.makeTimeDataFrame()
+                _maybe_remove(store, "df1")
+                store.append("df1", df[:10])
+                store.append("df1", df[10:])
+                tm.assert_frame_equal(store["df1"], df)
+
+                _maybe_remove(store, "df2")
+                store.put("df2", df[:10], format="table")
+                store.append("df2", df[10:])
+                tm.assert_frame_equal(store["df2"], df)
+
+                _maybe_remove(store, "df3")
+                store.append("/df3", df[:10])
+                store.append("/df3", df[10:])
+                tm.assert_frame_equal(store["df3"], df)
+
+                # this is allowed but almost always don't want to do it
+                # tables.NaturalNameWarning
+                _maybe_remove(store, "/df3 foo")
+                store.append("/df3 foo", df[:10])
+                store.append("/df3 foo", df[10:])
+                tm.assert_frame_equal(store["df3 foo"], df)
+
+                # dtype issues - mixed type in a single object column
+                df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
+                df["mixed_column"] = "testing"
+                df.loc[2, "mixed_column"] = np.nan
+                _maybe_remove(store, "df")
+                store.append("df", df)
+                tm.assert_frame_equal(store["df"], df)
+
+                # uints - test storage of uints
+                uint_data = DataFrame(
+                    {
+                        "u08": Series(
+                            np.random.randint(0, high=255, size=5), dtype=np.uint8
+                        ),
+                        "u16": Series(
+                            np.random.randint(0, high=65535, size=5), dtype=np.uint16
+                        ),
+                        "u32": Series(
+                            np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32
+                        ),
+                        "u64": Series(
+                            [2 ** 58, 2 ** 59, 2 ** 60, 2 ** 61, 2 ** 62],
+                            dtype=np.uint64,
+                        ),
+                    },
+                    index=np.arange(5),
+                )
+                _maybe_remove(store, "uints")
+                store.append("uints", uint_data)
+                tm.assert_frame_equal(store["uints"], uint_data)
+
+                # uints - test storage of uints in indexable columns
+                _maybe_remove(store, "uints")
+                # 64-bit indices not yet supported
+                store.append("uints", uint_data, data_columns=["u08", "u16", "u32"])
+                tm.assert_frame_equal(store["uints"], uint_data)
+
+    def test_append_series(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+
+            # basic
+            ss = tm.makeStringSeries()
+            ts = tm.makeTimeSeries()
+            ns = Series(np.arange(100))
+
+            store.append("ss", ss)
+            result = store["ss"]
+            tm.assert_series_equal(result, ss)
+            assert result.name is None
+
+            store.append("ts", ts)
+            result = store["ts"]
+            tm.assert_series_equal(result, ts)
+            assert result.name is None
+
+            ns.name = "foo"
+            store.append("ns", ns)
+            result = store["ns"]
+            tm.assert_series_equal(result, ns)
+            assert result.name == ns.name
+
+            # select on the values
+            expected = ns[ns > 60]
+            result = store.select("ns", "foo>60")
+            tm.assert_series_equal(result, expected)
+
+            # select on the index and values
+            expected = ns[(ns > 70) & (ns.index < 90)]
+            result = store.select("ns", "foo>70 and index<90")
+            tm.assert_series_equal(result, expected)
+
+            # multi-index
+            mi = DataFrame(np.random.randn(5, 1), columns=["A"])
+            mi["B"] = np.arange(len(mi))
+            mi["C"] = "foo"
+            mi.loc[3:5, "C"] = "bar"
+            mi.set_index(["C", "B"], inplace=True)
+            s = mi.stack()
+            s.index = s.index.droplevel(2)
+            store.append("mi", s)
+            tm.assert_series_equal(store["mi"], s)
+
+    def test_store_index_types(self, setup_path):
+        # GH5386
+        # test storing various index types
+
+        with ensure_clean_store(setup_path) as store:
+
+            def check(format, index):
+                df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
+                df.index = index(len(df))
+
+                _maybe_remove(store, "df")
+                store.put("df", df, format=format)
+                assert_frame_equal(df, store["df"])
+
+            for index in [
+                tm.makeFloatIndex,
+                tm.makeStringIndex,
+                tm.makeIntIndex,
+                tm.makeDateIndex,
+            ]:
+
+                check("table", index)
+                check("fixed", index)
+
+            # period index currently broken for table
+            # see GH7796 FIXME
+            check("fixed", tm.makePeriodIndex)
+            # 
check('table',tm.makePeriodIndex) + + # unicode + index = tm.makeUnicodeIndex + check("table", index) + check("fixed", index) + + @pytest.mark.skipif( + not is_platform_little_endian(), reason="reason platform is not little endian" + ) + def test_encoding(self, setup_path): + + with ensure_clean_store(setup_path) as store: + df = DataFrame(dict(A="foo", B="bar"), index=range(5)) + df.loc[2, "A"] = np.nan + df.loc[3, "B"] = np.nan + _maybe_remove(store, "df") + store.append("df", df, encoding="ascii") + tm.assert_frame_equal(store["df"], df) + + expected = df.reindex(columns=["A"]) + result = store.select("df", Term("columns=A", encoding="ascii")) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "val", + [ + [b"E\xc9, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"a", b"b", b"c"], + [b"EE, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], + [b"", b"a", b"b", b"c"], + [b"\xf8\xfc", b"a", b"b", b"c"], + [b"A\xf8\xfc", b"", b"a", b"b", b"c"], + [np.nan, b"", b"b", b"c"], + [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], + ], + ) + @pytest.mark.parametrize("dtype", ["category", object]) + def test_latin_encoding(self, setup_path, dtype, val): + enc = "latin-1" + nan_rep = "" + key = "data" + + val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] + ser = pd.Series(val, dtype=dtype) + + with ensure_clean_path(setup_path) as store: + ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) + retr = read_hdf(store, key) + + s_nan = ser.replace(nan_rep, np.nan) + + if is_categorical_dtype(s_nan): + assert is_categorical_dtype(retr) + assert_series_equal(s_nan, retr, check_dtype=False, check_categorical=False) + else: + assert_series_equal(s_nan, retr) + + # FIXME: don't leave commented-out + # fails: + # for x in examples: + # roundtrip(s, nan_rep=b'\xf8\xfc') + + def test_append_some_nans(self, setup_path): + + with ensure_clean_store(setup_path) as store: + df = DataFrame( + { + "A": Series(np.random.randn(20)).astype("int32"), + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) + # some nans + _maybe_remove(store, "df1") + df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan + store.append("df1", df[:10]) + store.append("df1", df[10:]) + tm.assert_frame_equal(store["df1"], df) + + # first column + df1 = df.copy() + df1.loc[:, "A1"] = np.nan + _maybe_remove(store, "df1") + store.append("df1", df1[:10]) + store.append("df1", df1[10:]) + tm.assert_frame_equal(store["df1"], df1) + + # 2nd column + df2 = df.copy() + df2.loc[:, "A2"] = np.nan + _maybe_remove(store, "df2") + store.append("df2", df2[:10]) + store.append("df2", df2[10:]) + tm.assert_frame_equal(store["df2"], df2) + + # datetimes + df3 = df.copy() + df3.loc[:, "E"] = np.nan + _maybe_remove(store, "df3") + store.append("df3", df3[:10]) + store.append("df3", df3[10:]) + tm.assert_frame_equal(store["df3"], df3) + + def test_append_all_nans(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + {"A1": np.random.randn(20), "A2": np.random.randn(20)}, + index=np.arange(20), + ) + df.loc[0:15, :] = np.nan + + # nan some entire rows (dropna=True) + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df[-4:]) + + # nan some entire rows (dropna=False) + _maybe_remove(store, "df2") + store.append("df2", df[:10], 
dropna=False)
+            store.append("df2", df[10:], dropna=False)
+            tm.assert_frame_equal(store["df2"], df)
+
+            # tests the option io.hdf.dropna_table
+            pd.set_option("io.hdf.dropna_table", False)
+            _maybe_remove(store, "df3")
+            store.append("df3", df[:10])
+            store.append("df3", df[10:])
+            tm.assert_frame_equal(store["df3"], df)
+
+            pd.set_option("io.hdf.dropna_table", True)
+            _maybe_remove(store, "df4")
+            store.append("df4", df[:10])
+            store.append("df4", df[10:])
+            tm.assert_frame_equal(store["df4"], df[-4:])
+
+            # nan some entire rows (strings are still written!)
+            df = DataFrame(
+                {
+                    "A1": np.random.randn(20),
+                    "A2": np.random.randn(20),
+                    "B": "foo",
+                    "C": "bar",
+                },
+                index=np.arange(20),
+            )
+
+            df.loc[0:15, :] = np.nan
+
+            _maybe_remove(store, "df")
+            store.append("df", df[:10], dropna=True)
+            store.append("df", df[10:], dropna=True)
+            tm.assert_frame_equal(store["df"], df)
+
+            _maybe_remove(store, "df2")
+            store.append("df2", df[:10], dropna=False)
+            store.append("df2", df[10:], dropna=False)
+            tm.assert_frame_equal(store["df2"], df)
+
+            # nan some entire rows (but since we have dates they are still
+            # written!)
+            df = DataFrame(
+                {
+                    "A1": np.random.randn(20),
+                    "A2": np.random.randn(20),
+                    "B": "foo",
+                    "C": "bar",
+                    "D": Timestamp("20010101"),
+                    "E": datetime.datetime(2001, 1, 2, 0, 0),
+                },
+                index=np.arange(20),
+            )
+
+            df.loc[0:15, :] = np.nan
+
+            _maybe_remove(store, "df")
+            store.append("df", df[:10], dropna=True)
+            store.append("df", df[10:], dropna=True)
+            tm.assert_frame_equal(store["df"], df)
+
+            _maybe_remove(store, "df2")
+            store.append("df2", df[:10], dropna=False)
+            store.append("df2", df[10:], dropna=False)
+            tm.assert_frame_equal(store["df2"], df)
+
+        # Test to make sure defaults are to not drop.
+        # Corresponding to Issue 9382
+        df_with_missing = DataFrame(
+            {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}
+        )
+
+        with ensure_clean_path(setup_path) as path:
+            df_with_missing.to_hdf(path, "df_with_missing", format="table")
+            reloaded = read_hdf(path, "df_with_missing")
+            tm.assert_frame_equal(df_with_missing, reloaded)
+
+    def test_read_missing_key_close_store(self, setup_path):
+        # GH 25766
+        with ensure_clean_path(setup_path) as path:
+            df = pd.DataFrame({"a": range(2), "b": range(2)})
+            df.to_hdf(path, "k1")
+
+            with pytest.raises(KeyError, match="'No object named k2 in the file'"):
+                pd.read_hdf(path, "k2")
+
+            # smoke test to test that file is properly closed after
+            # read with KeyError before another write
+            df.to_hdf(path, "k2")
+
+    def test_append_frame_column_oriented(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+
+            # column oriented
+            df = tm.makeTimeDataFrame()
+            _maybe_remove(store, "df1")
+            store.append("df1", df.iloc[:, :2], axes=["columns"])
+            store.append("df1", df.iloc[:, 2:])
+            tm.assert_frame_equal(store["df1"], df)
+
+            result = store.select("df1", "columns=A")
+            expected = df.reindex(columns=["A"])
+            tm.assert_frame_equal(expected, result)
+
+            # selection on the non-indexable
+            result = store.select("df1", ("columns=A", "index=df.index[0:4]"))
+            expected = df.reindex(columns=["A"], index=df.index[0:4])
+            tm.assert_frame_equal(expected, result)
+
+            # this isn't supported
+            with pytest.raises(TypeError):
+                store.select("df1", "columns=A and index>df.index[4]")
+
+    def test_append_with_different_block_ordering(self, setup_path):
+
+        # GH 4096; using same frames, but different block orderings
+        with ensure_clean_store(setup_path) as store:
+
+            for i in range(10):
+
+                df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
+                df["index"] = range(10)
+                df["index"] += i * 10
+                df["int64"] = Series([1] * len(df), dtype="int64")
+                df["int16"] = Series([1] * len(df), dtype="int16")
+
+                if i % 2 == 0:
+                    del df["int64"]
+                    df["int64"] = Series([1] * len(df), dtype="int64")
+                if i % 3 == 0:
+                    a = df.pop("A")
+                    df["A"] = a
+
+                df.set_index("index", inplace=True)
+
+                store.append("df", df)
+
+        # test a different ordering but with more fields (like invalid
+        # combination)
+        with ensure_clean_store(setup_path) as store:
+
+            df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64")
+            df["int64"] = Series([1] * len(df), dtype="int64")
+            df["int16"] = Series([1] * len(df), dtype="int16")
+            store.append("df", df)
+
+            # store additional fields in different blocks
+            df["int16_2"] = Series([1] * len(df), dtype="int16")
+            with pytest.raises(ValueError):
+                store.append("df", df)
+
+            # store multiple additional fields in different blocks
+            df["float_3"] = Series([1.0] * len(df), dtype="float64")
+            with pytest.raises(ValueError):
+                store.append("df", df)
+
+    def test_append_with_strings(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+            with catch_warnings(record=True):
+
+                def check_col(key, name, size):
+                    assert (
+                        getattr(store.get_storer(key).table.description, name).itemsize
+                        == size
+                    )
+
+                # avoid truncation on elements
+                df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
+                store.append("df_big", df)
+                tm.assert_frame_equal(store.select("df_big"), df)
+                check_col("df_big", "values_block_1", 15)
+
+                # appending smaller string ok
+                df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
+                store.append("df_big", df2)
+                expected = concat([df, df2])
+                tm.assert_frame_equal(store.select("df_big"), expected)
+                check_col("df_big", "values_block_1", 15)
+
+                # avoid truncation on elements
+                df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
+                store.append("df_big2", df, min_itemsize={"values": 50})
+                tm.assert_frame_equal(store.select("df_big2"), df)
+                check_col("df_big2", "values_block_1", 50)
+
+                # bigger string on next append
+                store.append("df_new", df)
+                df_new = DataFrame(
+                    [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]]
+                )
+                with pytest.raises(ValueError):
+                    store.append("df_new", df_new)
+
+                # min_itemsize on Series index (GH 11412)
+                df = tm.makeMixedDataFrame().set_index("C")
+                store.append("ss", df["B"], min_itemsize={"index": 4})
+                tm.assert_series_equal(store.select("ss"), df["B"])
+
+                # same as above, with data_columns=True
+                store.append(
+                    "ss2", df["B"], data_columns=True, min_itemsize={"index": 4}
+                )
+                tm.assert_series_equal(store.select("ss2"), df["B"])
+
+                # min_itemsize in index without appending (GH 10381)
+                store.put("ss3", df, format="table", min_itemsize={"index": 6})
+                # just make sure there is a longer string:
+                df2 = df.copy().reset_index().assign(C="longer").set_index("C")
+                store.append("ss3", df2)
+                tm.assert_frame_equal(store.select("ss3"), pd.concat([df, df2]))
+
+                # same as above, with a Series
+                store.put("ss4", df["B"], format="table", min_itemsize={"index": 6})
+                store.append("ss4", df2["B"])
+                tm.assert_series_equal(
+                    store.select("ss4"), pd.concat([df["B"], df2["B"]])
+                )
+
+                # with nans
+                _maybe_remove(store, "df")
+                df = tm.makeTimeDataFrame()
+                df["string"] = "foo"
+                df.loc[1:4, "string"] = np.nan
+                df["string2"] = "bar"
+                df.loc[4:8, "string2"] = np.nan
+                df["string3"] = "bah"
+                df.loc[1:, "string3"] = np.nan
+                store.append("df", df)
+                result = store.select("df")
+                tm.assert_frame_equal(result, df)
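Aside on the string-width cases above: a table column's width is fixed by the first append, so either reserve room up front with min_itemsize, or a later, longer string raises ValueError. A minimal sketch of that rule, assuming pandas with PyTables installed; the path and key are illustrative:

    import os
    import pandas as pd

    path = "strings_demo.h5"  # illustrative scratch file
    short = pd.DataFrame({"x": ["ab", "cd"]})
    long = pd.DataFrame({"x": ["a considerably longer string"]})

    with pd.HDFStore(path, mode="w") as store:
        # reserve 50 bytes for the values block so later strings fit
        store.append("df", short, min_itemsize={"values": 50})
        store.append("df", long)  # ok: within the reserved width

    with pd.HDFStore(path, mode="w") as store:
        store.append("df", short)  # width inferred from the first append
        try:
            store.append("df", long)  # wider than the inferred column
        except ValueError:
            pass  # expected: the column width cannot grow after creation
    os.remove(path)
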
+
+        with ensure_clean_store(setup_path) as store:
+
+            def check_col(key, name, size):
+                assert (
+                    getattr(store.get_storer(key).table.description, name).itemsize
+                    == size
+                )
+
+            df = DataFrame(dict(A="foo", B="bar"), index=range(10))
+
+            # a min_itemsize that creates a data_column
+            _maybe_remove(store, "df")
+            store.append("df", df, min_itemsize={"A": 200})
+            check_col("df", "A", 200)
+            assert store.get_storer("df").data_columns == ["A"]
+
+            # a min_itemsize that creates a data_column2
+            _maybe_remove(store, "df")
+            store.append("df", df, data_columns=["B"], min_itemsize={"A": 200})
+            check_col("df", "A", 200)
+            assert store.get_storer("df").data_columns == ["B", "A"]
+
+            # a min_itemsize that creates a data_column2
+            _maybe_remove(store, "df")
+            store.append("df", df, data_columns=["B"], min_itemsize={"values": 200})
+            check_col("df", "B", 200)
+            check_col("df", "values_block_0", 200)
+            assert store.get_storer("df").data_columns == ["B"]
+
+            # infer the .typ on subsequent appends
+            _maybe_remove(store, "df")
+            store.append("df", df[:5], min_itemsize=200)
+            store.append("df", df[5:], min_itemsize=200)
+            tm.assert_frame_equal(store["df"], df)
+
+            # invalid min_itemsize keys
+            df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
+            _maybe_remove(store, "df")
+            with pytest.raises(ValueError):
+                store.append("df", df, min_itemsize={"foo": 20, "foobar": 20})
+
+    def test_append_with_empty_string(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+
+            # with all empty strings (GH 12242)
+            df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
+            store.append("df", df[:-1], min_itemsize={"x": 1})
+            store.append("df", df[-1:], min_itemsize={"x": 1})
+            tm.assert_frame_equal(store.select("df"), df)
+
+    def test_to_hdf_with_min_itemsize(self, setup_path):
+
+        with ensure_clean_path(setup_path) as path:
+
+            # min_itemsize in index with to_hdf (GH 10381)
+            df = tm.makeMixedDataFrame().set_index("C")
+            df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6})
+            # just make sure there is a longer string:
+            df2 = df.copy().reset_index().assign(C="longer").set_index("C")
+            df2.to_hdf(path, "ss3", append=True, format="table")
+            tm.assert_frame_equal(pd.read_hdf(path, "ss3"), pd.concat([df, df2]))
+
+            # same as above, with a Series
+            df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6})
+            df2["B"].to_hdf(path, "ss4", append=True, format="table")
+            tm.assert_series_equal(
+                pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]])
+            )
+
+    @pytest.mark.parametrize(
+        "format", [pytest.param("fixed", marks=xfail_non_writeable), "table"]
+    )
+    def test_to_hdf_errors(self, format, setup_path):
+
+        data = ["\ud800foo"]
+        ser = pd.Series(data, index=pd.Index(data))
+        with ensure_clean_path(setup_path) as path:
+            # GH 20835
+            ser.to_hdf(path, "table", format=format, errors="surrogatepass")
+
+            result = pd.read_hdf(path, "table", errors="surrogatepass")
+            tm.assert_series_equal(result, ser)
+
+    def test_append_with_data_columns(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+            df = tm.makeTimeDataFrame()
+            df.iloc[0, df.columns.get_loc("B")] = 1.0
+            _maybe_remove(store, "df")
+            store.append("df", df[:2], data_columns=["B"])
+            store.append("df", df[2:])
+            tm.assert_frame_equal(store["df"], df)
+
+            # check that we have indices created
+            assert store._handle.root.df.table.cols.index.is_indexed is True
+            assert store._handle.root.df.table.cols.B.is_indexed is True
+
+            # data column searching
+            result = store.select("df", "B>0")
+
expected = df[df.B > 0] + tm.assert_frame_equal(result, expected) + + # data column searching (with an indexable and a data_columns) + result = store.select("df", "B>0 and index>df.index[3]") + df_new = df.reindex(index=df.index[4:]) + expected = df_new[df_new.B > 0] + tm.assert_frame_equal(result, expected) + + # data column selection with a string data_column + df_new = df.copy() + df_new["string"] = "foo" + df_new.loc[1:4, "string"] = np.nan + df_new.loc[5:6, "string"] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"]) + result = store.select("df", "string='foo'") + expected = df_new[df_new.string == "foo"] + tm.assert_frame_equal(result, expected) + + # using min_itemsize and a data column + def check_col(key, name, size): + assert ( + getattr(store.get_storer(key).table.description, name).itemsize + == size + ) + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + store.append( + "df", df_new, data_columns=["string"], min_itemsize={"string": 30} + ) + check_col("df", "string", 30) + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"], min_itemsize=30) + check_col("df", "string", 30) + _maybe_remove(store, "df") + store.append( + "df", df_new, data_columns=["string"], min_itemsize={"values": 30} + ) + check_col("df", "string", 30) + + with ensure_clean_store(setup_path) as store: + df_new["string2"] = "foobarbah" + df_new["string_block1"] = "foobarbah1" + df_new["string_block2"] = "foobarbah2" + _maybe_remove(store, "df") + store.append( + "df", + df_new, + data_columns=["string", "string2"], + min_itemsize={"string": 30, "string2": 40, "values": 50}, + ) + check_col("df", "string", 30) + check_col("df", "string2", 40) + check_col("df", "values_block_1", 50) + + with ensure_clean_store(setup_path) as store: + # multiple data columns + df_new = df.copy() + df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 + df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 + df_new["string"] = "foo" + + sl = df_new.columns.get_loc("string") + df_new.iloc[1:4, sl] = np.nan + df_new.iloc[5:6, sl] = "bar" + + df_new["string2"] = "foo" + sl = df_new.columns.get_loc("string2") + df_new.iloc[2:5, sl] = np.nan + df_new.iloc[7:8, sl] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) + result = store.select( + "df", "string='foo' and string2='foo' and A>0 and B<0" + ) + expected = df_new[ + (df_new.string == "foo") + & (df_new.string2 == "foo") + & (df_new.A > 0) + & (df_new.B < 0) + ] + tm.assert_frame_equal(result, expected, check_index_type=False) + + # yield an empty frame + result = store.select("df", "string='foo' and string2='cool'") + expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] + tm.assert_frame_equal(result, expected, check_index_type=False) + + with ensure_clean_store(setup_path) as store: + # doc example + df_dc = df.copy() + df_dc["string"] = "foo" + df_dc.loc[4:6, "string"] = np.nan + df_dc.loc[7:9, "string"] = "bar" + df_dc["string2"] = "cool" + df_dc["datetime"] = Timestamp("20010102") + df_dc = df_dc._convert(datetime=True) + df_dc.loc[3:5, ["A", "B", "datetime"]] = np.nan + + _maybe_remove(store, "df_dc") + store.append( + "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] + ) + result = store.select("df_dc", "B>0") + + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected, check_index_type=False) + + result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) + 
expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] + tm.assert_frame_equal(result, expected, check_index_type=False) + + with ensure_clean_store(setup_path) as store: + # doc example part 2 + np.random.seed(1234) + index = date_range("1/1/2000", periods=8) + df_dc = DataFrame( + np.random.randn(8, 3), index=index, columns=["A", "B", "C"] + ) + df_dc["string"] = "foo" + df_dc.loc[4:6, "string"] = np.nan + df_dc.loc[7:9, "string"] = "bar" + df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() + df_dc["string2"] = "cool" + + # on-disk operations + store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) + + result = store.select("df_dc", "B>0") + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected) + + result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] + tm.assert_frame_equal(result, expected) + + def test_create_table_index(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + with catch_warnings(record=True): + + def col(t, column): + return getattr(store.get_storer(t).table.cols, column) + + # data columns + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df["string2"] = "bar" + store.append("f", df, data_columns=["string", "string2"]) + assert col("f", "index").is_indexed is True + assert col("f", "string").is_indexed is True + assert col("f", "string2").is_indexed is True + + # specify index=columns + store.append( + "f2", df, index=["string"], data_columns=["string", "string2"] + ) + assert col("f2", "index").is_indexed is False + assert col("f2", "string").is_indexed is True + assert col("f2", "string2").is_indexed is False + + # try to index a non-table + _maybe_remove(store, "f2") + store.put("f2", df) + with pytest.raises(TypeError): + store.create_table_index("f2") + + def test_append_hierarchical(self, setup_path): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) + + with ensure_clean_store(setup_path) as store: + store.append("mi", df) + result = store.select("mi") + tm.assert_frame_equal(result, df) + + # GH 3748 + result = store.select("mi", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + with ensure_clean_path("test.hdf") as path: + df.to_hdf(path, "df", format="table") + result = read_hdf(path, "df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + def test_column_multiindex(self, setup_path): + # GH 4710 + # recreate multi-indexes properly + + index = MultiIndex.from_tuples( + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"] + ) + df = DataFrame(np.arange(12).reshape(3, 4), columns=index) + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) + + with ensure_clean_store(setup_path) as store: + + store.put("df", df) + tm.assert_frame_equal( + store["df"], expected, check_index_type=True, check_column_type=True + ) + + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) + + with pytest.raises(ValueError): + store.put("df2", df, format="table", data_columns=["A"]) + with 
pytest.raises(ValueError): + store.put("df3", df, format="table", data_columns=True) + + # appending multi-column on existing table (see GH 6167) + with ensure_clean_store(setup_path) as store: + store.append("df2", df) + store.append("df2", df) + + tm.assert_frame_equal(store["df2"], concat((df, df))) + + # non_index_axes name + df = DataFrame( + np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo") + ) + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) + + with ensure_clean_store(setup_path) as store: + + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) + + def test_store_multiindex(self, setup_path): + + # validate multi-index names + # GH 5527 + with ensure_clean_store(setup_path) as store: + + def make_index(names=None): + return MultiIndex.from_tuples( + [ + (datetime.datetime(2013, 12, d), s, t) + for d in range(1, 3) + for s in range(2) + for t in range(3) + ], + names=names, + ) + + # no names + _maybe_remove(store, "df") + df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index()) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + # partial names + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", None, None]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + # series + _maybe_remove(store, "s") + s = Series(np.zeros(12), index=make_index(["date", None, None])) + store.append("s", s) + xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) + tm.assert_series_equal(store.select("s"), xp) + + # dup with column + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "a", "t"]), + ) + with pytest.raises(ValueError): + store.append("df", df) + + # dup within level + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "date", "date"]), + ) + with pytest.raises(ValueError): + store.append("df", df) + + # fully names + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "s", "t"]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + def test_select_columns_in_where(self, setup_path): + + # GH 6169 + # recreate multi-indexes when columns is passed + # in the `where` argument + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo_name", "bar_name"], + ) + + # With a DataFrame + df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table") + expected = df[["A"]] + + tm.assert_frame_equal(store.select("df", columns=["A"]), expected) + + tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected) + + # With a Series + s = Series(np.random.randn(10), index=index, name="A") + with ensure_clean_store(setup_path) as store: + store.put("s", s, format="table") + tm.assert_series_equal(store.select("s", where="columns=['A']"), s) + + def test_mi_data_columns(self, setup_path): + # GH 14435 + idx = pd.MultiIndex.from_arrays( + [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] + ) + df = pd.DataFrame({"a": [1.1, 
+            1.2, 1.3, 1.4, 1.5]}, index=idx)
+
+        with ensure_clean_store(setup_path) as store:
+            store.append("df", df, data_columns=True)
+
+            actual = store.select("df", where="id == 1")
+            expected = df.iloc[[1], :]
+            tm.assert_frame_equal(actual, expected)
+
+    def test_pass_spec_to_storer(self, setup_path):
+
+        df = tm.makeDataFrame()
+
+        with ensure_clean_store(setup_path) as store:
+            store.put("df", df)
+            with pytest.raises(TypeError):
+                store.select("df", columns=["A"])
+            with pytest.raises(TypeError):
+                store.select("df", where=[("columns=A")])
+
+    @xfail_non_writeable
+    def test_append_misc(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+            df = tm.makeDataFrame()
+            store.append("df", df, chunksize=1)
+            result = store.select("df")
+            tm.assert_frame_equal(result, df)
+
+            store.append("df1", df, expectedrows=10)
+            result = store.select("df1")
+            tm.assert_frame_equal(result, df)
+
+        # more chunksize in append tests
+        def check(obj, comparator):
+            for c in [10, 200, 1000]:
+                with ensure_clean_store(setup_path, mode="w") as store:
+                    store.append("obj", obj, chunksize=c)
+                    result = store.select("obj")
+                    comparator(result, obj)
+
+        df = tm.makeDataFrame()
+        df["string"] = "foo"
+        df["float322"] = 1.0
+        df["float322"] = df["float322"].astype("float32")
+        df["bool"] = df["float322"] > 0
+        df["time1"] = Timestamp("20130101")
+        df["time2"] = Timestamp("20130102")
+        check(df, tm.assert_frame_equal)
+
+        # empty frame, GH4273
+        with ensure_clean_store(setup_path) as store:
+
+            # 0 len
+            df_empty = DataFrame(columns=list("ABC"))
+            store.append("df", df_empty)
+            with pytest.raises(KeyError, match="'No object named df in the file'"):
+                store.select("df")
+
+            # repeated append of 0/non-zero frames
+            df = DataFrame(np.random.rand(10, 3), columns=list("ABC"))
+            store.append("df", df)
+            assert_frame_equal(store.select("df"), df)
+            store.append("df", df_empty)
+            assert_frame_equal(store.select("df"), df)
+
+            # store
+            df = DataFrame(columns=list("ABC"))
+            store.put("df2", df)
+            assert_frame_equal(store.select("df2"), df)
+
+    def test_append_raise(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+
+            # test append with invalid input to get good error messages
+
+            # list in column
+            df = tm.makeDataFrame()
+            df["invalid"] = [["a"]] * len(df)
+            assert df.dtypes["invalid"] == np.object_
+            with pytest.raises(TypeError):
+                store.append("df", df)
+
+            # multiple invalid columns
+            df["invalid2"] = [["a"]] * len(df)
+            df["invalid3"] = [["a"]] * len(df)
+            with pytest.raises(TypeError):
+                store.append("df", df)
+
+            # datetime with embedded nans as object
+            df = tm.makeDataFrame()
+            s = Series(datetime.datetime(2001, 1, 2), index=df.index)
+            s = s.astype(object)
+            s[0:5] = np.nan
+            df["invalid"] = s
+            assert df.dtypes["invalid"] == np.object_
+            with pytest.raises(TypeError):
+                store.append("df", df)
+
+            # directly ndarray
+            with pytest.raises(TypeError):
+                store.append("df", np.arange(10))
+
+            # series directly
+            with pytest.raises(TypeError):
+                store.append("df", Series(np.arange(10)))
+
+            # appending an incompatible table
+            df = tm.makeDataFrame()
+            store.append("df", df)
+
+            df["foo"] = "foo"
+            with pytest.raises(ValueError):
+                store.append("df", df)
+
+    def test_table_index_incompatible_dtypes(self, setup_path):
+        df1 = DataFrame({"a": [1, 2, 3]})
+        df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3))
+
+        with ensure_clean_store(setup_path) as store:
+            store.put("frame", df1, format="table")
+            with pytest.raises(TypeError):
+                store.put("frame", df2, format="table", append=True)
+
+    def test_table_values_dtypes_roundtrip(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+            df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
+            store.append("df_f8", df1)
+            assert_series_equal(df1.dtypes, store["df_f8"].dtypes)
+
+            df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8")
+            store.append("df_i8", df2)
+            assert_series_equal(df2.dtypes, store["df_i8"].dtypes)
+
+            # incompatible dtype
+            with pytest.raises(ValueError):
+                store.append("df_i8", df1)
+
+            # check creation/storage/retrieval of float32 (a bit hacky to
+            # actually create them though)
+            df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"])
+            store.append("df_f4", df1)
+            assert_series_equal(df1.dtypes, store["df_f4"].dtypes)
+            assert df1.dtypes[0] == "float32"
+
+            # check with mixed dtypes
+            df1 = DataFrame(
+                {
+                    c: Series(np.random.randint(5), dtype=c)
+                    for c in ["float32", "float64", "int32", "int64", "int16", "int8"]
+                }
+            )
+            df1["string"] = "foo"
+            df1["float322"] = 1.0
+            df1["float322"] = df1["float322"].astype("float32")
+            df1["bool"] = df1["float32"] > 0
+            df1["time1"] = Timestamp("20130101")
+            df1["time2"] = Timestamp("20130102")
+
+            store.append("df_mixed_dtypes1", df1)
+            result = store.select("df_mixed_dtypes1").dtypes.value_counts()
+            result.index = [str(i) for i in result.index]
+            expected = Series(
+                {
+                    "float32": 2,
+                    "float64": 1,
+                    "int32": 1,
+                    "bool": 1,
+                    "int16": 1,
+                    "int8": 1,
+                    "int64": 1,
+                    "object": 1,
+                    "datetime64[ns]": 2,
+                }
+            )
+            result = result.sort_index()
+            expected = expected.sort_index()
+            tm.assert_series_equal(result, expected)
+
+    def test_table_mixed_dtypes(self, setup_path):
+
+        # frame
+        df = tm.makeDataFrame()
+        df["obj1"] = "foo"
+        df["obj2"] = "bar"
+        df["bool1"] = df["A"] > 0
+        df["bool2"] = df["B"] > 0
+        df["bool3"] = True
+        df["int1"] = 1
+        df["int2"] = 2
+        df["timestamp1"] = Timestamp("20010102")
+        df["timestamp2"] = Timestamp("20010103")
+        df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0)
+        df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0)
+        df.loc[3:6, ["obj1"]] = np.nan
+        df = df._consolidate()._convert(datetime=True)
+
+        with ensure_clean_store(setup_path) as store:
+            store.append("df1_mixed", df)
+            tm.assert_frame_equal(store.select("df1_mixed"), df)
+
+    def test_unimplemented_dtypes_table_columns(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+
+            dtypes = [("date", datetime.date(2001, 1, 2))]
+
+            # currently not supported dtypes ####
+            for n, f in dtypes:
+                df = tm.makeDataFrame()
+                df[n] = f
+                with pytest.raises(TypeError):
+                    store.append("df1_{n}".format(n=n), df)
+
+        # frame
+        df = tm.makeDataFrame()
+        df["obj1"] = "foo"
+        df["obj2"] = "bar"
+        df["datetime1"] = datetime.date(2001, 1, 2)
+        df = df._consolidate()._convert(datetime=True)
+
+        with ensure_clean_store(setup_path) as store:
+            # this fails because we have a date in the object block......
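+            # (datetime.date values in an object column have no supported
+            # table column representation, so the append is expected to raise)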
+            with pytest.raises(TypeError):
+                store.append("df_unimplemented", df)
+
+    @xfail_non_writeable
+    @pytest.mark.skipif(
+        LooseVersion(np.__version__) == LooseVersion("1.15.0"),
+        reason=(
+            "Skipping pytables test when numpy version is "
+            "exactly equal to 1.15.0: gh-22098"
+        ),
+    )
+    def test_calendar_roundtrip_issue(self, setup_path):
+
+        # 8591
+        # doc example from tseries holiday section
+        weekmask_egypt = "Sun Mon Tue Wed Thu"
+        holidays = [
+            "2012-05-01",
+            datetime.datetime(2013, 5, 1),
+            np.datetime64("2014-05-01"),
+        ]
+        bday_egypt = pd.offsets.CustomBusinessDay(
+            holidays=holidays, weekmask=weekmask_egypt
+        )
+        dt = datetime.datetime(2013, 4, 30)
+        dts = date_range(dt, periods=5, freq=bday_egypt)
+
+        s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split()))
+
+        with ensure_clean_store(setup_path) as store:
+
+            store.put("fixed", s)
+            result = store.select("fixed")
+            assert_series_equal(result, s)
+
+            store.append("table", s)
+            result = store.select("table")
+            assert_series_equal(result, s)
+
+    def test_roundtrip_tz_aware_index(self, setup_path):
+        # GH 17618
+        time = pd.Timestamp("2000-01-01 01:00:00", tz="US/Eastern")
+        df = pd.DataFrame(data=[0], index=[time])
+
+        with ensure_clean_store(setup_path) as store:
+            store.put("frame", df, format="fixed")
+            recons = store["frame"]
+            tm.assert_frame_equal(recons, df)
+            assert recons.index[0].value == 946706400000000000
+
+    def test_append_with_timedelta(self, setup_path):
+        # GH 3577
+        # append timedelta
+
+        df = DataFrame(
+            dict(
+                A=Timestamp("20130101"),
+                B=[
+                    Timestamp("20130101") + timedelta(days=i, seconds=10)
+                    for i in range(10)
+                ],
+            )
+        )
+        df["C"] = df["A"] - df["B"]
+        df.loc[3:5, "C"] = np.nan
+
+        with ensure_clean_store(setup_path) as store:
+
+            # table
+            _maybe_remove(store, "df")
+            store.append("df", df, data_columns=True)
+            result = store.select("df")
+            assert_frame_equal(result, df)
+
+            result = store.select("df", where="C<100000")
+            assert_frame_equal(result, df)
+
+            result = store.select("df", where="C")
+
+        # from the docs
+        with ensure_clean_path(setup_path) as path:
+            dfq = DataFrame(
+                np.random.randn(10, 4),
+                columns=list("ABCD"),
+                index=date_range("20130101", periods=10),
+            )
+            dfq.to_hdf(path, "dfq", format="table", data_columns=True)
+
+            # check ok
+            read_hdf(
+                path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']"
+            )
+            read_hdf(path, "dfq", where="A>0 or C>0")
+
+        # catch the invalid reference
+        with ensure_clean_path(setup_path) as path:
+            dfq = DataFrame(
+                np.random.randn(10, 4),
+                columns=list("ABCD"),
+                index=date_range("20130101", periods=10),
+            )
+            dfq.to_hdf(path, "dfq", format="table")
+
+            with pytest.raises(ValueError):
+                read_hdf(path, "dfq", where="A>0 or C>0")
+
+    def test_same_name_scoping(self, setup_path):
+
+        with ensure_clean_store(setup_path) as store:
+
+            import pandas as pd
+
+            df = DataFrame(
+                np.random.randn(20, 2), index=pd.date_range("20130101", periods=20)
+            )
+            store.put("df", df, format="table")
+            expected = df[df.index > pd.Timestamp("20130105")]
+
+            import datetime  # noqa
+
+            result = store.select("df", "index>datetime.datetime(2013,1,5)")
+            assert_frame_equal(result, expected)
+
+            from datetime import datetime  # noqa
+
+            # technically an error, but allow it
+            result = store.select("df", "index>datetime.datetime(2013,1,5)")
+            assert_frame_equal(result, expected)
+
+            result = store.select("df", "index>datetime(2013,1,5)")
+            assert_frame_equal(result, expected)
+
+    def test_series(self, setup_path):
+
+        s = tm.makeStringSeries()
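+        # each _check_roundtrip below writes the object to a fresh store
+        # and compares what is read back against the original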
+        self._check_roundtrip(s, tm.assert_series_equal, path=setup_path)
+
+        ts = tm.makeTimeSeries()
+        self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path)
+
+        ts2 = Series(ts.index, Index(ts.index, dtype=object))
+        self._check_roundtrip(ts2, tm.assert_series_equal, path=setup_path)
+
+        ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object))
+        self._check_roundtrip(
+            ts3, tm.assert_series_equal, path=setup_path, check_index_type=False
+        )
+
+    def test_float_index(self, setup_path):
+
+        # GH #454
+        index = np.random.randn(10)
+        s = Series(np.random.randn(10), index=index)
+        self._check_roundtrip(s, tm.assert_series_equal, path=setup_path)
+
+    @xfail_non_writeable
+    def test_tuple_index(self, setup_path):
+
+        # GH #492
+        col = np.arange(10)
+        idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)]
+        data = np.random.randn(30).reshape((3, 10))
+        DF = DataFrame(data, index=idx, columns=col)
+
+        with catch_warnings(record=True):
+            simplefilter("ignore", pd.errors.PerformanceWarning)
+            self._check_roundtrip(DF, tm.assert_frame_equal, path=setup_path)
+
+    @xfail_non_writeable
+    @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
+    def test_index_types(self, setup_path):
+
+        with catch_warnings(record=True):
+            values = np.random.randn(2)
+
+            func = lambda l, r: tm.assert_series_equal(
+                l, r, check_dtype=True, check_index_type=True, check_series_type=True
+            )
+
+        with catch_warnings(record=True):
+            ser = Series(values, [0, "y"])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+        with catch_warnings(record=True):
+            ser = Series(values, [datetime.datetime.today(), 0])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+        with catch_warnings(record=True):
+            ser = Series(values, ["y", 0])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+        with catch_warnings(record=True):
+            ser = Series(values, [datetime.date.today(), "a"])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+        with catch_warnings(record=True):
+
+            ser = Series(values, [0, "y"])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+            ser = Series(values, [datetime.datetime.today(), 0])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+            ser = Series(values, ["y", 0])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+            ser = Series(values, [datetime.date.today(), "a"])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+            ser = Series(values, [1.23, "b"])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+            ser = Series(values, [1, 1.53])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+            ser = Series(values, [1, 5])
+            self._check_roundtrip(ser, func, path=setup_path)
+
+            ser = Series(
+                values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)]
+            )
+            self._check_roundtrip(ser, func, path=setup_path)
+
+    def test_timeseries_preepoch(self, setup_path):
+
+        dr = bdate_range("1/1/1940", "1/1/1960")
+        ts = Series(np.random.randn(len(dr)), index=dr)
+        try:
+            self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path)
+        except OverflowError:
+            pytest.skip("known failure on some windows platforms")
+
+    @xfail_non_writeable
+    @pytest.mark.parametrize(
+        "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)]
+    )
+    def test_frame(self, compression, setup_path):
+
+        df = tm.makeDataFrame()
+
+        # put in some random NAs
+        df.values[0, 0] = np.nan
+        df.values[5, 3] = np.nan
+
+        self._check_roundtrip_table(
+            df, tm.assert_frame_equal, path=setup_path, compression=compression
+        )
+        self._check_roundtrip(
+            df, tm.assert_frame_equal,
path=setup_path, compression=compression + ) + + tdf = tm.makeTimeDataFrame() + self._check_roundtrip( + tdf, tm.assert_frame_equal, path=setup_path, compression=compression + ) + + with ensure_clean_store(setup_path) as store: + # not consolidated + df["foo"] = np.random.randn(len(df)) + store["df"] = df + recons = store["df"] + assert recons._data.is_consolidated() + + # empty + self._check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) + + @xfail_non_writeable + def test_empty_series_frame(self, setup_path): + s0 = Series() + s1 = Series(name="myseries") + df0 = DataFrame() + df1 = DataFrame(index=["a", "b", "c"]) + df2 = DataFrame(columns=["d", "e", "f"]) + + self._check_roundtrip(s0, tm.assert_series_equal, path=setup_path) + self._check_roundtrip(s1, tm.assert_series_equal, path=setup_path) + self._check_roundtrip(df0, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) + + @xfail_non_writeable + @pytest.mark.parametrize( + "dtype", [np.int64, np.float64, np.object, "m8[ns]", "M8[ns]"] + ) + def test_empty_series(self, dtype, setup_path): + s = Series(dtype=dtype) + self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) + + def test_can_serialize_dates(self, setup_path): + + rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")] + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + + def test_store_hierarchical(self, setup_path): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + frame = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) + + self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) + + # check that the names are stored + with ensure_clean_store(setup_path) as store: + store["frame"] = frame + recons = store["frame"] + tm.assert_frame_equal(recons, frame) + + def test_store_index_name(self, setup_path): + df = tm.makeDataFrame() + df.index.name = "foo" + + with ensure_clean_store(setup_path) as store: + store["frame"] = df + recons = store["frame"] + tm.assert_frame_equal(recons, df) + + def test_store_index_name_with_tz(self, setup_path): + # GH 13884 + df = pd.DataFrame({"A": [1, 2]}) + df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788]) + df.index = df.index.tz_localize("UTC") + df.index.name = "foo" + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") + recons = store["frame"] + tm.assert_frame_equal(recons, df) + + @pytest.mark.parametrize("table_format", ["table", "fixed"]) + def test_store_index_name_numpy_str(self, table_format, setup_path): + # GH #13492 + idx = pd.Index( + pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]), + name="cols\u05d2", + ) + idx1 = pd.Index( + pd.to_datetime([datetime.date(2010, 1, 1), datetime.date(2010, 1, 2)]), + name="rows\u05d0", + ) + df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) + + # This used to fail, returning numpy strings instead of python strings. 
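+        # (both the index name and the column name should round-trip as
+        # python str, not numpy str_)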
+ with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format=table_format) + df2 = read_hdf(path, "df") + + assert_frame_equal(df, df2, check_names=True) + + assert type(df2.index.name) == str + assert type(df2.columns.name) == str + + def test_store_series_name(self, setup_path): + df = tm.makeDataFrame() + series = df["A"] + + with ensure_clean_store(setup_path) as store: + store["series"] = series + recons = store["series"] + tm.assert_series_equal(recons, series) + + @xfail_non_writeable + @pytest.mark.parametrize( + "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] + ) + def test_store_mixed(self, compression, setup_path): + def _make_one(): + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["int1"] = 1 + df["int2"] = 2 + return df._consolidate() + + df1 = _make_one() + df2 = _make_one() + + self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) + + with ensure_clean_store(setup_path) as store: + store["obj"] = df1 + tm.assert_frame_equal(store["obj"], df1) + store["obj"] = df2 + tm.assert_frame_equal(store["obj"], df2) + + # check that can store Series of all of these types + self._check_roundtrip( + df1["obj1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, + ) + self._check_roundtrip( + df1["bool1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, + ) + self._check_roundtrip( + df1["int1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, + ) + + @pytest.mark.filterwarnings( + "ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning" + ) + def test_select_with_dups(self, setup_path): + + # single dtypes + df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) + df.index = date_range("20130101 9:30", periods=10, freq="T") + + with ensure_clean_store(setup_path) as store: + store.append("df", df) + + result = store.select("df") + expected = df + assert_frame_equal(result, expected, by_blocks=True) + + result = store.select("df", columns=df.columns) + expected = df + assert_frame_equal(result, expected, by_blocks=True) + + result = store.select("df", columns=["A"]) + expected = df.loc[:, ["A"]] + assert_frame_equal(result, expected) + + # dups across dtypes + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) + df.index = date_range("20130101 9:30", periods=10, freq="T") + + with ensure_clean_store(setup_path) as store: + store.append("df", df) + + result = store.select("df") + expected = df + assert_frame_equal(result, expected, by_blocks=True) + + result = store.select("df", columns=df.columns) + expected = df + assert_frame_equal(result, expected, by_blocks=True) + + expected = df.loc[:, ["A"]] + result = store.select("df", columns=["A"]) + assert_frame_equal(result, expected, by_blocks=True) + + expected = df.loc[:, ["B", "A"]] + result = store.select("df", columns=["B", "A"]) + assert_frame_equal(result, expected, by_blocks=True) + + # duplicates on both index and columns + with ensure_clean_store(setup_path) as store: + store.append("df", df) + store.append("df", df) + + expected = df.loc[:, ["B", "A"]] + expected = concat([expected, expected]) + result = store.select("df", columns=["B", "A"]) + assert_frame_equal(result, expected, by_blocks=True) + + def 
test_overwrite_node(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeDataFrame() + ts = tm.makeTimeSeries() + store["a"] = ts + + tm.assert_series_equal(store["a"], ts) + + def test_select(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + with catch_warnings(record=True): + + # select with columns= + df = tm.makeTimeDataFrame() + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # equivalently + result = store.select("df", [("columns=['A', 'B']")]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # with a data column + _maybe_remove(store, "df") + store.append("df", df, data_columns=["A"]) + result = store.select("df", ["A > 0"], columns=["A", "B"]) + expected = df[df.A > 0].reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # all a data columns + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + result = store.select("df", ["A > 0"], columns=["A", "B"]) + expected = df[df.A > 0].reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # with a data column, but different columns + _maybe_remove(store, "df") + store.append("df", df, data_columns=["A"]) + result = store.select("df", ["A > 0"], columns=["C", "D"]) + expected = df[df.A > 0].reindex(columns=["C", "D"]) + tm.assert_frame_equal(expected, result) + + def test_select_dtypes(self, setup_path): + + with ensure_clean_store(setup_path) as store: + # with a Timestamp data column (GH #2637) + df = DataFrame( + dict(ts=bdate_range("2012-01-01", periods=300), A=np.random.randn(300)) + ) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["ts", "A"]) + + result = store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] + tm.assert_frame_equal(expected, result) + + # bool columns (GH #2849) + df = DataFrame(np.random.randn(5, 2), columns=["A", "B"]) + df["object"] = "foo" + df.loc[4:5, "object"] = "bar" + df["boolv"] = df["A"] > 0 + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + + expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa + for v in [True, "true", 1]: + result = store.select( + "df", "boolv == {v!s}".format(v=v), columns=["A", "boolv"] + ) + tm.assert_frame_equal(expected, result) + + expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa + for v in [False, "false", 0]: + result = store.select( + "df", "boolv == {v!s}".format(v=v), columns=["A", "boolv"] + ) + tm.assert_frame_equal(expected, result) + + # integer index + df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) + _maybe_remove(store, "df_int") + store.append("df_int", df) + result = store.select("df_int", "index<10 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) + tm.assert_frame_equal(expected, result) + + # float index + df = DataFrame( + dict( + A=np.random.rand(20), + B=np.random.rand(20), + index=np.arange(20, dtype="f8"), + ) + ) + _maybe_remove(store, "df_float") + store.append("df_float", df) + result = store.select("df_float", "index<10.0 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) + tm.assert_frame_equal(expected, result) + + with ensure_clean_store(setup_path) as store: + + # floats w/o NaN + df = DataFrame(dict(cols=range(11), 
values=range(11)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + + store.append("df1", df, data_columns=True) + result = store.select("df1", where="values>2.0") + expected = df[df["values"] > 2.0] + tm.assert_frame_equal(expected, result) + + # floats with NaN + df.iloc[0] = np.nan + expected = df[df["values"] > 2.0] + + store.append("df2", df, data_columns=True, index=False) + result = store.select("df2", where="values>2.0") + tm.assert_frame_equal(expected, result) + + # https://github.com/PyTables/PyTables/issues/282 + # bug in selection when 0th row has a np.nan and an index + # store.append('df3',df,data_columns=True) + # result = store.select( + # 'df3', where='values>2.0') + # tm.assert_frame_equal(expected, result) + + # not in first position float with NaN ok too + df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + + df.iloc[1] = np.nan + expected = df[df["values"] > 2.0] + + store.append("df4", df, data_columns=True) + result = store.select("df4", where="values>2.0") + tm.assert_frame_equal(expected, result) + + # test selection with comparison against numpy scalar + # GH 11283 + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + + expected = df[df["A"] > 0] + + store.append("df", df, data_columns=True) + np_zero = np.float64(0) # noqa + result = store.select("df", where=["A>np_zero"]) + tm.assert_frame_equal(expected, result) + + def test_select_with_many_inputs(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + dict( + ts=bdate_range("2012-01-01", periods=300), + A=np.random.randn(300), + B=range(300), + users=["a"] * 50 + + ["b"] * 50 + + ["c"] * 100 + + ["a{i:03d}".format(i=i) for i in range(100)], + ) + ) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["ts", "A", "B", "users"]) + + # regular select + result = store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] + tm.assert_frame_equal(expected, result) + + # small selector + result = store.select( + "df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']" + ) + expected = df[ + (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"]) + ] + tm.assert_frame_equal(expected, result) + + # big selector along the columns + selector = ["a", "b", "c"] + ["a{i:03d}".format(i=i) for i in range(60)] + result = store.select( + "df", "ts>=Timestamp('2012-02-01') and users=selector" + ) + expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] + tm.assert_frame_equal(expected, result) + + selector = range(100, 200) + result = store.select("df", "B=selector") + expected = df[df.B.isin(selector)] + tm.assert_frame_equal(expected, result) + assert len(result) == 100 + + # big selector along the index + selector = Index(df.ts[0:100].values) + result = store.select("df", "ts=selector") + expected = df[df.ts.isin(selector.values)] + tm.assert_frame_equal(expected, result) + assert len(result) == 100 + + def test_select_iterator(self, setup_path): + + # single table + with ensure_clean_store(setup_path) as store: + + df = tm.makeTimeDataFrame(500) + _maybe_remove(store, "df") + store.append("df", df) + + expected = store.select("df") + + results = [s for s in store.select("df", iterator=True)] + result = concat(results) + tm.assert_frame_equal(expected, result) + + results = [s for s in store.select("df", chunksize=100)] + assert len(results) == 5 + result = concat(results) + 
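+            # concatenating the 100-row chunks should rebuild the full frame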
tm.assert_frame_equal(expected, result) + + results = [s for s in store.select("df", chunksize=150)] + result = concat(results) + tm.assert_frame_equal(result, expected) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeTimeDataFrame(500) + df.to_hdf(path, "df_non_table") + + with pytest.raises(TypeError): + read_hdf(path, "df_non_table", chunksize=100) + + with pytest.raises(TypeError): + read_hdf(path, "df_non_table", iterator=True) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeTimeDataFrame(500) + df.to_hdf(path, "df", format="table") + + results = [s for s in read_hdf(path, "df", chunksize=100)] + result = concat(results) + + assert len(results) == 5 + tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, read_hdf(path, "df")) + + # multiple + + with ensure_clean_store(setup_path) as store: + + df1 = tm.makeTimeDataFrame(500) + store.append("df1", df1, data_columns=True) + df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format) + df2["foo"] = "bar" + store.append("df2", df2) + + df = concat([df1, df2], axis=1) + + # full selection + expected = store.select_as_multiple(["df1", "df2"], selector="df1") + results = [ + s + for s in store.select_as_multiple( + ["df1", "df2"], selector="df1", chunksize=150 + ) + ] + result = concat(results) + tm.assert_frame_equal(expected, result) + + def test_select_iterator_complete_8014(self, setup_path): + + # GH 8014 + # using iterator and where clause + chunksize = 1e4 + + # no iterator + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[-1] + + # select w/o iteration and no where clause works + result = store.select("df") + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, single term, begin + # of range, works + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, single term, end + # of range, works + where = "index <= '{end_dt}'".format(end_dt=end_dt) + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, inclusive range, + # works + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # with iterator, full range + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[-1] + + # select w/iterator and no where clause works + results = [s for s in store.select("df", chunksize=chunksize)] + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, single term, begin of range + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, single term, end of range + where = "index <= '{end_dt}'".format(end_dt=end_dt) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, 
inclusive range + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + result = concat(results) + tm.assert_frame_equal(expected, result) + + def test_select_iterator_non_complete_8014(self, setup_path): + + # GH 8014 + # using iterator and where clause + chunksize = 1e4 + + # with iterator, non complete range + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[1] + end_dt = expected.index[-2] + + # select w/iterator and where clause, single term, begin of range + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + result = concat(results) + rexpected = expected[expected.index >= beg_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, single term, end of range + where = "index <= '{end_dt}'".format(end_dt=end_dt) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + result = concat(results) + rexpected = expected[expected.index <= end_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, inclusive range + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + result = concat(results) + rexpected = expected[ + (expected.index >= beg_dt) & (expected.index <= end_dt) + ] + tm.assert_frame_equal(rexpected, result) + + # with iterator, empty where + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + end_dt = expected.index[-1] + + # select w/iterator and where clause, single term, begin of range + where = "index > '{end_dt}'".format(end_dt=end_dt) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + assert 0 == len(results) + + def test_select_iterator_many_empty_frames(self, setup_path): + + # GH 8014 + # using iterator and where clause can return many empty + # frames. 
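+        # (chunks that fall entirely outside the where-clause range should
+        # not be yielded back as empty frames)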
+ chunksize = int(1e4) + + # with iterator, range limited to the first chunk + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100000, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[chunksize - 1] + + # select w/iterator and where clause, single term, begin of range + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + result = concat(results) + rexpected = expected[expected.index >= beg_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, single term, end of range + where = "index <= '{end_dt}'".format(end_dt=end_dt) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + + assert len(results) == 1 + result = concat(results) + rexpected = expected[expected.index <= end_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, inclusive range + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + + # should be 1, is 10 + assert len(results) == 1 + result = concat(results) + rexpected = expected[ + (expected.index >= beg_dt) & (expected.index <= end_dt) + ] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause which selects + # *nothing*. + # + # To be consistent with Python idiom I suggest this should + # return [] e.g. `for e in []: print True` never prints + # True. + + where = "index <= '{beg_dt}' & index >= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) + results = [s for s in store.select("df", where=where, chunksize=chunksize)] + + # should be [] + assert len(results) == 0 + + @pytest.mark.filterwarnings( + "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" + ) + def test_retain_index_attributes(self, setup_path): + + # GH 3499, losing frequency info on index recreation + df = DataFrame( + dict(A=Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))) + ) + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "data") + store.put("data", df, format="table") + + result = store.get("data") + tm.assert_frame_equal(df, result) + + for attr in ["freq", "tz", "name"]: + for idx in ["index", "columns"]: + assert getattr(getattr(df, idx), attr, None) == getattr( + getattr(result, idx), attr, None + ) + + # try to append a table with a different frequency + with catch_warnings(record=True): + df2 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + store.append("data", df2) + + assert store.get_storer("data").info["index"]["freq"] is None + + # this is ok + _maybe_remove(store, "df2") + df2 = DataFrame( + dict( + A=Series( + range(3), + index=[ + Timestamp("20010101"), + Timestamp("20010102"), + Timestamp("20020101"), + ], + ) + ) + ) + store.append("df2", df2) + df3 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + store.append("df2", df3) + + @pytest.mark.filterwarnings( + "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" + ) + def test_retain_index_attributes2(self, setup_path): + with ensure_clean_path(setup_path) as path: + + with catch_warnings(record=True): + + df = DataFrame( + dict( + A=Series( + range(3), index=date_range("2000-1-1", periods=3, freq="H") + ) + ) + ) + df.to_hdf(path, 
"data", mode="w", append=True) + df2 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + df2.to_hdf(path, "data", append=True) + + idx = date_range("2000-1-1", periods=3, freq="H") + idx.name = "foo" + df = DataFrame(dict(A=Series(range(3), index=idx))) + df.to_hdf(path, "data", mode="w", append=True) + + assert read_hdf(path, "data").index.name == "foo" + + with catch_warnings(record=True): + + idx2 = date_range("2001-1-1", periods=3, freq="H") + idx2.name = "bar" + df2 = DataFrame(dict(A=Series(range(3), index=idx2))) + df2.to_hdf(path, "data", append=True) + + assert read_hdf(path, "data").index.name is None + + def test_frame_select(self, setup_path): + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") + date = df.index[len(df) // 2] + + crit1 = Term("index>=date") + assert crit1.env.scope["date"] == date + + crit2 = "columns=['A', 'D']" + crit3 = "columns=A" + + result = store.select("frame", [crit1, crit2]) + expected = df.loc[date:, ["A", "D"]] + tm.assert_frame_equal(result, expected) + + result = store.select("frame", [crit3]) + expected = df.loc[:, ["A"]] + tm.assert_frame_equal(result, expected) + + # invalid terms + df = tm.makeTimeDataFrame() + store.append("df_time", df) + with pytest.raises(ValueError): + store.select("df_time", "index>0") + + # can't select if not written as table + # store['frame'] = df + # with pytest.raises(ValueError): + # store.select('frame', [crit1, crit2]) + + def test_frame_select_complex(self, setup_path): + # select via complex criteria + + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df.loc[df.index[0:4], "string"] = "bar" + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table", data_columns=["string"]) + + # empty + result = store.select("df", 'index>df.index[3] & string="bar"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")] + tm.assert_frame_equal(result, expected) + + result = store.select("df", 'index>df.index[3] & string="foo"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")] + tm.assert_frame_equal(result, expected) + + # or + result = store.select("df", 'index>df.index[3] | string="bar"') + expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")] + tm.assert_frame_equal(result, expected) + + result = store.select( + "df", "(index>df.index[3] & " 'index<=df.index[6]) | string="bar"' + ) + expected = df.loc[ + ((df.index > df.index[3]) & (df.index <= df.index[6])) + | (df.string == "bar") + ] + tm.assert_frame_equal(result, expected) + + # invert + result = store.select("df", 'string!="bar"') + expected = df.loc[df.string != "bar"] + tm.assert_frame_equal(result, expected) + + # invert not implemented in numexpr :( + with pytest.raises(NotImplementedError): + store.select("df", '~(string="bar")') + + # invert ok for filters + result = store.select("df", "~(columns=['A','B'])") + expected = df.loc[:, df.columns.difference(["A", "B"])] + tm.assert_frame_equal(result, expected) + + # in + result = store.select("df", "index>df.index[3] & columns in ['A','B']") + expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + def test_frame_select_complex2(self, setup_path): + + with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths: + + pp, hh = paths + + # use non-trivial selection criteria + parms = DataFrame({"A": [1, 1, 2, 2, 3]}) + parms.to_hdf(pp, "df", mode="w", 
format="table", data_columns=["A"]) + + selection = read_hdf(pp, "df", where="A=[2,3]") + hist = DataFrame( + np.random.randn(25, 1), + columns=["data"], + index=MultiIndex.from_tuples( + [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"] + ), + ) + + hist.to_hdf(hh, "df", mode="w", format="table") + + expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") + + # scope with list like + l = selection.index.tolist() # noqa + store = HDFStore(hh) + result = store.select("df", where="l1=l") + assert_frame_equal(result, expected) + store.close() + + result = read_hdf(hh, "df", where="l1=l") + assert_frame_equal(result, expected) + + # index + index = selection.index # noqa + result = read_hdf(hh, "df", where="l1=index") + assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=selection.index") + assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=selection.index.tolist()") + assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=list(selection.index)") + assert_frame_equal(result, expected) + + # scope with index + store = HDFStore(hh) + + result = store.select("df", where="l1=index") + assert_frame_equal(result, expected) + + result = store.select("df", where="l1=selection.index") + assert_frame_equal(result, expected) + + result = store.select("df", where="l1=selection.index.tolist()") + assert_frame_equal(result, expected) + + result = store.select("df", where="l1=list(selection.index)") + assert_frame_equal(result, expected) + + store.close() + + def test_invalid_filtering(self, setup_path): + + # can't use more than one filter (atm) + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table") + + # not implemented + with pytest.raises(NotImplementedError): + store.select("df", "columns=['A'] | columns=['B']") + + # in theory we could deal with this + with pytest.raises(NotImplementedError): + store.select("df", "columns=['A','B'] & columns=['C']") + + def test_string_select(self, setup_path): + # GH 2973 + with ensure_clean_store(setup_path) as store: + + df = tm.makeTimeDataFrame() + + # test string ==/!= + df["x"] = "none" + df.loc[2:7, "x"] = "" + + store.append("df", df, data_columns=["x"]) + + result = store.select("df", "x=none") + expected = df[df.x == "none"] + assert_frame_equal(result, expected) + + result = store.select("df", "x!=none") + expected = df[df.x != "none"] + assert_frame_equal(result, expected) + + df2 = df.copy() + df2.loc[df2.x == "", "x"] = np.nan + + store.append("df2", df2, data_columns=["x"]) + result = store.select("df2", "x!=none") + expected = df2[isna(df2.x)] + assert_frame_equal(result, expected) + + # int ==/!= + df["int"] = 1 + df.loc[2:7, "int"] = 2 + + store.append("df3", df, data_columns=["int"]) + + result = store.select("df3", "int=2") + expected = df[df.int == 2] + assert_frame_equal(result, expected) + + result = store.select("df3", "int!=2") + expected = df[df.int != 2] + assert_frame_equal(result, expected) + + def test_read_column(self, setup_path): + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + # GH 17912 + # HDFStore.select_column should raise a KeyError + # exception if the key is not a valid store + with pytest.raises(KeyError, match="No object named df in the file"): + store.select_column("df", "index") + + store.append("df", df) + # error + with pytest.raises( + KeyError, match=re.escape("'column [foo] not found in the table'") + 
): + store.select_column("df", "foo") + + with pytest.raises(Exception): + store.select_column("df", "index", where=["index>5"]) + + # valid + result = store.select_column("df", "index") + tm.assert_almost_equal(result.values, Series(df.index).values) + assert isinstance(result, Series) + + # not a data indexable column + with pytest.raises(ValueError): + store.select_column("df", "values_block_0") + + # a data column + df2 = df.copy() + df2["string"] = "foo" + store.append("df2", df2, data_columns=["string"]) + result = store.select_column("df2", "string") + tm.assert_almost_equal(result.values, df2["string"].values) + + # a data column with NaNs, result excludes the NaNs + df3 = df.copy() + df3["string"] = "foo" + df3.loc[4:6, "string"] = np.nan + store.append("df3", df3, data_columns=["string"]) + result = store.select_column("df3", "string") + tm.assert_almost_equal(result.values, df3["string"].values) + + # start/stop + result = store.select_column("df3", "string", start=2) + tm.assert_almost_equal(result.values, df3["string"].values[2:]) + + result = store.select_column("df3", "string", start=-2) + tm.assert_almost_equal(result.values, df3["string"].values[-2:]) + + result = store.select_column("df3", "string", stop=2) + tm.assert_almost_equal(result.values, df3["string"].values[:2]) + + result = store.select_column("df3", "string", stop=-2) + tm.assert_almost_equal(result.values, df3["string"].values[:-2]) + + result = store.select_column("df3", "string", start=2, stop=-2) + tm.assert_almost_equal(result.values, df3["string"].values[2:-2]) + + result = store.select_column("df3", "string", start=-2, stop=2) + tm.assert_almost_equal(result.values, df3["string"].values[-2:2]) + + # GH 10392 - make sure column name is preserved + df4 = DataFrame({"A": np.random.randn(10), "B": "foo"}) + store.append("df4", df4, data_columns=True) + expected = df4["B"] + result = store.select_column("df4", "B") + tm.assert_series_equal(result, expected) + + def test_coordinates(self, setup_path): + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df") + store.append("df", df) + + # all + c = store.select_as_coordinates("df") + assert (c.values == np.arange(len(df.index))).all() + + # get coordinates back & test vs frame + _maybe_remove(store, "df") + + df = DataFrame(dict(A=range(5), B=range(5))) + store.append("df", df) + c = store.select_as_coordinates("df", ["index<3"]) + assert (c.values == np.arange(3)).all() + result = store.select("df", where=c) + expected = df.loc[0:2, :] + tm.assert_frame_equal(result, expected) + + c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) + assert (c.values == np.arange(2) + 3).all() + result = store.select("df", where=c) + expected = df.loc[3:4, :] + tm.assert_frame_equal(result, expected) + assert isinstance(c, Index) + + # multiple tables + _maybe_remove(store, "df1") + _maybe_remove(store, "df2") + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) + + c = store.select_as_coordinates("df1", ["A>0", "B>0"]) + df1_result = store.select("df1", c) + df2_result = store.select("df2", c) + result = concat([df1_result, df2_result], axis=1) + + expected = concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + # pass array/mask as the coordinates + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + 
np.random.randn(1000, 2), index=date_range("20000101", periods=1000) + ) + store.append("df", df) + c = store.select_column("df", "index") + where = c[DatetimeIndex(c).month == 5].index + expected = df.iloc[where] + + # locations + result = store.select("df", where=where) + tm.assert_frame_equal(result, expected) + + # boolean + result = store.select("df", where=where) + tm.assert_frame_equal(result, expected) + + # invalid + with pytest.raises(ValueError): + store.select("df", where=np.arange(len(df), dtype="float64")) + + with pytest.raises(ValueError): + store.select("df", where=np.arange(len(df) + 1)) + + with pytest.raises(ValueError): + store.select("df", where=np.arange(len(df)), start=5) + + with pytest.raises(ValueError): + store.select("df", where=np.arange(len(df)), start=5, stop=10) + + # selection with filter + selection = date_range("20000101", periods=500) + result = store.select("df", where="index in selection") + expected = df[df.index.isin(selection)] + tm.assert_frame_equal(result, expected) + + # list + df = DataFrame(np.random.randn(10, 2)) + store.append("df2", df) + result = store.select("df2", where=[0, 3, 5]) + expected = df.iloc[[0, 3, 5]] + tm.assert_frame_equal(result, expected) + + # boolean + where = [True] * 10 + where[-2] = False + result = store.select("df2", where=where) + expected = df.loc[where] + tm.assert_frame_equal(result, expected) + + # start/stop + result = store.select("df2", start=5, stop=10) + expected = df[5:10] + tm.assert_frame_equal(result, expected) + + def test_append_to_multiple(self, setup_path): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df2["foo"] = "bar" + df = concat([df1, df2], axis=1) + + with ensure_clean_store(setup_path) as store: + + # exceptions + with pytest.raises(ValueError): + store.append_to_multiple( + {"df1": ["A", "B"], "df2": None}, df, selector="df3" + ) + + with pytest.raises(ValueError): + store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3") + + with pytest.raises(ValueError): + store.append_to_multiple("df1", df, "df1") + + # regular operation + store.append_to_multiple( + {"df1": ["A", "B"], "df2": None}, df, selector="df1" + ) + result = store.select_as_multiple( + ["df1", "df2"], where=["A>0", "B>0"], selector="df1" + ) + expected = df[(df.A > 0) & (df.B > 0)] + tm.assert_frame_equal(result, expected) + + def test_append_to_multiple_dropna(self, setup_path): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan + df = concat([df1, df2], axis=1) + + with ensure_clean_store(setup_path) as store: + + # dropna=True should guarantee rows are synchronized + store.append_to_multiple( + {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True + ) + result = store.select_as_multiple(["df1", "df2"]) + expected = df.dropna() + tm.assert_frame_equal(result, expected) + tm.assert_index_equal(store.select("df1").index, store.select("df2").index) + + @pytest.mark.xfail( + run=False, reason="append_to_multiple_dropna_false is not raising as failed" + ) + def test_append_to_multiple_dropna_false(self, setup_path): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan + df = concat([df1, df2], axis=1) + + with ensure_clean_store(setup_path) as store: + + # dropna=False shouldn't synchronize row indexes + store.append_to_multiple( + {"df1a": ["A", "B"], 
"df2a": None}, df, selector="df1a", dropna=False + ) + + with pytest.raises(ValueError): + store.select_as_multiple(["df1a", "df2a"]) + + assert not store.select("df1a").index.equals(store.select("df2a").index) + + def test_select_as_multiple(self, setup_path): + + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df2["foo"] = "bar" + + with ensure_clean_store(setup_path) as store: + + # no tables stored + with pytest.raises(Exception): + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") + + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) + + # exceptions + with pytest.raises(Exception): + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") + + with pytest.raises(Exception): + store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") + + msg = "'No object named df3 in the file'" + with pytest.raises(KeyError, match=msg): + store.select_as_multiple( + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) + + with pytest.raises(KeyError, match=msg): + store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") + + with pytest.raises(KeyError, match="'No object named df4 in the file'"): + store.select_as_multiple( + ["df1", "df2"], where=["A>0", "B>0"], selector="df4" + ) + + # default select + result = store.select("df1", ["A>0", "B>0"]) + expected = store.select_as_multiple( + ["df1"], where=["A>0", "B>0"], selector="df1" + ) + tm.assert_frame_equal(result, expected) + expected = store.select_as_multiple( + "df1", where=["A>0", "B>0"], selector="df1" + ) + tm.assert_frame_equal(result, expected) + + # multiple + result = store.select_as_multiple( + ["df1", "df2"], where=["A>0", "B>0"], selector="df1" + ) + expected = concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + # multiple (diff selector) + result = store.select_as_multiple( + ["df1", "df2"], where="index>df2.index[4]", selector="df2" + ) + expected = concat([df1, df2], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) + + # test exception for diff rows + store.append("df3", tm.makeTimeDataFrame(nper=50)) + with pytest.raises(ValueError): + store.select_as_multiple( + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) + + @pytest.mark.skipif( + LooseVersion(tables.__version__) < LooseVersion("3.1.0"), + reason=("tables version does not support fix for nan selection bug: GH 4858"), + ) + def test_nan_selection_bug_4858(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame(dict(cols=range(6), values=range(6)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + df.iloc[0] = np.nan + + expected = DataFrame( + dict(cols=["13.0", "14.0", "15.0"], values=[3.0, 4.0, 5.0]), + index=[3, 4, 5], + ) + + # write w/o the index on that particular column + store.append("df", df, data_columns=True, index=["cols"]) + result = store.select("df", where="values>2.0") + assert_frame_equal(result, expected) + + def test_start_stop_table(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + # table + df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) + store.append("df", df) + + result = store.select("df", "columns=['A']", start=0, stop=5) + expected = df.loc[0:4, ["A"]] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select("df", "columns=['A']", start=30, stop=40) + assert len(result) == 0 + expected = 
df.loc[30:40, ["A"]] + tm.assert_frame_equal(result, expected) + + def test_start_stop_multiple(self, setup_path): + + # GH 16209 + with ensure_clean_store(setup_path) as store: + + df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) + + store.append_to_multiple( + {"selector": ["foo"], "data": None}, df, selector="selector" + ) + result = store.select_as_multiple( + ["selector", "data"], selector="selector", start=0, stop=1 + ) + expected = df.loc[[0], ["foo", "bar"]] + tm.assert_frame_equal(result, expected) + + def test_start_stop_fixed(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + # fixed, GH 8287 + df = DataFrame( + dict(A=np.random.rand(20), B=np.random.rand(20)), + index=pd.date_range("20130101", periods=20), + ) + store.put("df", df) + + result = store.select("df", start=0, stop=5) + expected = df.iloc[0:5, :] + tm.assert_frame_equal(result, expected) + + result = store.select("df", start=5, stop=10) + expected = df.iloc[5:10, :] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select("df", start=30, stop=40) + expected = df.iloc[30:40, :] + tm.assert_frame_equal(result, expected) + + # series + s = df.A + store.put("s", s) + result = store.select("s", start=0, stop=5) + expected = s.iloc[0:5] + tm.assert_series_equal(result, expected) + + result = store.select("s", start=5, stop=10) + expected = s.iloc[5:10] + tm.assert_series_equal(result, expected) + + # sparse; not implemented + df = tm.makeDataFrame() + df.iloc[3:5, 1:3] = np.nan + df.iloc[8:10, -2] = np.nan + + def test_select_filter_corner(self, setup_path): + + df = DataFrame(np.random.randn(50, 100)) + df.index = ["{c:3d}".format(c=c) for c in df.index] + df.columns = ["{c:3d}".format(c=c) for c in df.columns] + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") + + crit = "columns=df.columns[:75]" + result = store.select("frame", [crit]) + tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) + + crit = "columns=df.columns[:75:2]" + result = store.select("frame", [crit]) + tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) + + def test_path_pathlib(self, setup_path): + df = tm.makeDataFrame() + + result = tm.round_trip_pathlib( + lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + ) + tm.assert_frame_equal(df, result) + + @pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)]) + def test_contiguous_mixed_data_table(self, start, stop, setup_path): + # GH 17021 + # ValueError when reading a contiguous mixed-data table ft. 
VLArray + df = DataFrame( + { + "a": Series([20111010, 20111011, 20111012]), + "b": Series(["ab", "cd", "ab"]), + } + ) + + with ensure_clean_store(setup_path) as store: + store.append("test_dataset", df) + + result = store.select("test_dataset", start=start, stop=stop) + assert_frame_equal(df[start:stop], result) + + def test_path_pathlib_hdfstore(self, setup_path): + df = tm.makeDataFrame() + + def writer(path): + with pd.HDFStore(path) as store: + df.to_hdf(store, "df") + + def reader(path): + with pd.HDFStore(path) as store: + return pd.read_hdf(store, "df") + + result = tm.round_trip_pathlib(writer, reader) + tm.assert_frame_equal(df, result) + + def test_pickle_path_localpath(self, setup_path): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib( + lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + ) + tm.assert_frame_equal(df, result) + + def test_path_localpath_hdfstore(self, setup_path): + df = tm.makeDataFrame() + + def writer(path): + with pd.HDFStore(path) as store: + df.to_hdf(store, "df") + + def reader(path): + with pd.HDFStore(path) as store: + return pd.read_hdf(store, "df") + + result = tm.round_trip_localpath(writer, reader) + tm.assert_frame_equal(df, result) + + def _check_roundtrip(self, obj, comparator, path, compression=False, **kwargs): + + options = {} + if compression: + options["complib"] = _default_compressor + + with ensure_clean_store(path, "w", **options) as store: + store["obj"] = obj + retrieved = store["obj"] + comparator(retrieved, obj, **kwargs) + + def _check_double_roundtrip( + self, obj, comparator, path, compression=False, **kwargs + ): + options = {} + if compression: + options["complib"] = compression or _default_compressor + + with ensure_clean_store(path, "w", **options) as store: + store["obj"] = obj + retrieved = store["obj"] + comparator(retrieved, obj, **kwargs) + store["obj"] = retrieved + again = store["obj"] + comparator(again, obj, **kwargs) + + def _check_roundtrip_table(self, obj, comparator, path, compression=False): + options = {} + if compression: + options["complib"] = _default_compressor + + with ensure_clean_store(path, "w", **options) as store: + store.put("obj", obj, format="table") + retrieved = store["obj"] + + comparator(retrieved, obj) + + def test_multiple_open_close(self, setup_path): + # gh-4409: open & close multiple times + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path, "df", mode="w", format="table") + + # single + store = HDFStore(path) + assert "CLOSED" not in store.info() + assert store.is_open + + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + with ensure_clean_path(setup_path) as path: + + if pytables._table_file_open_policy_is_strict: + + # multiples + store1 = HDFStore(path) + + with pytest.raises(ValueError): + HDFStore(path) + + store1.close() + else: + + # multiples + store1 = HDFStore(path) + store2 = HDFStore(path) + + assert "CLOSED" not in store1.info() + assert "CLOSED" not in store2.info() + assert store1.is_open + assert store2.is_open + + store1.close() + assert "CLOSED" in store1.info() + assert not store1.is_open + assert "CLOSED" not in store2.info() + assert store2.is_open + + store2.close() + assert "CLOSED" in store1.info() + assert "CLOSED" in store2.info() + assert not store1.is_open + assert not store2.is_open + + # nested close + store = HDFStore(path, mode="w") + store.append("df", df) + + store2 = HDFStore(path) + store2.append("df2", df) + store2.close() + assert "CLOSED" in store2.info() + 
assert not store2.is_open + + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + # double closing + store = HDFStore(path, mode="w") + store.append("df", df) + + store2 = HDFStore(path) + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + store2.close() + assert "CLOSED" in store2.info() + assert not store2.is_open + + # ops on a closed store + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path, "df", mode="w", format="table") + + store = HDFStore(path) + store.close() + + with pytest.raises(ClosedFileError): + store.keys() + + with pytest.raises(ClosedFileError): + "df" in store + + with pytest.raises(ClosedFileError): + len(store) + + with pytest.raises(ClosedFileError): + store["df"] + + with pytest.raises(AttributeError): + store.df + + with pytest.raises(ClosedFileError): + store.select("df") + + with pytest.raises(ClosedFileError): + store.get("df") + + with pytest.raises(ClosedFileError): + store.append("df2", df) + + with pytest.raises(ClosedFileError): + store.put("df3", df) + + with pytest.raises(ClosedFileError): + store.get_storer("df2") + + with pytest.raises(ClosedFileError): + store.remove("df2") + + with pytest.raises(ClosedFileError, match="file is not open"): + store.select("df") + + def test_pytables_native_read(self, datapath, setup_path): + with ensure_clean_store( + datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" + ) as store: + d2 = store["detector/readout"] + assert isinstance(d2, DataFrame) + + @pytest.mark.skipif( + is_platform_windows(), reason="native2 read fails oddly on windows" + ) + def test_pytables_native2_read(self, datapath, setup_path): + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r" + ) as store: + str(store) + d1 = store["detector"] + assert isinstance(d1, DataFrame) + + @xfail_non_writeable + def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path): + # GH 24510 + # legacy table with fixed format written in Python 2 + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" + ) as store: + result = store.select("df") + expected = pd.DataFrame( + [[1, 2, 3, "D"]], + columns=["A", "B", "C", "D"], + index=pd.Index(["ABC"], name="INDEX_NAME"), + ) + assert_frame_equal(expected, result) + + def test_legacy_table_read_py2(self, datapath, setup_path): + # issue: 24925 + # legacy table written in Python 2 + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r" + ) as store: + result = store.select("table") + + expected = pd.DataFrame({"a": ["a", "b"], "b": [2, 3]}) + assert_frame_equal(expected, result) + + def test_copy(self, setup_path): + + with catch_warnings(record=True): + + def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): + try: + store = HDFStore(f, "r") + + if new_f is None: + import tempfile + + fd, new_f = tempfile.mkstemp() + + tstore = store.copy( + new_f, keys=keys, propindexes=propindexes, **kwargs + ) + + # check keys + if keys is None: + keys = store.keys() + assert set(keys) == set(tstore.keys()) + + # check indices & nrows + for k in tstore.keys(): + if tstore.get_storer(k).is_table: + new_t = tstore.get_storer(k) + orig_t = store.get_storer(k) + + assert orig_t.nrows == new_t.nrows + + # check propindixes + if propindexes: + for a in orig_t.axes: + if a.is_indexed: + assert new_t[a.name].is_indexed + + finally: + safe_close(store) + safe_close(tstore) + try: 
+ os.close(fd) + except (OSError, ValueError): + pass + safe_remove(new_f) + + # new table + df = tm.makeDataFrame() + + try: + path = create_tempfile(setup_path) + st = HDFStore(path) + st.append("df", df, data_columns=["A"]) + st.close() + do_copy(f=path) + do_copy(f=path, propindexes=False) + finally: + safe_remove(path) + + def test_store_datetime_fractional_secs(self, setup_path): + + with ensure_clean_store(setup_path) as store: + dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) + series = Series([0], [dt]) + store["a"] = series + assert store["a"].index[0] == dt + + def test_tseries_indices_series(self, setup_path): + + with ensure_clean_store(setup_path) as store: + idx = tm.makeDateIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store["a"] = ser + result = store["a"] + + tm.assert_series_equal(result, ser) + assert result.index.freq == ser.index.freq + tm.assert_class_equal(result.index, ser.index, obj="series index") + + idx = tm.makePeriodIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store["a"] = ser + result = store["a"] + + tm.assert_series_equal(result, ser) + assert result.index.freq == ser.index.freq + tm.assert_class_equal(result.index, ser.index, obj="series index") + + def test_tseries_indices_frame(self, setup_path): + + with ensure_clean_store(setup_path) as store: + idx = tm.makeDateIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + store["a"] = df + result = store["a"] + + assert_frame_equal(result, df) + assert result.index.freq == df.index.freq + tm.assert_class_equal(result.index, df.index, obj="dataframe index") + + idx = tm.makePeriodIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), idx) + store["a"] = df + result = store["a"] + + assert_frame_equal(result, df) + assert result.index.freq == df.index.freq + tm.assert_class_equal(result.index, df.index, obj="dataframe index") + + def test_unicode_index(self, setup_path): + + unicode_values = ["\u03c3", "\u03c3\u03c3"] + + # PerformanceWarning + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + s = Series(np.random.randn(len(unicode_values)), unicode_values) + self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) + + def test_unicode_longer_encoded(self, setup_path): + # GH 11234 + char = "\u0394" + df = pd.DataFrame({"A": [char]}) + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table", encoding="utf-8") + result = store.get("df") + tm.assert_frame_equal(result, df) + + df = pd.DataFrame({"A": ["a", char], "B": ["b", "b"]}) + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table", encoding="utf-8") + result = store.get("df") + tm.assert_frame_equal(result, df) + + @xfail_non_writeable + def test_store_datetime_mixed(self, setup_path): + + df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) + ts = tm.makeTimeSeries() + df["d"] = ts.index[:3] + self._check_roundtrip(df, tm.assert_frame_equal, path=setup_path) + + # FIXME: don't leave commented-out code + # def test_cant_write_multiindex_table(self): + # # for now, #1848 + # df = DataFrame(np.random.randn(10, 4), + # index=[np.arange(5).repeat(2), + # np.tile(np.arange(2), 5)]) + # + # with pytest.raises(Exception): + # store.put('foo', df, format='table') + + def test_append_with_diff_col_name_types_raises_value_error(self, setup_path): + df = DataFrame(np.random.randn(10, 1)) + df2 = DataFrame({"a": np.random.randn(10)}) + df3 = DataFrame({(1, 2): np.random.randn(10)}) + df4 = 
DataFrame({("1", 2): np.random.randn(10)}) + df5 = DataFrame({("1", 2, object): np.random.randn(10)}) + + with ensure_clean_store(setup_path) as store: + name = "df_{}".format(tm.rands(10)) + store.append(name, df) + + for d in (df2, df3, df4, df5): + with pytest.raises(ValueError): + store.append(name, d) + + def test_query_with_nested_special_character(self, setup_path): + df = DataFrame( + { + "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"], + "b": [1, 2, 3, 4, 5, 6, 7, 8], + } + ) + expected = df[df.a == "test & test"] + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + result = store.select("test", 'a = "test & test"') + tm.assert_frame_equal(expected, result) + + def test_categorical(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + # Basic + _maybe_remove(store, "s") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s", s, format="table") + result = store.select("s") + tm.assert_series_equal(s, result) + + _maybe_remove(store, "s_ordered") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) + ) + store.append("s_ordered", s, format="table") + result = store.select("s_ordered") + tm.assert_series_equal(s, result) + + _maybe_remove(store, "df") + df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) + store.append("df", df, format="table") + result = store.select("df") + tm.assert_frame_equal(result, df) + + # Dtypes + _maybe_remove(store, "si") + s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category") + store.append("si", s) + result = store.select("si") + tm.assert_series_equal(result, s) + + _maybe_remove(store, "si2") + s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category") + store.append("si2", s) + result = store.select("si2") + tm.assert_series_equal(result, s) + + # Multiple + _maybe_remove(store, "df2") + df2 = df.copy() + df2["s2"] = Series(list("abcdefg")).astype("category") + store.append("df2", df2) + result = store.select("df2") + tm.assert_frame_equal(result, df2) + + # Make sure the metadata is OK + info = store.info() + assert "/df2 " in info + # assert '/df2/meta/values_block_0/meta' in info + assert "/df2/meta/values_block_1/meta" in info + + # unordered + _maybe_remove(store, "s2") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s2", s, format="table") + result = store.select("s2") + tm.assert_series_equal(result, s) + + # Query + _maybe_remove(store, "df3") + store.append("df3", df, data_columns=["s"]) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s = ["b","c"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["d"])] + result = store.select("df3", where=['s in ["d"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["f"])] + result = store.select("df3", where=['s in ["f"]']) + tm.assert_frame_equal(result, expected) + + # Appending with same categories is ok + store.append("df3", df) + + df = concat([df, df]) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) + tm.assert_frame_equal(result, expected) + + # Appending must have the same categories + df3 = df.copy() + 
df3["s"].cat.remove_unused_categories(inplace=True) + + with pytest.raises(ValueError): + store.append("df3", df3) + + # Remove, and make sure meta data is removed (its a recursive + # removal so should be). + result = store.select("df3/meta/s/meta") + assert result is not None + store.remove("df3") + + with pytest.raises( + KeyError, match="'No object named df3/meta/s/meta in the file'" + ): + store.select("df3/meta/s/meta") + + def test_categorical_conversion(self, setup_path): + + # GH13322 + # Check that read_hdf with categorical columns doesn't return rows if + # where criteria isn't met. + obsids = ["ESP_012345_6789", "ESP_987654_3210"] + imgids = ["APF00006np", "APF0001imm"] + data = [4.3, 9.8] + + # Test without categories + df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data)) + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") + tm.assert_frame_equal(result, expected) + + # Test with categories + df.obsids = df.obsids.astype("category") + df.imgids = df.imgids.astype("category") + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") + tm.assert_frame_equal(result, expected) + + def test_categorical_nan_only_columns(self, setup_path): + # GH18413 + # Check that read_hdf with categorical columns with NaN-only values can + # be read back. + df = pd.DataFrame( + { + "a": ["a", "b", "c", np.nan], + "b": [np.nan, np.nan, np.nan, np.nan], + "c": [1, 2, 3, 4], + "d": pd.Series([None] * 4, dtype=object), + } + ) + df["a"] = df.a.astype("category") + df["b"] = df.b.astype("category") + df["d"] = df.b.astype("category") + expected = df + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) + + def test_duplicate_column_name(self, setup_path): + df = DataFrame(columns=["a", "a"], data=[[0, 0]]) + + with ensure_clean_path(setup_path) as path: + with pytest.raises(ValueError): + df.to_hdf(path, "df", format="fixed") + + df.to_hdf(path, "df", format="table") + other = read_hdf(path, "df") + + tm.assert_frame_equal(df, other) + assert df.equals(other) + assert other.equals(df) + + def test_round_trip_equals(self, setup_path): + # GH 9330 + df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table") + other = read_hdf(path, "df") + tm.assert_frame_equal(df, other) + assert df.equals(other) + assert other.equals(df) + + def test_preserve_timedeltaindex_type(self, setup_path): + # GH9635 + # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve + # the type of the index. + df = DataFrame(np.random.normal(size=(10, 5))) + df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example") + + with ensure_clean_store(setup_path) as store: + + store["df"] = df + assert_frame_equal(store["df"], df) + + def test_columns_multiindex_modified(self, setup_path): + # BUG: 7212 + # read_hdf store.select modified the passed columns parameters + # when multi-indexed. 
+ + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) + + data_columns = df.index.names + df.columns.tolist() + with ensure_clean_path(setup_path) as path: + df.to_hdf( + path, + "df", + mode="a", + append=True, + data_columns=data_columns, + index=False, + ) + cols2load = list("BCD") + cols2load_original = list(cols2load) + df_loaded = read_hdf(path, "df", columns=cols2load) # noqa + assert cols2load_original == cols2load + + @ignore_natural_naming_warning + def test_to_hdf_with_object_column_names(self, setup_path): + # GH9057 + # Writing HDF5 table format should only work for string-like + # column types + + types_should_fail = [ + tm.makeIntIndex, + tm.makeFloatIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ] + types_should_run = [ + tm.makeStringIndex, + tm.makeCategoricalIndex, + tm.makeUnicodeIndex, + ] + + for index in types_should_fail: + df = DataFrame(np.random.randn(10, 2), columns=index(2)) + with ensure_clean_path(setup_path) as path: + with catch_warnings(record=True): + msg = "cannot have non-object label DataIndexableCol" + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df", format="table", data_columns=True) + + for index in types_should_run: + df = DataFrame(np.random.randn(10, 2), columns=index(2)) + with ensure_clean_path(setup_path) as path: + with catch_warnings(record=True): + df.to_hdf(path, "df", format="table", data_columns=True) + result = pd.read_hdf( + path, "df", where="index = [{0}]".format(df.index[0]) + ) + assert len(result) + + def test_read_hdf_open_store(self, setup_path): + # GH10330 + # No check for non-string path_or-buf, and no test of open store + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="w") + direct = read_hdf(path, "df") + store = HDFStore(path, mode="r") + indirect = read_hdf(store, "df") + tm.assert_frame_equal(direct, indirect) + assert store.is_open + store.close() + + def test_read_hdf_iterator(self, setup_path): + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="w", format="t") + direct = read_hdf(path, "df") + iterator = read_hdf(path, "df", iterator=True) + assert isinstance(iterator, TableIterator) + indirect = next(iterator.__iter__()) + tm.assert_frame_equal(direct, indirect) + iterator.store.close() + + def test_read_hdf_errors(self, setup_path): + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + + with ensure_clean_path(setup_path) as path: + with pytest.raises(IOError): + read_hdf(path, "key") + + df.to_hdf(path, "df") + store = HDFStore(path, mode="r") + store.close() + + with pytest.raises(IOError): + read_hdf(store, "df") + + def test_read_hdf_generic_buffer_errors(self): + with pytest.raises(NotImplementedError): + read_hdf(BytesIO(b""), "df") + + def test_invalid_complib(self, setup_path): + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + with ensure_clean_path(setup_path) as path: + with pytest.raises(ValueError): + df.to_hdf(path, "df", complib="foolib") + + # GH10443 + + def test_read_nokey(self, setup_path): + df = DataFrame(np.random.rand(4, 5), 
index=list("abcd"), columns=list("ABCDE")) + + # Categorical dtype not supported for "fixed" format. So no need + # to test with that dtype in the dataframe here. + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="a") + reread = read_hdf(path) + assert_frame_equal(df, reread) + df.to_hdf(path, "df2", mode="a") + + with pytest.raises(ValueError): + read_hdf(path) + + def test_read_nokey_table(self, setup_path): + # GH13231 + df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")}) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="a", format="table") + reread = read_hdf(path) + assert_frame_equal(df, reread) + df.to_hdf(path, "df2", mode="a", format="table") + + with pytest.raises(ValueError): + read_hdf(path) + + def test_read_nokey_empty(self, setup_path): + with ensure_clean_path(setup_path) as path: + store = HDFStore(path) + store.close() + + with pytest.raises(ValueError): + read_hdf(path) + + @td.skip_if_no("pathlib") + def test_read_from_pathlib_path(self, setup_path): + + # GH11773 + from pathlib import Path + + expected = DataFrame( + np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") + ) + with ensure_clean_path(setup_path) as filename: + path_obj = Path(filename) + + expected.to_hdf(path_obj, "df", mode="a") + actual = read_hdf(path_obj, "df") + + tm.assert_frame_equal(expected, actual) + + @td.skip_if_no("py.path") + def test_read_from_py_localpath(self, setup_path): + + # GH11773 + from py.path import local as LocalPath + + expected = DataFrame( + np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") + ) + with ensure_clean_path(setup_path) as filename: + path_obj = LocalPath(filename) + + expected.to_hdf(path_obj, "df", mode="a") + actual = read_hdf(path_obj, "df") + + tm.assert_frame_equal(expected, actual) + + def test_query_long_float_literal(self, setup_path): + # GH 14241 + df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) + + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + + cutoff = 1000000000.0006 + result = store.select("test", "A < {cutoff:.4f}".format(cutoff=cutoff)) + assert result.empty + + cutoff = 1000000000.0010 + result = store.select("test", "A > {cutoff:.4f}".format(cutoff=cutoff)) + expected = df.loc[[1, 2], :] + tm.assert_frame_equal(expected, result) + + exact = 1000000000.0011 + result = store.select("test", "A == {exact:.4f}".format(exact=exact)) + expected = df.loc[[1], :] + tm.assert_frame_equal(expected, result) + + def test_query_compare_column_type(self, setup_path): + # GH 15492 + df = pd.DataFrame( + { + "date": ["2014-01-01", "2014-01-02"], + "real_date": date_range("2014-01-01", periods=2), + "float": [1.1, 1.2], + "int": [1, 2], + }, + columns=["date", "real_date", "float", "int"], + ) + + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + + ts = pd.Timestamp("2014-01-01") # noqa + result = store.select("test", where="real_date > ts") + expected = df.loc[[1], :] + tm.assert_frame_equal(expected, result) + + for op in ["<", ">", "=="]: + # non strings to string column always fail + for v in [2.1, True, pd.Timestamp("2014-01-01"), pd.Timedelta(1, "s")]: + query = "date {op} v".format(op=op) + with pytest.raises(TypeError): + store.select("test", where=query) + + # strings to other columns must be convertible to type + v = "a" + for col in ["int", "float", "real_date"]: + query = "{col} {op} 
v".format(op=op, col=col) + with pytest.raises(ValueError): + store.select("test", where=query) + + for v, col in zip( + ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"] + ): + query = "{col} {op} v".format(op=op, col=col) + result = store.select("test", where=query) + + if op == "==": + expected = df.loc[[0], :] + elif op == ">": + expected = df.loc[[1], :] + else: + expected = df.loc[[], :] + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize("format", ["fixed", "table"]) + def test_read_hdf_series_mode_r(self, format, setup_path): + # GH 16583 + # Tests that reading a Series saved to an HDF file + # still works if a mode='r' argument is supplied + series = tm.makeFloatSeries() + with ensure_clean_path(setup_path) as path: + series.to_hdf(path, key="data", format=format) + result = pd.read_hdf(path, key="data", mode="r") + tm.assert_series_equal(result, series) + + @pytest.mark.skipif(not PY36, reason="Need python 3.6") + def test_fspath(self): + with tm.ensure_clean("foo.h5") as path: + with pd.HDFStore(path) as store: + assert os.fspath(store) == str(path) + + def test_read_py2_hdf_file_in_py3(self, datapath): + # GH 16781 + + # tests reading a PeriodIndex DataFrame written in Python2 in Python3 + + # the file was generated in Python 2.7 like so: + # + # df = pd.DataFrame([1.,2,3], index=pd.PeriodIndex( + # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) + # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') + + expected = pd.DataFrame( + [1.0, 2, 3], + index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), + ) + + with ensure_clean_store( + datapath( + "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" + ), + mode="r", + ) as store: + result = store["p"] + assert_frame_equal(result, expected) + + @pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) + def test_select_empty_where(self, where): + # GH26610 + + # Using keyword `where` as '' or (), or [None], etc + # while reading from HDF store raises + # "SyntaxError: only a single expression is allowed" + + df = pd.DataFrame([1, 2, 3]) + with ensure_clean_path("empty_where.h5") as path: + with pd.HDFStore(path) as store: + store.put("df", df, "t") + result = pd.read_hdf(store, "df", where=where) + assert_frame_equal(result, df) + + @pytest.mark.parametrize( + "idx", + [ + date_range("2019", freq="D", periods=3, tz="UTC"), + CategoricalIndex(list("abc")), + ], + ) + def test_to_hdf_multiindex_extension_dtype(self, idx, setup_path): + # GH 7775 + mi = MultiIndex.from_arrays([idx, idx]) + df = pd.DataFrame(0, index=mi, columns=["a"]) + with ensure_clean_path(setup_path) as path: + with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): + df.to_hdf(path, "df") + + From cd6a2c98e0aa0cbc6147334d8d26293b2baba888 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 10:38:10 +0100 Subject: [PATCH 11/28] Ran black formatting --- .../io/pytables/test_hdf_complex_values.py | 13 ++++++------ pandas/tests/io/pytables/test_hdf_store.py | 21 ++++++++++--------- pandas/tests/io/pytables/test_timezones.py | 16 ++++++-------- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/pandas/tests/io/pytables/test_hdf_complex_values.py b/pandas/tests/io/pytables/test_hdf_complex_values.py index 65587b8b8b993..c7553471aea6b 100644 --- a/pandas/tests/io/pytables/test_hdf_complex_values.py +++ b/pandas/tests/io/pytables/test_hdf_complex_values.py @@ -4,17 +4,16 @@ import pytest import pandas as pd -from pandas import ( - DataFrame, 
- Series, -) +from pandas import DataFrame, Series import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal from pandas.io.pytables import read_hdf -from pandas.tests.io.pytables.common import (xfail_non_writeable, - ensure_clean_path, - ensure_clean_store) +from pandas.tests.io.pytables.common import ( + xfail_non_writeable, + ensure_clean_path, + ensure_clean_store, +) class TestHDFComplexValues: diff --git a/pandas/tests/io/pytables/test_hdf_store.py b/pandas/tests/io/pytables/test_hdf_store.py index a5a8b0591d048..536d62ae393a7 100644 --- a/pandas/tests/io/pytables/test_hdf_store.py +++ b/pandas/tests/io/pytables/test_hdf_store.py @@ -42,14 +42,16 @@ Term, read_hdf, ) -from pandas.tests.io.pytables.common import (xfail_non_writeable, - tables, - ensure_clean_path, - ensure_clean_store, - create_tempfile, - safe_close, - safe_remove, - _maybe_remove) +from pandas.tests.io.pytables.common import ( + xfail_non_writeable, + tables, + ensure_clean_path, + ensure_clean_store, + create_tempfile, + safe_close, + safe_remove, + _maybe_remove, +) from pandas.io import pytables as pytables # noqa: E402 isort:skip from pandas.io.pytables import TableIterator # noqa: E402 isort:skip @@ -60,6 +62,7 @@ "ignore:object name:tables.exceptions.NaturalNameWarning" ) + @pytest.mark.single class TestHDFStore: def test_format_kwarg_in_constructor(self, setup_path): @@ -4754,5 +4757,3 @@ def test_to_hdf_multiindex_extension_dtype(self, idx, setup_path): with ensure_clean_path(setup_path) as path: with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): df.to_hdf(path, "df") - - diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 6d7b73ee6e803..ed529d538f0e3 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -6,18 +6,14 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import ( - DataFrame, - DatetimeIndex, - Series, - Timestamp, - date_range, -) +from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, set_timezone -from pandas.tests.io.pytables.common import (ensure_clean_path, - ensure_clean_store, - _maybe_remove) +from pandas.tests.io.pytables.common import ( + ensure_clean_path, + ensure_clean_store, + _maybe_remove, +) class TestTimezones: From eb14b23258130328044f9b59015d7177b35eab00 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 11:45:18 +0100 Subject: [PATCH 12/28] amended scope for setup_mode to module as the future state of test modules will be functional --- pandas/tests/io/pytables/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/pytables/conftest.py b/pandas/tests/io/pytables/conftest.py index 365fc23a402fd..6164f5d0722cc 100644 --- a/pandas/tests/io/pytables/conftest.py +++ b/pandas/tests/io/pytables/conftest.py @@ -9,7 +9,7 @@ def setup_path(): return "tmp.__{}__.h5".format(tm.rands(10)) -@pytest.fixture(scope="class", autouse=True) +@pytest.fixture(scope="module", autouse=True) def setup_mode(): """ Reset testing mode fixture""" tm.reset_testing_mode() From 7b640fc0675f7798535115dc1e261fdd564a3a51 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 11:51:54 +0100 Subject: [PATCH 13/28] running isort to import in the right order --- pandas/tests/io/pytables/common.py | 8 ++++---- .../io/pytables/test_hdf_complex_values.py | 10 +++++----- 
pandas/tests/io/pytables/test_hdf_store.py | 20 +++++++++---------- pandas/tests/io/pytables/test_timezones.py | 6 +++--- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/tests/io/pytables/common.py b/pandas/tests/io/pytables/common.py index 2a327ce5ad1d6..88e39d6db7172 100644 --- a/pandas/tests/io/pytables/common.py +++ b/pandas/tests/io/pytables/common.py @@ -1,12 +1,12 @@ from contextlib import contextmanager -import os -import pytest -import numpy as np from distutils.version import LooseVersion +import os import tempfile -from pandas.io.pytables import HDFStore +import numpy as np +import pytest +from pandas.io.pytables import HDFStore # TODO: # remove when gh-24839 is fixed; this affects numpy 1.16 diff --git a/pandas/tests/io/pytables/test_hdf_complex_values.py b/pandas/tests/io/pytables/test_hdf_complex_values.py index c7553471aea6b..e0787f46c1b08 100644 --- a/pandas/tests/io/pytables/test_hdf_complex_values.py +++ b/pandas/tests/io/pytables/test_hdf_complex_values.py @@ -5,15 +5,15 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm -from pandas.util.testing import assert_frame_equal - -from pandas.io.pytables import read_hdf from pandas.tests.io.pytables.common import ( - xfail_non_writeable, ensure_clean_path, ensure_clean_store, + xfail_non_writeable, ) +import pandas.util.testing as tm +from pandas.util.testing import assert_frame_equal + +from pandas.io.pytables import read_hdf class TestHDFComplexValues: diff --git a/pandas/tests/io/pytables/test_hdf_store.py b/pandas/tests/io/pytables/test_hdf_store.py index 536d62ae393a7..e8f105db298dd 100644 --- a/pandas/tests/io/pytables/test_hdf_store.py +++ b/pandas/tests/io/pytables/test_hdf_store.py @@ -32,6 +32,16 @@ isna, timedelta_range, ) +from pandas.tests.io.pytables.common import ( + _maybe_remove, + create_tempfile, + ensure_clean_path, + ensure_clean_store, + safe_close, + safe_remove, + tables, + xfail_non_writeable, +) import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -42,16 +52,6 @@ Term, read_hdf, ) -from pandas.tests.io.pytables.common import ( - xfail_non_writeable, - tables, - ensure_clean_path, - ensure_clean_store, - create_tempfile, - safe_close, - safe_remove, - _maybe_remove, -) from pandas.io import pytables as pytables # noqa: E402 isort:skip from pandas.io.pytables import TableIterator # noqa: E402 isort:skip diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index ed529d538f0e3..3a75c1b84b628 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -7,13 +7,13 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range -import pandas.util.testing as tm -from pandas.util.testing import assert_frame_equal, set_timezone from pandas.tests.io.pytables.common import ( + _maybe_remove, ensure_clean_path, ensure_clean_store, - _maybe_remove, ) +import pandas.util.testing as tm +from pandas.util.testing import assert_frame_equal, set_timezone class TestTimezones: From 44c56f189996de4c1d2f349ce2273231fab1361e Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 12:00:55 +0100 Subject: [PATCH 14/28] moving tables from common.py to the modules requiring this --- pandas/tests/io/pytables/common.py | 22 ---------------- .../io/pytables/test_hdf_complex_values.py | 26 +++++++++++++++---- pandas/tests/io/pytables/test_hdf_store.py | 22 ++++++++++++++-- 3 
files changed, 41 insertions(+), 29 deletions(-)

diff --git a/pandas/tests/io/pytables/common.py b/pandas/tests/io/pytables/common.py
index 88e39d6db7172..d4f3e01a28a79 100644
--- a/pandas/tests/io/pytables/common.py
+++ b/pandas/tests/io/pytables/common.py
@@ -1,31 +1,9 @@
 from contextlib import contextmanager
-from distutils.version import LooseVersion
 import os
 import tempfile
 
-import numpy as np
-import pytest
-
 from pandas.io.pytables import HDFStore
 
-# TODO:
-# remove when gh-24839 is fixed; this affects numpy 1.16
-# and pytables 3.4.4
-tables = pytest.importorskip("tables")
-xfail_non_writeable = pytest.mark.xfail(
-    LooseVersion(np.__version__) >= LooseVersion("1.16")
-    and LooseVersion(tables.__version__) < LooseVersion("3.5.1"),
-    reason=(
-        "gh-25511, gh-24839. pytables needs a "
-        "release beyong 3.4.4 to support numpy 1.16x"
-    ),
-)
-
-# set these parameters so we don't have file sharing
-tables.parameters.MAX_NUMEXPR_THREADS = 1
-tables.parameters.MAX_BLOSC_THREADS = 1
-tables.parameters.MAX_THREADS = 1
-
 
 def safe_remove(path):
     if path is not None:
diff --git a/pandas/tests/io/pytables/test_hdf_complex_values.py b/pandas/tests/io/pytables/test_hdf_complex_values.py
index e0787f46c1b08..a50aa9e7b2561 100644
--- a/pandas/tests/io/pytables/test_hdf_complex_values.py
+++ b/pandas/tests/io/pytables/test_hdf_complex_values.py
@@ -1,3 +1,4 @@
+from distutils.version import LooseVersion
 from warnings import catch_warnings
 
 import numpy as np
@@ -5,16 +6,31 @@
 
 import pandas as pd
 from pandas import DataFrame, Series
-from pandas.tests.io.pytables.common import (
-    ensure_clean_path,
-    ensure_clean_store,
-    xfail_non_writeable,
-)
+from pandas.tests.io.pytables.common import ensure_clean_path, ensure_clean_store
 import pandas.util.testing as tm
 from pandas.util.testing import assert_frame_equal
 
 from pandas.io.pytables import read_hdf
 
+# TODO:
+# remove when gh-24839 is fixed; this affects numpy 1.16
+# and pytables 3.4.4
+tables = pytest.importorskip("tables")
+# set these parameters so we don't have file sharing
+tables.parameters.MAX_NUMEXPR_THREADS = 1
+tables.parameters.MAX_BLOSC_THREADS = 1
+tables.parameters.MAX_THREADS = 1
+
+
+xfail_non_writeable = pytest.mark.xfail(
+    LooseVersion(np.__version__) >= LooseVersion("1.16")
+    and LooseVersion(tables.__version__) < LooseVersion("3.5.1"),
+    reason=(
+        "gh-25511, gh-24839. pytables needs a "
+        "release beyong 3.4.4 to support numpy 1.16x"
+    ),
+)
+
 
 class TestHDFComplexValues:
     # GH10447
diff --git a/pandas/tests/io/pytables/test_hdf_store.py b/pandas/tests/io/pytables/test_hdf_store.py
index e8f105db298dd..43d2865df8730 100644
--- a/pandas/tests/io/pytables/test_hdf_store.py
+++ b/pandas/tests/io/pytables/test_hdf_store.py
@@ -39,8 +39,6 @@
     ensure_clean_store,
     safe_close,
     safe_remove,
-    tables,
-    xfail_non_writeable,
 )
 import pandas.util.testing as tm
 from pandas.util.testing import assert_frame_equal, assert_series_equal
@@ -57,6 +55,26 @@
 from pandas.io.pytables import TableIterator  # noqa: E402 isort:skip
 
 
+# TODO:
+# remove when gh-24839 is fixed; this affects numpy 1.16
+# and pytables 3.4.4
+tables = pytest.importorskip("tables")
+# set these parameters so we don't have file sharing
+tables.parameters.MAX_NUMEXPR_THREADS = 1
+tables.parameters.MAX_BLOSC_THREADS = 1
+tables.parameters.MAX_THREADS = 1
+
+
+xfail_non_writeable = pytest.mark.xfail(
+    LooseVersion(np.__version__) >= LooseVersion("1.16")
+    and LooseVersion(tables.__version__) < LooseVersion("3.5.1"),
+    reason=(
+        "gh-25511, gh-24839. pytables needs a "
+        "release beyong 3.4.4 to support numpy 1.16x"
+    ),
+)
+
+
 _default_compressor = "blosc"
 ignore_natural_naming_warning = pytest.mark.filterwarnings(
     "ignore:object name:tables.exceptions.NaturalNameWarning"

From 595670e37826eaaa272c4fbc27250226917cbb82 Mon Sep 17 00:00:00 2001
From: Tola Alade
Date: Wed, 2 Oct 2019 12:03:29 +0100
Subject: [PATCH 15/28] renaming test files to make them sexier

---
 .../io/pytables/{test_hdf_complex_values.py => test_complex.py} | 0
 pandas/tests/io/pytables/{test_hdf_store.py => test_store.py}   | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename pandas/tests/io/pytables/{test_hdf_complex_values.py => test_complex.py} (100%)
 rename pandas/tests/io/pytables/{test_hdf_store.py => test_store.py} (100%)

diff --git a/pandas/tests/io/pytables/test_hdf_complex_values.py b/pandas/tests/io/pytables/test_complex.py
similarity index 100%
rename from pandas/tests/io/pytables/test_hdf_complex_values.py
rename to pandas/tests/io/pytables/test_complex.py
diff --git a/pandas/tests/io/pytables/test_hdf_store.py b/pandas/tests/io/pytables/test_store.py
similarity index 100%
rename from pandas/tests/io/pytables/test_hdf_store.py
rename to pandas/tests/io/pytables/test_store.py

From 515825aa03edec48a576cc1ff14bdc3d241fdd68 Mon Sep 17 00:00:00 2001
From: Tola Alade
Date: Wed, 2 Oct 2019 14:38:58 +0100
Subject: [PATCH 16/28] copied pytest.importorskip to timezone

---
 pandas/tests/io/pytables/test_timezones.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py
index 3a75c1b84b628..5f7fc8d822300 100644
--- a/pandas/tests/io/pytables/test_timezones.py
+++ b/pandas/tests/io/pytables/test_timezones.py
@@ -15,6 +15,12 @@
 import pandas.util.testing as tm
 from pandas.util.testing import assert_frame_equal, set_timezone
 
+tables = pytest.importorskip("tables")
+# set these parameters so we don't have file sharing
+tables.parameters.MAX_NUMEXPR_THREADS = 1
+tables.parameters.MAX_BLOSC_THREADS = 1
+tables.parameters.MAX_THREADS = 1
+
 
 class TestTimezones:

From 6106a24badf23f6a3045d20c75532340e262910a Mon Sep 17 00:00:00 2001
From: Tola Alade
Date: Wed, 2 Oct 2019 14:43:31 +0100
Subject: [PATCH 17/28] removed test_timezone class and kept module functional

---
 pandas/tests/io/pytables/test_timezones.py | 636 +++++++++++----------
 1 file changed, 321 insertions(+), 315 deletions(-)

diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py
index 5f7fc8d822300..d811c5def7d96 100644
--- a/pandas/tests/io/pytables/test_timezones.py
+++ b/pandas/tests/io/pytables/test_timezones.py
@@ -22,368 +22,374 @@
 tables.parameters.MAX_THREADS = 1
 
 
-class TestTimezones:
-    def _compare_with_tz(self, a, b):
-        tm.assert_frame_equal(a, b)
-
-        # compare the zones on each element
-        for c in a.columns:
-            for i in a.index:
-                a_e = a.loc[i, c]
-                b_e = b.loc[i, c]
-                if not (a_e == b_e and a_e.tz == b_e.tz):
-                    raise AssertionError(
-                        "invalid tz comparison [{a_e}] [{b_e}]".format(a_e=a_e, b_e=b_e)
-                    )
-
-    def test_append_with_timezones_dateutil(self, setup_path):
-
-        from datetime import timedelta
-
-        # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows
-        # filename issues.
- from pandas._libs.tslibs.timezones import maybe_get_tz - - gettz = lambda x: maybe_get_tz("dateutil/" + x) - - # as columns - with ensure_clean_store(setup_path) as store: - - _maybe_remove(store, "df_tz") - df = DataFrame( - dict( - A=[ - Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) - + timedelta(hours=1) * i - for i in range(5) - ] +def _compare_with_tz(a, b): + tm.assert_frame_equal(a, b) + + # compare the zones on each element + for c in a.columns: + for i in a.index: + a_e = a.loc[i, c] + b_e = b.loc[i, c] + if not (a_e == b_e and a_e.tz == b_e.tz): + raise AssertionError( + "invalid tz comparison [{a_e}] [{b_e}]".format(a_e=a_e, b_e=b_e) ) - ) - store.append("df_tz", df, data_columns=["A"]) - result = store["df_tz"] - self._compare_with_tz(result, df) - assert_frame_equal(result, df) - # select with tz aware - expected = df[df.A >= df.A[3]] - result = store.select("df_tz", where="A>=df.A[3]") - self._compare_with_tz(result, expected) - - # ensure we include dates in DST and STD time here. - _maybe_remove(store, "df_tz") - df = DataFrame( - dict( - A=Timestamp("20130102", tz=gettz("US/Eastern")), - B=Timestamp("20130603", tz=gettz("US/Eastern")), - ), - index=range(5), +def test_append_with_timezones_dateutil(setup_path): + + from datetime import timedelta + + # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows + # filename issues. + from pandas._libs.tslibs.timezones import maybe_get_tz + + gettz = lambda x: maybe_get_tz("dateutil/" + x) + + # as columns + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=[ + Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) + + timedelta(hours=1) * i + for i in range(5) + ] ) + ) + + store.append("df_tz", df, data_columns=["A"]) + result = store["df_tz"] + _compare_with_tz(result, df) + assert_frame_equal(result, df) + + # select with tz aware + expected = df[df.A >= df.A[3]] + result = store.select("df_tz", where="A>=df.A[3]") + _compare_with_tz(result, expected) + + # ensure we include dates in DST and STD time here. 
+ _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130603", tz=gettz("US/Eastern")), + ), + index=range(5), + ) + store.append("df_tz", df) + result = store["df_tz"] + _compare_with_tz(result, df) + assert_frame_equal(result, df) + + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130102", tz=gettz("EET")), + ), + index=range(5), + ) + with pytest.raises(ValueError): store.append("df_tz", df) - result = store["df_tz"] - self._compare_with_tz(result, df) - assert_frame_equal(result, df) - df = DataFrame( - dict( - A=Timestamp("20130102", tz=gettz("US/Eastern")), - B=Timestamp("20130102", tz=gettz("EET")), - ), - index=range(5), - ) - with pytest.raises(ValueError): - store.append("df_tz", df) - - # this is ok - _maybe_remove(store, "df_tz") - store.append("df_tz", df, data_columns=["A", "B"]) - result = store["df_tz"] - self._compare_with_tz(result, df) - assert_frame_equal(result, df) + # this is ok + _maybe_remove(store, "df_tz") + store.append("df_tz", df, data_columns=["A", "B"]) + result = store["df_tz"] + _compare_with_tz(result, df) + assert_frame_equal(result, df) - # can't append with diff timezone - df = DataFrame( - dict( - A=Timestamp("20130102", tz=gettz("US/Eastern")), - B=Timestamp("20130102", tz=gettz("CET")), - ), - index=range(5), - ) - with pytest.raises(ValueError): - store.append("df_tz", df) - - # as index - with ensure_clean_store(setup_path) as store: - - # GH 4098 example - df = DataFrame( - dict( - A=Series( - range(3), - index=date_range( - "2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern") - ), - ) + # can't append with diff timezone + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130102", tz=gettz("CET")), + ), + index=range(5), + ) + with pytest.raises(ValueError): + store.append("df_tz", df) + + # as index + with ensure_clean_store(setup_path) as store: + + # GH 4098 example + df = DataFrame( + dict( + A=Series( + range(3), + index=date_range( + "2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern") + ), ) ) + ) - _maybe_remove(store, "df") - store.put("df", df) - result = store.select("df") - assert_frame_equal(result, df) + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") + assert_frame_equal(result, df) - _maybe_remove(store, "df") - store.append("df", df) - result = store.select("df") - assert_frame_equal(result, df) + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") + assert_frame_equal(result, df) - def test_append_with_timezones_pytz(self, setup_path): - from datetime import timedelta +def test_append_with_timezones_pytz(setup_path): - # as columns - with ensure_clean_store(setup_path) as store: + from datetime import timedelta - _maybe_remove(store, "df_tz") - df = DataFrame( - dict( - A=[ - Timestamp("20130102 2:00:00", tz="US/Eastern") - + timedelta(hours=1) * i - for i in range(5) - ] - ) - ) - store.append("df_tz", df, data_columns=["A"]) - result = store["df_tz"] - self._compare_with_tz(result, df) - assert_frame_equal(result, df) + # as columns + with ensure_clean_store(setup_path) as store: - # select with tz aware - self._compare_with_tz( - store.select("df_tz", where="A>=df.A[3]"), df[df.A >= df.A[3]] + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=[ + Timestamp("20130102 2:00:00", tz="US/Eastern") + + timedelta(hours=1) * i + for i in range(5) + ] ) + ) + store.append("df_tz", df, 
data_columns=["A"]) + result = store["df_tz"] + _compare_with_tz(result, df) + assert_frame_equal(result, df) + + # select with tz aware + _compare_with_tz( + store.select("df_tz", where="A>=df.A[3]"), df[df.A >= df.A[3]] + ) - _maybe_remove(store, "df_tz") - # ensure we include dates in DST and STD time here. - df = DataFrame( - dict( - A=Timestamp("20130102", tz="US/Eastern"), - B=Timestamp("20130603", tz="US/Eastern"), - ), - index=range(5), - ) + _maybe_remove(store, "df_tz") + # ensure we include dates in DST and STD time here. + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130603", tz="US/Eastern"), + ), + index=range(5), + ) + store.append("df_tz", df) + result = store["df_tz"] + _compare_with_tz(result, df) + assert_frame_equal(result, df) + + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130102", tz="EET"), + ), + index=range(5), + ) + with pytest.raises(ValueError): store.append("df_tz", df) - result = store["df_tz"] - self._compare_with_tz(result, df) - assert_frame_equal(result, df) - df = DataFrame( - dict( - A=Timestamp("20130102", tz="US/Eastern"), - B=Timestamp("20130102", tz="EET"), - ), - index=range(5), - ) - with pytest.raises(ValueError): - store.append("df_tz", df) - - # this is ok - _maybe_remove(store, "df_tz") - store.append("df_tz", df, data_columns=["A", "B"]) - result = store["df_tz"] - self._compare_with_tz(result, df) - assert_frame_equal(result, df) + # this is ok + _maybe_remove(store, "df_tz") + store.append("df_tz", df, data_columns=["A", "B"]) + result = store["df_tz"] + _compare_with_tz(result, df) + assert_frame_equal(result, df) - # can't append with diff timezone - df = DataFrame( - dict( - A=Timestamp("20130102", tz="US/Eastern"), - B=Timestamp("20130102", tz="CET"), - ), - index=range(5), - ) - with pytest.raises(ValueError): - store.append("df_tz", df) - - # as index - with ensure_clean_store(setup_path) as store: - - # GH 4098 example - df = DataFrame( - dict( - A=Series( - range(3), - index=date_range( - "2000-1-1", periods=3, freq="H", tz="US/Eastern" - ), - ) + # can't append with diff timezone + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130102", tz="CET"), + ), + index=range(5), + ) + with pytest.raises(ValueError): + store.append("df_tz", df) + + # as index + with ensure_clean_store(setup_path) as store: + + # GH 4098 example + df = DataFrame( + dict( + A=Series( + range(3), + index=date_range("2000-1-1", periods=3, freq="H", tz="US/Eastern"), ) ) + ) - _maybe_remove(store, "df") - store.put("df", df) - result = store.select("df") - assert_frame_equal(result, df) + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") + assert_frame_equal(result, df) - _maybe_remove(store, "df") - store.append("df", df) - result = store.select("df") - assert_frame_equal(result, df) + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") + assert_frame_equal(result, df) + + +def test_tseries_select_index_column(setup_path): + # GH7777 + # selecting a UTC datetimeindex column did + # not preserve UTC tzinfo set before storing + + # check that no tz still works + rng = date_range("1/1/2000", "1/30/2000") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == DatetimeIndex(result.values).tz + + # check utc + rng = date_range("1/1/2000", 
"1/30/2000", tz="UTC") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - def test_tseries_select_index_column(self, setup_path): - # GH7777 - # selecting a UTC datetimeindex column did - # not preserve UTC tzinfo set before storing + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == result.dt.tz - # check that no tz still works - rng = date_range("1/1/2000", "1/30/2000") - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + # double check non-utc + rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - with ensure_clean_store(setup_path) as store: - store.append("frame", frame) - result = store.select_column("frame", "index") - assert rng.tz == DatetimeIndex(result.values).tz + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == result.dt.tz - # check utc - rng = date_range("1/1/2000", "1/30/2000", tz="UTC") - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - with ensure_clean_store(setup_path) as store: - store.append("frame", frame) - result = store.select_column("frame", "index") - assert rng.tz == result.dt.tz +def test_timezones_fixed(setup_path): + with ensure_clean_store(setup_path) as store: - # double check non-utc + # index rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + store["df"] = df + result = store["df"] + assert_frame_equal(result, df) + + # as data + # GH11411 + _maybe_remove(store, "df") + df = DataFrame( + { + "A": rng, + "B": rng.tz_convert("UTC").tz_localize(None), + "C": rng.tz_convert("CET"), + "D": range(len(rng)), + }, + index=rng, + ) + store["df"] = df + result = store["df"] + assert_frame_equal(result, df) - with ensure_clean_store(setup_path) as store: - store.append("frame", frame) - result = store.select_column("frame", "index") - assert rng.tz == result.dt.tz - def test_timezones_fixed(self, setup_path): - with ensure_clean_store(setup_path) as store: +def test_fixed_offset_tz(setup_path): + rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - # index - rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - store["df"] = df - result = store["df"] - assert_frame_equal(result, df) + with ensure_clean_store(setup_path) as store: + store["frame"] = frame + recons = store["frame"] + tm.assert_index_equal(recons.index, rng) + assert rng.tz == recons.index.tz - # as data - # GH11411 - _maybe_remove(store, "df") - df = DataFrame( - { - "A": rng, - "B": rng.tz_convert("UTC").tz_localize(None), - "C": rng.tz_convert("CET"), - "D": range(len(rng)), - }, - index=rng, - ) - store["df"] = df - result = store["df"] - assert_frame_equal(result, df) - def test_fixed_offset_tz(self, setup_path): - rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00") - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) +@td.skip_if_windows +def test_store_timezone(setup_path): + # GH2852 + # issue storing datetime.date with a timezone as it resets when read + # back in a new timezone - with ensure_clean_store(setup_path) as store: - store["frame"] = frame - recons = store["frame"] - 
tm.assert_index_equal(recons.index, rng) - assert rng.tz == recons.index.tz + # original method + with ensure_clean_store(setup_path) as store: - @td.skip_if_windows - def test_store_timezone(self, setup_path): - # GH2852 - # issue storing datetime.date with a timezone as it resets when read - # back in a new timezone + today = datetime.date(2013, 9, 10) + df = DataFrame([1, 2, 3], index=[today, today, today]) + store["obj1"] = df + result = store["obj1"] + assert_frame_equal(result, df) - # original method - with ensure_clean_store(setup_path) as store: + # with tz setting + with ensure_clean_store(setup_path) as store: + with set_timezone("EST5EDT"): today = datetime.date(2013, 9, 10) df = DataFrame([1, 2, 3], index=[today, today, today]) store["obj1"] = df - result = store["obj1"] - assert_frame_equal(result, df) - - # with tz setting - with ensure_clean_store(setup_path) as store: - with set_timezone("EST5EDT"): - today = datetime.date(2013, 9, 10) - df = DataFrame([1, 2, 3], index=[today, today, today]) - store["obj1"] = df + with set_timezone("CST6CDT"): + result = store["obj1"] - with set_timezone("CST6CDT"): - result = store["obj1"] + assert_frame_equal(result, df) + + +def test_legacy_datetimetz_object(datapath, setup_path): + # legacy from < 0.17.0 + # 8260 + expected = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), B=Timestamp("20130603", tz="CET") + ), + index=range(5), + ) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r" + ) as store: + result = store["df"] + assert_frame_equal(result, expected) + + +def test_dst_transitions(setup_path): + # make sure we are not failing on transitions + with ensure_clean_store(setup_path) as store: + times = pd.date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="H", + ambiguous="infer", + ) + for i in [times, times + pd.Timedelta("10min")]: + _maybe_remove(store, "df") + df = DataFrame({"A": range(len(i)), "B": i}, index=i) + store.append("df", df) + result = store.select("df") assert_frame_equal(result, df) - def test_legacy_datetimetz_object(self, datapath, setup_path): - # legacy from < 0.17.0 - # 8260 - expected = DataFrame( - dict( - A=Timestamp("20130102", tz="US/Eastern"), - B=Timestamp("20130603", tz="CET"), - ), - index=range(5), - ) - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r" - ) as store: - result = store["df"] - assert_frame_equal(result, expected) - - def test_dst_transitions(self, setup_path): - # make sure we are not failing on transitions - with ensure_clean_store(setup_path) as store: - times = pd.date_range( - "2013-10-26 23:00", - "2013-10-27 01:00", - tz="Europe/London", - freq="H", - ambiguous="infer", - ) - for i in [times, times + pd.Timedelta("10min")]: - _maybe_remove(store, "df") - df = DataFrame({"A": range(len(i)), "B": i}, index=i) - store.append("df", df) - result = store.select("df") - assert_frame_equal(result, df) - - def test_read_with_where_tz_aware_index(self, setup_path): - # GH 11926 - periods = 10 - dts = pd.date_range("20151201", periods=periods, freq="D", tz="UTC") - mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"]) - expected = pd.DataFrame({"MYCOL": 0}, index=mi) - - key = "mykey" - with ensure_clean_path(setup_path) as path: - with pd.HDFStore(path) as store: - store.append(key, expected, format="table", append=True) - result = pd.read_hdf(path, key, where="DATE > 20151130") - assert_frame_equal(result, expected) - - def 
test_py2_created_with_datetimez(self, datapath, setup_path): - # The test HDF5 file was created in Python 2, but could not be read in - # Python 3. - # - # GH26443 - index = [pd.Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] - expected = DataFrame({"data": 123}, index=index) - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" - ) as store: - result = store["key"] - assert_frame_equal(result, expected) +def test_read_with_where_tz_aware_index(setup_path): + # GH 11926 + periods = 10 + dts = pd.date_range("20151201", periods=periods, freq="D", tz="UTC") + mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"]) + expected = pd.DataFrame({"MYCOL": 0}, index=mi) + + key = "mykey" + with ensure_clean_path(setup_path) as path: + with pd.HDFStore(path) as store: + store.append(key, expected, format="table", append=True) + result = pd.read_hdf(path, key, where="DATE > 20151130") + assert_frame_equal(result, expected) + + +def test_py2_created_with_datetimez(datapath, setup_path): + # The test HDF5 file was created in Python 2, but could not be read in + # Python 3. + # + # GH26443 + index = [pd.Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] + expected = DataFrame({"data": 123}, index=index) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" + ) as store: + result = store["key"] + assert_frame_equal(result, expected) From 8f540a7fda8fcdcd3e550f46973f9906d42e4931 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 14:45:24 +0100 Subject: [PATCH 18/28] removed test_hdfcomplexvalues class and kept module functional --- pandas/tests/io/pytables/test_complex.py | 317 +++++++++++------------ 1 file changed, 158 insertions(+), 159 deletions(-) diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index a50aa9e7b2561..78a5f58af1f45 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -32,166 +32,165 @@ ) -class TestHDFComplexValues: - # GH10447 - - def test_complex_fixed(self, setup_path): - df = DataFrame( - np.random.rand(4, 5).astype(np.complex64), - index=list("abcd"), - columns=list("ABCDE"), - ) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - df = DataFrame( - np.random.rand(4, 5).astype(np.complex128), - index=list("abcd"), - columns=list("ABCDE"), - ) - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - def test_complex_table(self, setup_path): - df = DataFrame( - np.random.rand(4, 5).astype(np.complex64), - index=list("abcd"), - columns=list("ABCDE"), - ) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - df = DataFrame( - np.random.rand(4, 5).astype(np.complex128), - index=list("abcd"), - columns=list("ABCDE"), - ) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table", mode="w") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - @xfail_non_writeable - def test_complex_mixed_fixed(self, setup_path): - complex64 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 - ) - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 - ) - df = DataFrame( - { - "A": [1, 2, 3, 4], 
- "B": ["a", "b", "c", "d"], - "C": complex64, - "D": complex128, - "E": [1.0, 2.0, 3.0, 4.0], - }, - index=list("abcd"), - ) - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - def test_complex_mixed_table(self, setup_path): - complex64 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 - ) - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 - ) - df = DataFrame( - { - "A": [1, 2, 3, 4], - "B": ["a", "b", "c", "d"], - "C": complex64, - "D": complex128, - "E": [1.0, 2.0, 3.0, 4.0], - }, - index=list("abcd"), - ) - - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=["A", "B"]) - result = store.select("df", where="A>2") - assert_frame_equal(df.loc[df.A > 2], result) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table") - reread = read_hdf(path, "df") - assert_frame_equal(df, reread) - - def test_complex_across_dimensions_fixed(self, setup_path): - with catch_warnings(record=True): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list("abcd")) - df = DataFrame({"A": s, "B": s}) - - objs = [s, df] - comps = [tm.assert_series_equal, tm.assert_frame_equal] - for obj, comp in zip(objs, comps): - with ensure_clean_path(setup_path) as path: - obj.to_hdf(path, "obj", format="fixed") - reread = read_hdf(path, "obj") - comp(obj, reread) - - def test_complex_across_dimensions(self, setup_path): +# GH10447 + +def test_complex_fixed(setup_path): + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + +def test_complex_table(setup_path): + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", mode="w") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + +@xfail_non_writeable +def test_complex_mixed_fixed(setup_path): + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + +def test_complex_mixed_table(setup_path): + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = 
DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["A", "B"]) + result = store.select("df", where="A>2") + assert_frame_equal(df.loc[df.A > 2], result) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") + assert_frame_equal(df, reread) + +def test_complex_across_dimensions_fixed(setup_path): + with catch_warnings(record=True): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) s = Series(complex128, index=list("abcd")) df = DataFrame({"A": s, "B": s}) - with catch_warnings(record=True): - - objs = [df] - comps = [tm.assert_frame_equal] - for obj, comp in zip(objs, comps): - with ensure_clean_path(setup_path) as path: - obj.to_hdf(path, "obj", format="table") - reread = read_hdf(path, "obj") - comp(obj, reread) - - def test_complex_indexing_error(self, setup_path): - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 - ) - df = DataFrame( - {"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128}, - index=list("abcd"), - ) - with ensure_clean_store(setup_path) as store: - with pytest.raises(TypeError): - store.append("df", df, data_columns=["C"]) - - def test_complex_series_error(self, setup_path): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list("abcd")) - - with ensure_clean_path(setup_path) as path: - with pytest.raises(TypeError): - s.to_hdf(path, "obj", format="t") - - with ensure_clean_path(setup_path) as path: - s.to_hdf(path, "obj", format="t", index=False) - reread = read_hdf(path, "obj") - tm.assert_series_equal(s, reread) - - def test_complex_append(self, setup_path): - df = DataFrame( - {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)} - ) - - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=["b"]) - store.append("df", df) - result = store.select("df") - assert_frame_equal(pd.concat([df, df], 0), result) + objs = [s, df] + comps = [tm.assert_series_equal, tm.assert_frame_equal] + for obj, comp in zip(objs, comps): + with ensure_clean_path(setup_path) as path: + obj.to_hdf(path, "obj", format="fixed") + reread = read_hdf(path, "obj") + comp(obj, reread) + +def test_complex_across_dimensions(setup_path): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) + + with catch_warnings(record=True): + + objs = [df] + comps = [tm.assert_frame_equal] + for obj, comp in zip(objs, comps): + with ensure_clean_path(setup_path) as path: + obj.to_hdf(path, "obj", format="table") + reread = read_hdf(path, "obj") + comp(obj, reread) + +def test_complex_indexing_error(setup_path): + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + {"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128}, + index=list("abcd"), + ) + with ensure_clean_store(setup_path) as store: + with pytest.raises(TypeError): + store.append("df", df, data_columns=["C"]) + +def test_complex_series_error(setup_path): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + + with ensure_clean_path(setup_path) as path: + with 
pytest.raises(TypeError): + s.to_hdf(path, "obj", format="t") + + with ensure_clean_path(setup_path) as path: + s.to_hdf(path, "obj", format="t", index=False) + reread = read_hdf(path, "obj") + tm.assert_series_equal(s, reread) + +def test_complex_append(setup_path): + df = DataFrame( + {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)} + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["b"]) + store.append("df", df) + result = store.select("df") + assert_frame_equal(pd.concat([df, df], 0), result) From cfdd437f3b8091f017e8e175b3b908a6ffe2a52e Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 16:08:40 +0100 Subject: [PATCH 19/28] convert test_store back to class to make git diff easier for review --- pandas/tests/io/pytables/test_complex.py | 9 +++++++++ pandas/tests/io/pytables/test_timezones.py | 4 +--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 78a5f58af1f45..06e5edbc5294c 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -34,6 +34,7 @@ # GH10447 + def test_complex_fixed(setup_path): df = DataFrame( np.random.rand(4, 5).astype(np.complex64), @@ -56,6 +57,7 @@ def test_complex_fixed(setup_path): reread = read_hdf(path, "df") assert_frame_equal(df, reread) + def test_complex_table(setup_path): df = DataFrame( np.random.rand(4, 5).astype(np.complex64), @@ -79,6 +81,7 @@ def test_complex_table(setup_path): reread = read_hdf(path, "df") assert_frame_equal(df, reread) + @xfail_non_writeable def test_complex_mixed_fixed(setup_path): complex64 = np.array( @@ -102,6 +105,7 @@ def test_complex_mixed_fixed(setup_path): reread = read_hdf(path, "df") assert_frame_equal(df, reread) + def test_complex_mixed_table(setup_path): complex64 = np.array( [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 @@ -130,6 +134,7 @@ def test_complex_mixed_table(setup_path): reread = read_hdf(path, "df") assert_frame_equal(df, reread) + def test_complex_across_dimensions_fixed(setup_path): with catch_warnings(record=True): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) @@ -144,6 +149,7 @@ def test_complex_across_dimensions_fixed(setup_path): reread = read_hdf(path, "obj") comp(obj, reread) + def test_complex_across_dimensions(setup_path): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) s = Series(complex128, index=list("abcd")) @@ -159,6 +165,7 @@ def test_complex_across_dimensions(setup_path): reread = read_hdf(path, "obj") comp(obj, reread) + def test_complex_indexing_error(setup_path): complex128 = np.array( [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 @@ -171,6 +178,7 @@ def test_complex_indexing_error(setup_path): with pytest.raises(TypeError): store.append("df", df, data_columns=["C"]) + def test_complex_series_error(setup_path): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) s = Series(complex128, index=list("abcd")) @@ -184,6 +192,7 @@ def test_complex_series_error(setup_path): reread = read_hdf(path, "obj") tm.assert_series_equal(s, reread) + def test_complex_append(setup_path): df = DataFrame( {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)} diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index d811c5def7d96..3e91c638190c9 100644 --- 
a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -161,9 +161,7 @@ def test_append_with_timezones_pytz(setup_path): assert_frame_equal(result, df) # select with tz aware - _compare_with_tz( - store.select("df_tz", where="A>=df.A[3]"), df[df.A >= df.A[3]] - ) + _compare_with_tz(store.select("df_tz", where="A>=df.A[3]"), df[df.A >= df.A[3]]) _maybe_remove(store, "df_tz") # ensure we include dates in DST and STD time here. From aa97d78efa5c6ad60c986092bdaf645aa4752b56 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 16:29:40 +0100 Subject: [PATCH 20/28] moved xfail_non_writeable into _test_decorators --- pandas/tests/io/pytables/test_complex.py | 15 ++------------- pandas/tests/io/pytables/test_store.py | 14 +------------- pandas/util/_test_decorators.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 26 deletions(-) diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 06e5edbc5294c..00cf0367b50ca 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.util._test_decorators import xfail_non_writeable + import pandas as pd from pandas import DataFrame, Series from pandas.tests.io.pytables.common import ensure_clean_path, ensure_clean_store @@ -12,9 +14,6 @@ from pandas.io.pytables import read_hdf -# TODO: -# remove when gh-24839 is fixed; this affects numpy 1.16 -# and pytables 3.4.4 tables = pytest.importorskip("tables") # set these parameters so we don't have file sharing tables.parameters.MAX_NUMEXPR_THREADS = 1 @@ -22,16 +21,6 @@ tables.parameters.MAX_THREADS = 1 -xfail_non_writeable = pytest.mark.xfail( - LooseVersion(np.__version__) >= LooseVersion("1.16") - and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), - reason=( - "gh-25511, gh-24839. pytables needs a " - "release beyong 3.4.4 to support numpy 1.16x" - ), -) - - # GH10447 diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 43d2865df8730..3ca9dd5729f62 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -11,6 +11,7 @@ from pandas.compat import PY36, is_platform_little_endian, is_platform_windows import pandas.util._test_decorators as td +from pandas.util._test_decorators import xfail_non_writeable from pandas.core.dtypes.common import is_categorical_dtype @@ -55,9 +56,6 @@ from pandas.io.pytables import TableIterator # noqa: E402 isort:skip -# TODO: -# remove when gh-24839 is fixed; this affects numpy 1.16 -# and pytables 3.4.4 tables = pytest.importorskip("tables") # set these parameters so we don't have file sharing tables.parameters.MAX_NUMEXPR_THREADS = 1 @@ -65,16 +63,6 @@ tables.parameters.MAX_THREADS = 1 -xfail_non_writeable = pytest.mark.xfail( - LooseVersion(np.__version__) >= LooseVersion("1.16") - and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), - reason=( - "gh-25511, gh-24839. 
pytables needs a " - "release beyong 3.4.4 to support numpy 1.16x" - ), -) - - _default_compressor = "blosc" ignore_natural_naming_warning = pytest.mark.filterwarnings( "ignore:object name:tables.exceptions.NaturalNameWarning" diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 0e07b9f5fe9f7..9ce7d9d50cd2e 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -73,6 +73,21 @@ def safe_import(mod_name, min_version=None): return False +# TODO: +# remove when gh-24839 is fixed; this affects numpy 1.16 +# and pytables 3.4.4 +mod = safe_import("numpy") +custom_mod = safe_import("tables") +xfail_non_writeable = pytest.mark.xfail( + LooseVersion(mod.__version__) >= LooseVersion("1.16") + and LooseVersion(custom_mod.__version__) < LooseVersion("3.5.1"), + reason=( + "gh-25511, gh-24839. pytables needs a " + "release beyong 3.4.4 to support numpy 1.16x" + ), +) + + def _skip_if_no_mpl(): mod = safe_import("matplotlib") if mod: From 993135278e06f2b994f7d4a159cd6d338a72e2da Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Wed, 2 Oct 2019 18:09:59 +0100 Subject: [PATCH 21/28] removed numpy from safe import --- pandas/util/_test_decorators.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 9ce7d9d50cd2e..e053e2e9d5f7c 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -26,7 +26,7 @@ def test_foo(): from distutils.version import LooseVersion import locale from typing import Callable, Optional - +import numpy as np import pytest from pandas.compat import is_platform_32bit, is_platform_windows @@ -76,11 +76,10 @@ def safe_import(mod_name, min_version=None): # TODO: # remove when gh-24839 is fixed; this affects numpy 1.16 # and pytables 3.4.4 -mod = safe_import("numpy") -custom_mod = safe_import("tables") +tables = safe_import("tables") xfail_non_writeable = pytest.mark.xfail( - LooseVersion(mod.__version__) >= LooseVersion("1.16") - and LooseVersion(custom_mod.__version__) < LooseVersion("3.5.1"), + LooseVersion(np.__version__) >= LooseVersion("1.16") + and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), reason=( "gh-25511, gh-24839. 
pytables needs a " "release beyong 3.4.4 to support numpy 1.16x" From 27d62e4e80b957d180ec1ef8074517ced0e8945f Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Thu, 3 Oct 2019 08:24:47 +0100 Subject: [PATCH 22/28] importing numpy instead of using safe import --- pandas/util/_test_decorators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index e053e2e9d5f7c..f5d76d0f5cdca 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -26,6 +26,7 @@ def test_foo(): from distutils.version import LooseVersion import locale from typing import Callable, Optional + import numpy as np import pytest From eeac6ce3c676000d558bf63a734675bdea5295f9 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Thu, 3 Oct 2019 11:33:44 +0100 Subject: [PATCH 23/28] removing unused imports --- pandas/tests/io/pytables/test_complex.py | 1 - pandas/tests/series/test_operators.py | 31 ++++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 00cf0367b50ca..4e85c4b0f9fc9 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -1,4 +1,3 @@ -from distutils.version import LooseVersion from warnings import catch_warnings import numpy as np diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index c2cf91e582c47..d072b5de799d0 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1,6 +1,6 @@ from datetime import datetime, timedelta import operator - +import re import numpy as np import pytest @@ -94,24 +94,25 @@ def test_logical_operators_int_dtype_with_int_scalar(self): def test_logical_operators_int_dtype_with_float(self): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") - - with pytest.raises(TypeError): + msg = r"cannot compare a dtyped \[int64\] array with a scalar of type \[float\]" + with pytest.raises(TypeError, match=msg): s_0123 & np.NaN - with pytest.raises(TypeError): + msg = r"cannot compare a dtyped \[int64\] array with a scalar of type \[bool\]" + with pytest.raises(TypeError, match=msg): s_0123 & 3.14 - with pytest.raises(TypeError): + with pytest.raises(TypeError, match="unsupported.* 'int' and 'float'"): s_0123 & [0.1, 4, 3.14, 2] - with pytest.raises(TypeError): + with pytest.raises(TypeError, match="unsupported.* 'int' and 'float'"): s_0123 & np.array([0.1, 4, 3.14, 2]) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match="unsupported.* 'int' and 'float'"): s_0123 & Series([0.1, 4, -3.14, 2]) def test_logical_operators_int_dtype_with_str(self): s_1111 = Series([1] * 4, dtype="int8") - - with pytest.raises(TypeError): + msg = r"cannot compare a dtyped \[int8\] array with a scalar of type \[bool\]" + with pytest.raises(TypeError, match=msg): s_1111 & "a" - with pytest.raises(TypeError): + with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): s_1111 & ["a", "b", "c", "d"] def test_logical_operators_int_dtype_with_bool(self): @@ -228,8 +229,8 @@ def test_logical_operators_int_dtype_with_bool_dtype_and_reindex(self): def test_scalar_na_logical_ops_corners(self): s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10]) - - with pytest.raises(TypeError): + msg = r"cannot compare a dtyped \[int64\] array with a scalar of type \[bool\]" + with pytest.raises(TypeError, match=msg): s & datetime(2005, 1, 1) s = Series([2, 3, 
4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)]) @@ -247,11 +248,11 @@ def test_scalar_na_logical_ops_corners(self): # this is an alignment issue; these are equivalent # https://github.com/pandas-dev/pandas/issues/5284 - - with pytest.raises(TypeError): + msg = r"cannot compare a dtyped \[float64\] array with a scalar of type \[float\]" + with pytest.raises(TypeError, match=msg): d.__and__(s, axis="columns") - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): s & d # this is wrong as its not a boolean result From 78c710ba8c56819b34ef259a0ed1edefa66d24a5 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Thu, 3 Oct 2019 11:41:18 +0100 Subject: [PATCH 24/28] added tables to xfail decorator --- pandas/tests/io/pytables/test_complex.py | 4 ++-- pandas/tests/io/pytables/test_store.py | 9 ++++----- pandas/tests/series/test_operators.py | 4 +++- pandas/util/_test_decorators.py | 3 ++- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 4e85c4b0f9fc9..96c4a30424d74 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas.util._test_decorators import xfail_non_writeable +import pandas.util._test_decorators as td import pandas as pd from pandas import DataFrame, Series @@ -70,7 +70,7 @@ def test_complex_table(setup_path): assert_frame_equal(df, reread) -@xfail_non_writeable +@td.xfail_non_writeable def test_complex_mixed_fixed(setup_path): complex64 = np.array( [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 3ca9dd5729f62..8a7ea9f2174bc 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -11,7 +11,6 @@ from pandas.compat import PY36, is_platform_little_endian, is_platform_windows import pandas.util._test_decorators as td -from pandas.util._test_decorators import xfail_non_writeable from pandas.core.dtypes.common import is_categorical_dtype @@ -812,7 +811,7 @@ def test_put_integer(self, setup_path): df = DataFrame(np.random.randn(50, 100)) self._check_roundtrip(df, tm.assert_frame_equal, setup_path) - @xfail_non_writeable + @td.xfail_non_writeable def test_put_mixed_type(self, setup_path): df = tm.makeTimeDataFrame() df["obj1"] = "foo" @@ -1415,7 +1414,7 @@ def test_to_hdf_with_min_itemsize(self, setup_path): ) @pytest.mark.parametrize( - "format", [pytest.param("fixed", marks=xfail_non_writeable), "table"] + "format", [pytest.param("fixed", marks=td.xfail_non_writeable), "table"] ) def test_to_hdf_errors(self, format, setup_path): @@ -1812,7 +1811,7 @@ def test_pass_spec_to_storer(self, setup_path): with pytest.raises(TypeError): store.select("df", where=[("columns=A")]) - @xfail_non_writeable + @td.xfail_non_writeable def test_append_misc(self, setup_path): with ensure_clean_store(setup_path) as store: @@ -2020,7 +2019,7 @@ def test_unimplemented_dtypes_table_columns(self, setup_path): with pytest.raises(TypeError): store.append("df_unimplemented", df) - @xfail_non_writeable + @td.xfail_non_writeable @pytest.mark.skipif( LooseVersion(np.__version__) == LooseVersion("1.15.0"), reason=( diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index d072b5de799d0..44a15d08510a5 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ 
-248,7 +248,9 @@ def test_scalar_na_logical_ops_corners(self): # this is an alignment issue; these are equivalent # https://github.com/pandas-dev/pandas/issues/5284 - msg = r"cannot compare a dtyped \[float64\] array with a scalar of type \[float\]" + msg = ( + r"cannot compare a dtyped \[float64\] array with a scalar of type \[float\]" + ) with pytest.raises(TypeError, match=msg): d.__and__(s, axis="columns") diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index f5d76d0f5cdca..c9fd426f68b48 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -79,7 +79,8 @@ def safe_import(mod_name, min_version=None): # and pytables 3.4.4 tables = safe_import("tables") xfail_non_writeable = pytest.mark.xfail( - LooseVersion(np.__version__) >= LooseVersion("1.16") + tables + and LooseVersion(np.__version__) >= LooseVersion("1.16") and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), reason=( "gh-25511, gh-24839. pytables needs a " From a38bc5e4928b83a9b00d47a0b5d697b31204f143 Mon Sep 17 00:00:00 2001 From: Tola Alade Date: Thu, 3 Oct 2019 11:48:26 +0100 Subject: [PATCH 25/28] changed import for decorator --- pandas/tests/io/pytables/test_store.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 8a7ea9f2174bc..97fd89903de3f 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -2254,7 +2254,7 @@ def test_float_index(self, setup_path): s = Series(np.random.randn(10), index=index) self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) - @xfail_non_writeable + @td.xfail_non_writeable def test_tuple_index(self, setup_path): # GH #492 @@ -2267,7 +2267,7 @@ def test_tuple_index(self, setup_path): simplefilter("ignore", pd.errors.PerformanceWarning) self._check_roundtrip(DF, tm.assert_frame_equal, path=setup_path) - @xfail_non_writeable + @td.xfail_non_writeable @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") def test_index_types(self, setup_path): @@ -2331,7 +2331,7 @@ def test_timeseries_preepoch(self, setup_path): except OverflowError: pytest.skip("known failer on some windows platforms") - @xfail_non_writeable + @td.xfail_non_writeable @pytest.mark.parametrize( "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] ) @@ -2365,7 +2365,7 @@ def test_frame(self, compression, setup_path): # empty self._check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) - @xfail_non_writeable + @td.xfail_non_writeable def test_empty_series_frame(self, setup_path): s0 = Series() s1 = Series(name="myseries") @@ -2379,7 +2379,7 @@ def test_empty_series_frame(self, setup_path): self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) - @xfail_non_writeable + @td.xfail_non_writeable @pytest.mark.parametrize( "dtype", [np.int64, np.float64, np.object, "m8[ns]", "M8[ns]"] ) @@ -2465,7 +2465,7 @@ def test_store_series_name(self, setup_path): recons = store["series"] tm.assert_series_equal(recons, series) - @xfail_non_writeable + @td.xfail_non_writeable @pytest.mark.parametrize( "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] ) @@ -4023,7 +4023,7 @@ def test_pytables_native2_read(self, datapath, setup_path): d1 = store["detector"] assert isinstance(d1, DataFrame) - @xfail_non_writeable + @td.xfail_non_writeable def 
test_legacy_table_fixed_format_read_py2(self, datapath, setup_path):
         # GH 24510
         # legacy table with fixed format written in Python 2
@@ -4182,7 +4182,7 @@ def test_unicode_longer_encoded(self, setup_path):
             result = store.get("df")
             tm.assert_frame_equal(result, df)
 
-    @xfail_non_writeable
+    @td.xfail_non_writeable
     def test_store_datetime_mixed(self, setup_path):
 
         df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})

From 6d97676d66cbf9ee1e61f300292606497015b4ab Mon Sep 17 00:00:00 2001
From: Tola Alade
Date: Thu, 3 Oct 2019 19:22:06 +0100
Subject: [PATCH 26/28] reverted accidental changes to test_operators.py

---
 pandas/tests/series/test_operators.py | 33 ++++++++++++---------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py
index 44a15d08510a5..c2cf91e582c47 100644
--- a/pandas/tests/series/test_operators.py
+++ b/pandas/tests/series/test_operators.py
@@ -1,6 +1,6 @@
 from datetime import datetime, timedelta
 import operator
-import re
+
 import numpy as np
 import pytest
 
@@ -94,25 +94,24 @@ def test_logical_operators_int_dtype_with_int_scalar(self):
     def test_logical_operators_int_dtype_with_float(self):
         # GH#9016: support bitwise op for integer types
         s_0123 = Series(range(4), dtype="int64")
-        msg = r"cannot compare a dtyped \[int64\] array with a scalar of type \[float\]"
-        with pytest.raises(TypeError, match=msg):
+
+        with pytest.raises(TypeError):
             s_0123 & np.NaN
-        msg = r"cannot compare a dtyped \[int64\] array with a scalar of type \[bool\]"
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(TypeError):
             s_0123 & 3.14
-        with pytest.raises(TypeError, match="unsupported.* 'int' and 'float'"):
+        with pytest.raises(TypeError):
             s_0123 & [0.1, 4, 3.14, 2]
-        with pytest.raises(TypeError, match="unsupported.* 'int' and 'float'"):
+        with pytest.raises(TypeError):
             s_0123 & np.array([0.1, 4, 3.14, 2])
-        with pytest.raises(TypeError, match="unsupported.* 'int' and 'float'"):
+        with pytest.raises(TypeError):
             s_0123 & Series([0.1, 4, -3.14, 2])
 
     def test_logical_operators_int_dtype_with_str(self):
         s_1111 = Series([1] * 4, dtype="int8")
-        msg = r"cannot compare a dtyped \[int8\] array with a scalar of type \[bool\]"
-        with pytest.raises(TypeError, match=msg):
+
+        with pytest.raises(TypeError):
             s_1111 & "a"
-        with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"):
+        with pytest.raises(TypeError):
             s_1111 & ["a", "b", "c", "d"]
 
     def test_logical_operators_int_dtype_with_bool(self):
@@ -229,8 +228,8 @@ def test_logical_operators_int_dtype_with_bool_dtype_and_reindex(self):
 
     def test_scalar_na_logical_ops_corners(self):
         s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10])
-        msg = r"cannot compare a dtyped \[int64\] array with a scalar of type \[bool\]"
-        with pytest.raises(TypeError, match=msg):
+
+        with pytest.raises(TypeError):
             s & datetime(2005, 1, 1)
 
         s = Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)])
@@ -248,13 +247,11 @@ def test_scalar_na_logical_ops_corners(self):
 
         # this is an alignment issue; these are equivalent
         # https://github.com/pandas-dev/pandas/issues/5284
-        msg = (
-            r"cannot compare a dtyped \[float64\] array with a scalar of type \[float\]"
-        )
-        with pytest.raises(TypeError, match=msg):
+
+        with pytest.raises(TypeError):
             d.__and__(s, axis="columns")
 
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(TypeError):
             s & d
 
         # this is wrong as its not a boolean result
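Patch 23 above tightened these assertions with pytest.raises(match=...) patterns, and patch 26 reverts that tightening to keep the series scoped to the pytables reorganisation. For reviewers unfamiliar with the idiom being backed out, a minimal self-contained sketch follows; the exception message and test body are invented for illustration and are not taken from pandas:

# Sketch of the pytest.raises(match=...) idiom reverted in PATCH 26.
# `match` is applied to str(exception) with re.search, so regex
# metacharacters in the expected message (the bracketed dtype names
# here) must be escaped in the pattern.
import re

import pytest


def test_match_idiom():
    msg = r"cannot compare a dtyped \[int64\] array with a scalar of type \[bool\]"
    with pytest.raises(TypeError, match=msg):
        raise TypeError(
            "cannot compare a dtyped [int64] array with a scalar of type [bool]"
        )

    # re.escape builds the same escaped pattern programmatically
    assert re.escape("[int64]") == r"\[int64\]"

Because match uses re.search rather than string equality, escaping is the main trap: an unescaped "[int64]" would be read as a character class, which is why the reverted lines used raw strings with escaped brackets.
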
From 49e9d3171d5025df34964567b9b0bf258965c60e Mon Sep 17 00:00:00 2001
From: Tola Alade
Date: Fri, 4 Oct 2019 11:06:43 +0100
Subject: [PATCH 27/28] moved tables to common.py

---
 pandas/tests/io/pytables/common.py         | 8 ++++++++
 pandas/tests/io/pytables/test_complex.py   | 7 -------
 pandas/tests/io/pytables/test_store.py     | 7 -------
 pandas/tests/io/pytables/test_timezones.py | 6 ------
 4 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/pandas/tests/io/pytables/common.py b/pandas/tests/io/pytables/common.py
index d4f3e01a28a79..d06f467760518 100644
--- a/pandas/tests/io/pytables/common.py
+++ b/pandas/tests/io/pytables/common.py
@@ -2,8 +2,16 @@
 import os
 import tempfile
 
+import pytest
+
 from pandas.io.pytables import HDFStore
 
+tables = pytest.importorskip("tables")
+# set these parameters so we don't have file sharing
+tables.parameters.MAX_NUMEXPR_THREADS = 1
+tables.parameters.MAX_BLOSC_THREADS = 1
+tables.parameters.MAX_THREADS = 1
+
 
 def safe_remove(path):
     if path is not None:
diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py
index 96c4a30424d74..e48cfb724ef1d 100644
--- a/pandas/tests/io/pytables/test_complex.py
+++ b/pandas/tests/io/pytables/test_complex.py
@@ -13,13 +13,6 @@
 
 from pandas.io.pytables import read_hdf
 
-tables = pytest.importorskip("tables")
-# set these parameters so we don't have file sharing
-tables.parameters.MAX_NUMEXPR_THREADS = 1
-tables.parameters.MAX_BLOSC_THREADS = 1
-tables.parameters.MAX_THREADS = 1
-
-
 # GH10447
 
 
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index 97fd89903de3f..a570cd58d8b2e 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -55,13 +55,6 @@
 
 from pandas.io.pytables import TableIterator  # noqa: E402 isort:skip
 
-tables = pytest.importorskip("tables")
-# set these parameters so we don't have file sharing
-tables.parameters.MAX_NUMEXPR_THREADS = 1
-tables.parameters.MAX_BLOSC_THREADS = 1
-tables.parameters.MAX_THREADS = 1
-
-
 _default_compressor = "blosc"
 ignore_natural_naming_warning = pytest.mark.filterwarnings(
     "ignore:object name:tables.exceptions.NaturalNameWarning"
 )
diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py
index 3e91c638190c9..ba1df24224831 100644
--- a/pandas/tests/io/pytables/test_timezones.py
+++ b/pandas/tests/io/pytables/test_timezones.py
@@ -15,12 +15,6 @@
 import pandas.util.testing as tm
 from pandas.util.testing import assert_frame_equal, set_timezone
 
-tables = pytest.importorskip("tables")
-# set these parameters so we don't have file sharing
-tables.parameters.MAX_NUMEXPR_THREADS = 1
-tables.parameters.MAX_BLOSC_THREADS = 1
-tables.parameters.MAX_THREADS = 1
-
 
 def _compare_with_tz(a, b):
     tm.assert_frame_equal(a, b)

From e5cecce85c84e93a67ff174e14aa6f02c984b22b Mon Sep 17 00:00:00 2001
From: Tola Alade
Date: Fri, 4 Oct 2019 11:08:43 +0100
Subject: [PATCH 28/28] fixed import

---
 pandas/tests/io/pytables/test_store.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index a570cd58d8b2e..140ee5082f55d 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -39,6 +39,7 @@
     ensure_clean_store,
     safe_close,
     safe_remove,
+    tables,
 )
 import pandas.util.testing as tm
 from pandas.util.testing import assert_frame_equal, assert_series_equal
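Taken together, patches 20 through 24 and 27 settle on one pattern for optional test dependencies: guard the import, build a conditional xfail mark whose condition collapses to False when the dependency is missing, and let a shared module own the importorskip so every test file in the directory inherits it. A condensed sketch of that pattern is below; `optional_import` is a hypothetical stand-in for pandas' safe_import helper, and the roundtrip test is invented for illustration. Only the version pins and the `tables and ...` guard mirror the series itself.

# Condensed sketch of the optional-dependency pattern the series lands on.
from distutils.version import LooseVersion
import importlib

import numpy as np
import pytest


def optional_import(mod_name):
    """Return the imported module, or False when it is unavailable
    (hypothetical stand-in for pandas.util._test_decorators.safe_import)."""
    try:
        return importlib.import_module(mod_name)
    except ImportError:
        return False


tables = optional_import("tables")

# `tables and ...` short-circuits to False when pytables is absent, so the
# mark can be built at import time without touching a missing module --
# the guard added in PATCH 24.
xfail_non_writeable = pytest.mark.xfail(
    bool(
        tables
        and LooseVersion(np.__version__) >= LooseVersion("1.16")
        and LooseVersion(tables.__version__) < LooseVersion("3.5.1")
    ),
    reason="gh-25511, gh-24839: pytables needs a release beyond 3.4.4 "
    "to support numpy 1.16.x",
)


@xfail_non_writeable
def test_roundtrip_sketch(tmp_path):
    # A module that requires pytables outright would instead do
    # `tables = pytest.importorskip("tables")` at import time, which is
    # what moving the importorskip into common.py (PATCH 27) achieves
    # for every test module in the directory.
    pytest.importorskip("tables")
    import pandas as pd

    path = str(tmp_path / "roundtrip.h5")
    df = pd.DataFrame({"a": [1.0, 2.0]})
    df.to_hdf(path, "df")
    pd.testing.assert_frame_equal(pd.read_hdf(path, "df"), df)

The short-circuit matters because pytest imports test modules during collection even when it will later skip them; an unguarded reference to tables.__version__ would turn a missing optional dependency into a collection error rather than a skip.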