diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py new file mode 100644 index 0000000000000..3eebeee9788c6 --- /dev/null +++ b/pandas/tests/io/pytables/test_append.py @@ -0,0 +1,927 @@ +import datetime +from datetime import timedelta +import re +from warnings import catch_warnings + +import numpy as np +import pytest + +from pandas._libs.tslibs import Timestamp + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, + Series, + _testing as tm, + concat, + date_range, + read_hdf, +) +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_path, + ensure_clean_store, +) + +pytestmark = pytest.mark.single + + +@pytest.mark.filterwarnings("ignore:object name:tables.exceptions.NaturalNameWarning") +def test_append(setup_path): + + with ensure_clean_store(setup_path) as store: + + # this is allowed but you almost always don't want to do it + # (see tables.NaturalNameWarning) + with catch_warnings(record=True): + + df = tm.makeTimeDataFrame() + _maybe_remove(store, "df1") + store.append("df1", df[:10]) + store.append("df1", df[10:]) + tm.assert_frame_equal(store["df1"], df) + + _maybe_remove(store, "df2") + store.put("df2", df[:10], format="table") + store.append("df2", df[10:]) + tm.assert_frame_equal(store["df2"], df) + + _maybe_remove(store, "df3") + store.append("/df3", df[:10]) + store.append("/df3", df[10:]) + tm.assert_frame_equal(store["df3"], df) + + # this is allowed but you almost always don't want to do it + # (see tables.NaturalNameWarning) + _maybe_remove(store, "/df3 foo") + store.append("/df3 foo", df[:10]) + store.append("/df3 foo", df[10:]) + tm.assert_frame_equal(store["df3 foo"], df) + + # dtype issues - mixed type in a single object column + df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) + df["mixed_column"] = "testing" + df.loc[2, "mixed_column"] = np.nan + _maybe_remove(store, "df") + store.append("df", df) + tm.assert_frame_equal(store["df"], df) + + # uints - test storage of 
uints + uint_data = DataFrame( + { + "u08": Series( + np.random.randint(0, high=255, size=5), dtype=np.uint8 + ), + "u16": Series( + np.random.randint(0, high=65535, size=5), dtype=np.uint16 + ), + "u32": Series( + np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32 + ), + "u64": Series( + [2 ** 58, 2 ** 59, 2 ** 60, 2 ** 61, 2 ** 62], + dtype=np.uint64, + ), + }, + index=np.arange(5), + ) + _maybe_remove(store, "uints") + store.append("uints", uint_data) + tm.assert_frame_equal(store["uints"], uint_data) + + # uints - test storage of uints in indexable columns + _maybe_remove(store, "uints") + # 64-bit indices not yet supported + store.append("uints", uint_data, data_columns=["u08", "u16", "u32"]) + tm.assert_frame_equal(store["uints"], uint_data) + + +def test_append_series(setup_path): + + with ensure_clean_store(setup_path) as store: + + # basic + ss = tm.makeStringSeries() + ts = tm.makeTimeSeries() + ns = Series(np.arange(100)) + + store.append("ss", ss) + result = store["ss"] + tm.assert_series_equal(result, ss) + assert result.name is None + + store.append("ts", ts) + result = store["ts"] + tm.assert_series_equal(result, ts) + assert result.name is None + + ns.name = "foo" + store.append("ns", ns) + result = store["ns"] + tm.assert_series_equal(result, ns) + assert result.name == ns.name + + # select on the values + expected = ns[ns > 60] + result = store.select("ns", "foo>60") + tm.assert_series_equal(result, expected) + + # select on the index and values + expected = ns[(ns > 70) & (ns.index < 90)] + result = store.select("ns", "foo>70 and index<90") + tm.assert_series_equal(result, expected) + + # multi-index + mi = DataFrame(np.random.randn(5, 1), columns=["A"]) + mi["B"] = np.arange(len(mi)) + mi["C"] = "foo" + mi.loc[3:5, "C"] = "bar" + mi.set_index(["C", "B"], inplace=True) + s = mi.stack() + s.index = s.index.droplevel(2) + store.append("mi", s) + tm.assert_series_equal(store["mi"], s) + + +def test_append_some_nans(setup_path): + + with 
ensure_clean_store(setup_path) as store: + df = DataFrame( + { + "A": Series(np.random.randn(20)).astype("int32"), + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) + # some nans + _maybe_remove(store, "df1") + df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan + store.append("df1", df[:10]) + store.append("df1", df[10:]) + tm.assert_frame_equal(store["df1"], df) + + # first column + df1 = df.copy() + df1.loc[:, "A1"] = np.nan + _maybe_remove(store, "df1") + store.append("df1", df1[:10]) + store.append("df1", df1[10:]) + tm.assert_frame_equal(store["df1"], df1) + + # 2nd column + df2 = df.copy() + df2.loc[:, "A2"] = np.nan + _maybe_remove(store, "df2") + store.append("df2", df2[:10]) + store.append("df2", df2[10:]) + tm.assert_frame_equal(store["df2"], df2) + + # datetimes + df3 = df.copy() + df3.loc[:, "E"] = np.nan + _maybe_remove(store, "df3") + store.append("df3", df3[:10]) + store.append("df3", df3[10:]) + tm.assert_frame_equal(store["df3"], df3) + + +def test_append_all_nans(setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + {"A1": np.random.randn(20), "A2": np.random.randn(20)}, + index=np.arange(20), + ) + df.loc[0:15, :] = np.nan + + # nan some entire rows (dropna=True) + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df[-4:]) + + # nan some entire rows (dropna=False) + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) + + # tests the option io.hdf.dropna_table + pd.set_option("io.hdf.dropna_table", False) + _maybe_remove(store, "df3") + store.append("df3", df[:10]) + store.append("df3", df[10:]) + tm.assert_frame_equal(store["df3"], df) + + pd.set_option("io.hdf.dropna_table", 
True) + _maybe_remove(store, "df4") + store.append("df4", df[:10]) + store.append("df4", df[10:]) + tm.assert_frame_equal(store["df4"], df[-4:]) + + # nan some entire rows (string are still written!) + df = DataFrame( + { + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + }, + index=np.arange(20), + ) + + df.loc[0:15, :] = np.nan + + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df) + + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) + + # nan some entire rows (but since we have dates they are still + # written!) + df = DataFrame( + { + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) + + df.loc[0:15, :] = np.nan + + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df) + + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) + + +def test_append_frame_column_oriented(setup_path): + with ensure_clean_store(setup_path) as store: + + # column oriented + df = tm.makeTimeDataFrame() + df.index = df.index._with_freq(None) # freq doesnt round-trip + + _maybe_remove(store, "df1") + store.append("df1", df.iloc[:, :2], axes=["columns"]) + store.append("df1", df.iloc[:, 2:]) + tm.assert_frame_equal(store["df1"], df) + + result = store.select("df1", "columns=A") + expected = df.reindex(columns=["A"]) + tm.assert_frame_equal(expected, result) + + # selection on the non-indexable + result = store.select("df1", ("columns=A", "index=df.index[0:4]")) + expected = df.reindex(columns=["A"], 
index=df.index[0:4]) + tm.assert_frame_equal(expected, result) + + # this isn't supported + msg = re.escape( + "passing a filterable condition to a non-table indexer " + "[Filter: Not Initialized]" + ) + with pytest.raises(TypeError, match=msg): + store.select("df1", "columns=A and index>df.index[4]") + + +def test_append_with_different_block_ordering(setup_path): + + # GH 4096; using same frames, but different block orderings + with ensure_clean_store(setup_path) as store: + + for i in range(10): + + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) + df["index"] = range(10) + df["index"] += i * 10 + df["int64"] = Series([1] * len(df), dtype="int64") + df["int16"] = Series([1] * len(df), dtype="int16") + + if i % 2 == 0: + del df["int64"] + df["int64"] = Series([1] * len(df), dtype="int64") + if i % 3 == 0: + a = df.pop("A") + df["A"] = a + + df.set_index("index", inplace=True) + + store.append("df", df) + + # test a different ordering but with more fields (like invalid + # combinate) + with ensure_clean_store(setup_path) as store: + + df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64") + df["int64"] = Series([1] * len(df), dtype="int64") + df["int16"] = Series([1] * len(df), dtype="int16") + store.append("df", df) + + # store additional fields in different blocks + df["int16_2"] = Series([1] * len(df), dtype="int16") + msg = re.escape( + "cannot match existing table structure for [int16] on appending data" + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + # store multiple additional fields in different blocks + df["float_3"] = Series([1.0] * len(df), dtype="float64") + msg = re.escape( + "cannot match existing table structure for [A,B] on appending data" + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + +def test_append_with_strings(setup_path): + + with ensure_clean_store(setup_path) as store: + with catch_warnings(record=True): + + def check_col(key, name, size): + assert ( 
+ getattr(store.get_storer(key).table.description, name).itemsize + == size + ) + + # avoid truncation on elements + df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) + store.append("df_big", df) + tm.assert_frame_equal(store.select("df_big"), df) + check_col("df_big", "values_block_1", 15) + + # appending smaller string ok + df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]]) + store.append("df_big", df2) + expected = concat([df, df2]) + tm.assert_frame_equal(store.select("df_big"), expected) + check_col("df_big", "values_block_1", 15) + + # avoid truncation on elements + df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) + store.append("df_big2", df, min_itemsize={"values": 50}) + tm.assert_frame_equal(store.select("df_big2"), df) + check_col("df_big2", "values_block_1", 50) + + # bigger string on next append + store.append("df_new", df) + df_new = DataFrame( + [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]] + ) + msg = ( + r"Trying to store a string with len \[26\] in " + r"\[values_block_1\] column but\n" + r"this column has a limit of \[15\]!\n" + "Consider using min_itemsize to preset the sizes on these " + "columns" + ) + with pytest.raises(ValueError, match=msg): + store.append("df_new", df_new) + + # min_itemsize on Series index (GH 11412) + df = tm.makeMixedDataFrame().set_index("C") + store.append("ss", df["B"], min_itemsize={"index": 4}) + tm.assert_series_equal(store.select("ss"), df["B"]) + + # same as above, with data_columns=True + store.append("ss2", df["B"], data_columns=True, min_itemsize={"index": 4}) + tm.assert_series_equal(store.select("ss2"), df["B"]) + + # min_itemsize in index without appending (GH 10381) + store.put("ss3", df, format="table", min_itemsize={"index": 6}) + # just make sure there is a longer string: + df2 = df.copy().reset_index().assign(C="longer").set_index("C") + store.append("ss3", df2) + tm.assert_frame_equal(store.select("ss3"), pd.concat([df, df2])) + + # same as above, with a 
Series + store.put("ss4", df["B"], format="table", min_itemsize={"index": 6}) + store.append("ss4", df2["B"]) + tm.assert_series_equal(store.select("ss4"), pd.concat([df["B"], df2["B"]])) + + # with nans + _maybe_remove(store, "df") + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df.loc[df.index[1:4], "string"] = np.nan + df["string2"] = "bar" + df.loc[df.index[4:8], "string2"] = np.nan + df["string3"] = "bah" + df.loc[df.index[1:], "string3"] = np.nan + store.append("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + with ensure_clean_store(setup_path) as store: + + def check_col(key, name, size): + # Compare explicitly: "assert x, size" would only use size as the + # failure message, so the check could never fail on a wrong itemsize. + assert getattr(store.get_storer(key).table.description, name).itemsize == size + + df = DataFrame({"A": "foo", "B": "bar"}, index=range(10)) + + # a min_itemsize that creates a data_column + _maybe_remove(store, "df") + store.append("df", df, min_itemsize={"A": 200}) + check_col("df", "A", 200) + assert store.get_storer("df").data_columns == ["A"] + + # a min_itemsize that creates a data_column2 + _maybe_remove(store, "df") + store.append("df", df, data_columns=["B"], min_itemsize={"A": 200}) + check_col("df", "A", 200) + assert store.get_storer("df").data_columns == ["B", "A"] + + # a min_itemsize that creates a data_column2 + _maybe_remove(store, "df") + store.append("df", df, data_columns=["B"], min_itemsize={"values": 200}) + check_col("df", "B", 200) + check_col("df", "values_block_0", 200) + assert store.get_storer("df").data_columns == ["B"] + + # infer the .typ on subsequent appends + _maybe_remove(store, "df") + store.append("df", df[:5], min_itemsize=200) + store.append("df", df[5:], min_itemsize=200) + tm.assert_frame_equal(store["df"], df) + + # invalid min_itemsize keys + df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"]) + _maybe_remove(store, "df") + msg = re.escape( + "min_itemsize has the key [foo] which is not an axis or data_column" + ) + with pytest.raises(ValueError, match=msg): + 
store.append("df", df, min_itemsize={"foo": 20, "foobar": 20}) + + +def test_append_with_empty_string(setup_path): + + with ensure_clean_store(setup_path) as store: + + # with all empty strings (GH 12242) + df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]}) + store.append("df", df[:-1], min_itemsize={"x": 1}) + store.append("df", df[-1:], min_itemsize={"x": 1}) + tm.assert_frame_equal(store.select("df"), df) + + +def test_append_with_data_columns(setup_path): + + with ensure_clean_store(setup_path) as store: + df = tm.makeTimeDataFrame() + df.iloc[0, df.columns.get_loc("B")] = 1.0 + _maybe_remove(store, "df") + store.append("df", df[:2], data_columns=["B"]) + store.append("df", df[2:]) + tm.assert_frame_equal(store["df"], df) + + # check that we have indices created + assert store._handle.root.df.table.cols.index.is_indexed is True + assert store._handle.root.df.table.cols.B.is_indexed is True + + # data column searching + result = store.select("df", "B>0") + expected = df[df.B > 0] + tm.assert_frame_equal(result, expected) + + # data column searching (with an indexable and a data_columns) + result = store.select("df", "B>0 and index>df.index[3]") + df_new = df.reindex(index=df.index[4:]) + expected = df_new[df_new.B > 0] + tm.assert_frame_equal(result, expected) + + # data column selection with a string data_column + df_new = df.copy() + df_new["string"] = "foo" + df_new.loc[df_new.index[1:4], "string"] = np.nan + df_new.loc[df_new.index[5:6], "string"] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"]) + result = store.select("df", "string='foo'") + expected = df_new[df_new.string == "foo"] + tm.assert_frame_equal(result, expected) + + # using min_itemsize and a data column + def check_col(key, name, size): + assert ( + getattr(store.get_storer(key).table.description, name).itemsize == size + ) + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + store.append("df", df_new, 
data_columns=["string"], min_itemsize={"string": 30}) + check_col("df", "string", 30) + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"], min_itemsize=30) + check_col("df", "string", 30) + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"], min_itemsize={"values": 30}) + check_col("df", "string", 30) + + with ensure_clean_store(setup_path) as store: + df_new["string2"] = "foobarbah" + df_new["string_block1"] = "foobarbah1" + df_new["string_block2"] = "foobarbah2" + _maybe_remove(store, "df") + store.append( + "df", + df_new, + data_columns=["string", "string2"], + min_itemsize={"string": 30, "string2": 40, "values": 50}, + ) + check_col("df", "string", 30) + check_col("df", "string2", 40) + check_col("df", "values_block_1", 50) + + with ensure_clean_store(setup_path) as store: + # multiple data columns + df_new = df.copy() + df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 + df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 + df_new["string"] = "foo" + + sl = df_new.columns.get_loc("string") + df_new.iloc[1:4, sl] = np.nan + df_new.iloc[5:6, sl] = "bar" + + df_new["string2"] = "foo" + sl = df_new.columns.get_loc("string2") + df_new.iloc[2:5, sl] = np.nan + df_new.iloc[7:8, sl] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) + result = store.select("df", "string='foo' and string2='foo' and A>0 and B<0") + expected = df_new[ + (df_new.string == "foo") + & (df_new.string2 == "foo") + & (df_new.A > 0) + & (df_new.B < 0) + ] + tm.assert_frame_equal(result, expected, check_freq=False) + # FIXME: 2020-05-07 freq check randomly fails in the CI + + # yield an empty frame + result = store.select("df", "string='foo' and string2='cool'") + expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] + tm.assert_frame_equal(result, expected) + + with ensure_clean_store(setup_path) as store: + # doc example + df_dc = df.copy() + 
df_dc["string"] = "foo" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" + df_dc["string2"] = "cool" + df_dc["datetime"] = Timestamp("20010102") + df_dc = df_dc._convert(datetime=True) + df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan + + _maybe_remove(store, "df_dc") + store.append( + "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] + ) + result = store.select("df_dc", "B>0") + + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected) + + result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] + tm.assert_frame_equal(result, expected, check_freq=False) + # FIXME: 2020-12-07 intermittent build failures here with freq of + # None instead of BDay(4) + + with ensure_clean_store(setup_path) as store: + # doc example part 2 + np.random.seed(1234) + index = date_range("1/1/2000", periods=8) + df_dc = DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"]) + df_dc["string"] = "foo" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" + df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() + df_dc["string2"] = "cool" + + # on-disk operations + store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) + + result = store.select("df_dc", "B>0") + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected) + + result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] + tm.assert_frame_equal(result, expected) + + +def test_append_hierarchical(setup_path): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", 
"C"]) + + with ensure_clean_store(setup_path) as store: + store.append("mi", df) + result = store.select("mi") + tm.assert_frame_equal(result, df) + + # GH 3748 + result = store.select("mi", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + with ensure_clean_path("test.hdf") as path: + df.to_hdf(path, "df", format="table") + result = read_hdf(path, "df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +def test_append_misc(setup_path): + + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + store.append("df", df, chunksize=1) + result = store.select("df") + tm.assert_frame_equal(result, df) + + store.append("df1", df, expectedrows=10) + result = store.select("df1") + tm.assert_frame_equal(result, df) + + # more chunksize in append tests + def check(obj, comparator): + for c in [10, 200, 1000]: + with ensure_clean_store(setup_path, mode="w") as store: + store.append("obj", obj, chunksize=c) + result = store.select("obj") + comparator(result, obj) + + df = tm.makeDataFrame() + df["string"] = "foo" + df["float322"] = 1.0 + df["float322"] = df["float322"].astype("float32") + df["bool"] = df["float322"] > 0 + df["time1"] = Timestamp("20130101") + df["time2"] = Timestamp("20130102") + check(df, tm.assert_frame_equal) + + # empty frame, GH4273 + with ensure_clean_store(setup_path) as store: + + # 0 len + df_empty = DataFrame(columns=list("ABC")) + store.append("df", df_empty) + with pytest.raises(KeyError, match="'No object named df in the file'"): + store.select("df") + + # repeated append of 0/non-zero frames + df = DataFrame(np.random.rand(10, 3), columns=list("ABC")) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + store.append("df", df_empty) + tm.assert_frame_equal(store.select("df"), df) + + # store + df = DataFrame(columns=list("ABC")) + store.put("df2", df) + 
tm.assert_frame_equal(store.select("df2"), df) + + +def test_append_raise(setup_path): + + with ensure_clean_store(setup_path) as store: + + # test append with invalid input to get good error messages + + # list in column + df = tm.makeDataFrame() + df["invalid"] = [["a"]] * len(df) + assert df.dtypes["invalid"] == np.object_ + msg = re.escape( + """Cannot serialize the column [invalid] +because its data contents are not [string] but [mixed] object dtype""" + ) + with pytest.raises(TypeError, match=msg): + store.append("df", df) + + # multiple invalid columns + df["invalid2"] = [["a"]] * len(df) + df["invalid3"] = [["a"]] * len(df) + with pytest.raises(TypeError, match=msg): + store.append("df", df) + + # datetime with embedded nans as object + df = tm.makeDataFrame() + s = Series(datetime.datetime(2001, 1, 2), index=df.index) + s = s.astype(object) + s[0:5] = np.nan + df["invalid"] = s + assert df.dtypes["invalid"] == np.object_ + msg = "too many timezones in this block, create separate data columns" + with pytest.raises(TypeError, match=msg): + store.append("df", df) + + # directly ndarray + msg = "value must be None, Series, or DataFrame" + with pytest.raises(TypeError, match=msg): + store.append("df", np.arange(10)) + + # series directly + msg = re.escape( + "cannot properly create the storer for: " + "[group->df,value->]" + ) + with pytest.raises(TypeError, match=msg): + store.append("df", Series(np.arange(10))) + + # appending an incompatible table + df = tm.makeDataFrame() + store.append("df", df) + + df["foo"] = "foo" + msg = re.escape( + "invalid combination of [non_index_axes] on appending data " + "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table " + "[(1, ['A', 'B', 'C', 'D'])]" + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + +def test_append_with_timedelta(setup_path): + # GH 3577 + # append timedelta + + df = DataFrame( + { + "A": Timestamp("20130101"), + "B": [ + Timestamp("20130101") + timedelta(days=i, seconds=10) 
for i in range(10) + ], + } + ) + df["C"] = df["A"] - df["B"] + df.loc[3:5, "C"] = np.nan + + with ensure_clean_store(setup_path) as store: + + # table + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + result = store.select("df") + tm.assert_frame_equal(result, df) + + result = store.select("df", where="C<100000") + tm.assert_frame_equal(result, df) + + result = store.select("df", where="C0", "B>0"], selector="df1" + ) + expected = df[(df.A > 0) & (df.B > 0)] + tm.assert_frame_equal(result, expected) + + +def test_append_to_multiple_dropna(setup_path): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan + df = concat([df1, df2], axis=1) + + with ensure_clean_store(setup_path) as store: + + # dropna=True should guarantee rows are synchronized + store.append_to_multiple( + {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True + ) + result = store.select_as_multiple(["df1", "df2"]) + expected = df.dropna() + tm.assert_frame_equal(result, expected) + tm.assert_index_equal(store.select("df1").index, store.select("df2").index) + + +@pytest.mark.xfail( + run=False, reason="append_to_multiple_dropna_false is not raising as failed" +) +def test_append_to_multiple_dropna_false(setup_path): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan + df = concat([df1, df2], axis=1) + + with ensure_clean_store(setup_path) as store: + + # dropna=False shouldn't synchronize row indexes + store.append_to_multiple( + {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False + ) + + # TODO Update error message to desired message for this case + msg = "Cannot select as multiple after appending with dropna=False" + with pytest.raises(ValueError, match=msg): + store.select_as_multiple(["df1a", "df2a"]) + + assert not 
store.select("df1a").index.equals(store.select("df2a").index) + + +def test_append_to_multiple_min_itemsize(setup_path): + # GH 11238 + df = DataFrame( + { + "IX": np.arange(1, 21), + "Num": np.arange(1, 21), + "BigNum": np.arange(1, 21) * 88, + "Str": ["a" for _ in range(20)], + "LongStr": ["abcde" for _ in range(20)], + } + ) + expected = df.iloc[[0]] + + with ensure_clean_store(setup_path) as store: + store.append_to_multiple( + { + "index": ["IX"], + "nums": ["Num", "BigNum"], + "strs": ["Str", "LongStr"], + }, + df.iloc[[0]], + "index", + min_itemsize={"Str": 10, "LongStr": 100, "Num": 2}, + ) + result = store.select_as_multiple(["index", "nums", "strs"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py new file mode 100644 index 0000000000000..67209c2bc0d57 --- /dev/null +++ b/pandas/tests/io/pytables/test_categorical.py @@ -0,0 +1,186 @@ +import numpy as np +import pytest + +from pandas import Categorical, DataFrame, Series, _testing as tm, concat, read_hdf +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_path, + ensure_clean_store, +) + +pytestmark = pytest.mark.single + + +def test_categorical(setup_path): + + with ensure_clean_store(setup_path) as store: + + # Basic + _maybe_remove(store, "s") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s", s, format="table") + result = store.select("s") + tm.assert_series_equal(s, result) + + _maybe_remove(store, "s_ordered") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) + ) + store.append("s_ordered", s, format="table") + result = store.select("s_ordered") + tm.assert_series_equal(s, result) + + _maybe_remove(store, "df") + df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) + store.append("df", df, format="table") + result = 
store.select("df") + tm.assert_frame_equal(result, df) + + # Dtypes + _maybe_remove(store, "si") + s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category") + store.append("si", s) + result = store.select("si") + tm.assert_series_equal(result, s) + + _maybe_remove(store, "si2") + s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category") + store.append("si2", s) + result = store.select("si2") + tm.assert_series_equal(result, s) + + # Multiple + _maybe_remove(store, "df2") + df2 = df.copy() + df2["s2"] = Series(list("abcdefg")).astype("category") + store.append("df2", df2) + result = store.select("df2") + tm.assert_frame_equal(result, df2) + + # Make sure the metadata is OK + info = store.info() + assert "/df2 " in info + # assert '/df2/meta/values_block_0/meta' in info + assert "/df2/meta/values_block_1/meta" in info + + # unordered + _maybe_remove(store, "s2") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s2", s, format="table") + result = store.select("s2") + tm.assert_series_equal(result, s) + + # Query + _maybe_remove(store, "df3") + store.append("df3", df, data_columns=["s"]) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s = ["b","c"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["d"])] + result = store.select("df3", where=['s in ["d"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["f"])] + result = store.select("df3", where=['s in ["f"]']) + tm.assert_frame_equal(result, expected) + + # Appending with same categories is ok + store.append("df3", df) + + df = concat([df, df]) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) + tm.assert_frame_equal(result, expected) + + # Appending must have the same 
categories + df3 = df.copy() + df3["s"] = df3["s"].cat.remove_unused_categories() + + msg = "cannot append a categorical with different categories to the existing" + with pytest.raises(ValueError, match=msg): + store.append("df3", df3) + + # Remove, and make sure meta data is removed (its a recursive + # removal so should be). + result = store.select("df3/meta/s/meta") + assert result is not None + store.remove("df3") + + with pytest.raises( + KeyError, match="'No object named df3/meta/s/meta in the file'" + ): + store.select("df3/meta/s/meta") + + +def test_categorical_conversion(setup_path): + + # GH13322 + # Check that read_hdf with categorical columns doesn't return rows if + # where criteria isn't met. + obsids = ["ESP_012345_6789", "ESP_987654_3210"] + imgids = ["APF00006np", "APF0001imm"] + data = [4.3, 9.8] + + # Test without categories + df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data}) + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") + tm.assert_frame_equal(result, expected) + + # Test with categories + df.obsids = df.obsids.astype("category") + df.imgids = df.imgids.astype("category") + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") + tm.assert_frame_equal(result, expected) + + +def test_categorical_nan_only_columns(setup_path): + # GH18413 + # Check that read_hdf with categorical columns with NaN-only values can + # be read back. 
+ df = DataFrame( + { + "a": ["a", "b", "c", np.nan], + "b": [np.nan, np.nan, np.nan, np.nan], + "c": [1, 2, 3, 4], + "d": Series([None] * 4, dtype=object), + } + ) + df["a"] = df.a.astype("category") + df["b"] = df.b.astype("category") + df["d"] = df.b.astype("category") + expected = df + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py new file mode 100644 index 0000000000000..24bd573341dc4 --- /dev/null +++ b/pandas/tests/io/pytables/test_errors.py @@ -0,0 +1,234 @@ +import datetime +from io import BytesIO +import re +from warnings import catch_warnings + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + CategoricalIndex, + DataFrame, + HDFStore, + MultiIndex, + _testing as tm, + date_range, + read_hdf, +) +from pandas.tests.io.pytables.common import ensure_clean_path, ensure_clean_store + +from pandas.io.pytables import Term, _maybe_adjust_name + +pytestmark = pytest.mark.single + + +def test_pass_spec_to_storer(setup_path): + + df = tm.makeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("df", df) + msg = ( + "cannot pass a column specification when reading a Fixed format " + "store. this store must be selected in its entirety" + ) + with pytest.raises(TypeError, match=msg): + store.select("df", columns=["A"]) + msg = ( + "cannot pass a where specification when reading from a Fixed " + "format store. 
this store must be selected in its entirety" + ) + with pytest.raises(TypeError, match=msg): + store.select("df", where=[("columns=A")]) + + +def test_table_index_incompatible_dtypes(setup_path): + df1 = DataFrame({"a": [1, 2, 3]}) + df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3)) + + with ensure_clean_store(setup_path) as store: + store.put("frame", df1, format="table") + msg = re.escape("incompatible kind in col [integer - datetime64]") + with pytest.raises(TypeError, match=msg): + store.put("frame", df2, format="table", append=True) + + +def test_unimplemented_dtypes_table_columns(setup_path): + + with ensure_clean_store(setup_path) as store: + + dtypes = [("date", datetime.date(2001, 1, 2))] + + # currently not supported dtypes #### + for n, f in dtypes: + df = tm.makeDataFrame() + df[n] = f + msg = re.escape(f"[{n}] is not implemented as a table column") + with pytest.raises(TypeError, match=msg): + store.append(f"df1_{n}", df) + + # frame + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["datetime1"] = datetime.date(2001, 1, 2) + df = df._consolidate()._convert(datetime=True) + + with ensure_clean_store(setup_path) as store: + # this fails because we have a date in the object block...... 
+ msg = re.escape( + """Cannot serialize the column [datetime1] +because its data contents are not [string] but [date] object dtype""" + ) + with pytest.raises(TypeError, match=msg): + store.append("df_unimplemented", df) + + +def test_invalid_terms(setup_path): + + with ensure_clean_store(setup_path) as store: + + with catch_warnings(record=True): + + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df.loc[df.index[0:4], "string"] = "bar" + + store.put("df", df, format="table") + + # some invalid terms + msg = re.escape( + "__init__() missing 1 required positional argument: 'where'" + ) + with pytest.raises(TypeError, match=msg): + Term() + + # more invalid + msg = re.escape( + "cannot process expression [df.index[3]], " + "[2000-01-06 00:00:00] is not a valid condition" + ) + with pytest.raises(ValueError, match=msg): + store.select("df", "df.index[3]") + + msg = "invalid syntax" + with pytest.raises(SyntaxError, match=msg): + store.select("df", "index>") + + # from the docs + with ensure_clean_path(setup_path) as path: + dfq = DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=date_range("20130101", periods=10), + ) + dfq.to_hdf(path, "dfq", format="table", data_columns=True) + + # check ok + read_hdf(path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']") + read_hdf(path, "dfq", where="A>0 or C>0") + + # catch the invalid reference + with ensure_clean_path(setup_path) as path: + dfq = DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=date_range("20130101", periods=10), + ) + dfq.to_hdf(path, "dfq", format="table") + + msg = ( + r"The passed where expression: A>0 or C>0\n\s*" + r"contains an invalid variable reference\n\s*" + r"all of the variable references must be a reference to\n\s*" + r"an axis \(e.g. 
'index' or 'columns'\), or a data_column\n\s*" + r"The currently defined references are: index,columns\n" + ) + with pytest.raises(ValueError, match=msg): + read_hdf(path, "dfq", where="A>0 or C>0") + + +def test_append_with_diff_col_name_types_raises_value_error(setup_path): + df = DataFrame(np.random.randn(10, 1)) + df2 = DataFrame({"a": np.random.randn(10)}) + df3 = DataFrame({(1, 2): np.random.randn(10)}) + df4 = DataFrame({("1", 2): np.random.randn(10)}) + df5 = DataFrame({("1", 2, object): np.random.randn(10)}) + + with ensure_clean_store(setup_path) as store: + name = f"df_{tm.rands(10)}" + store.append(name, df) + + for d in (df2, df3, df4, df5): + msg = re.escape( + "cannot match existing table structure for [0] on appending data" + ) + with pytest.raises(ValueError, match=msg): + store.append(name, d) + + +def test_invalid_complib(setup_path): + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + with tm.ensure_clean(setup_path) as path: + msg = r"complib only supports \[.*\] compression." + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df", complib="foolib") + + +@pytest.mark.parametrize( + "idx", + [ + date_range("2019", freq="D", periods=3, tz="UTC"), + CategoricalIndex(list("abc")), + ], +) +def test_to_hdf_multiindex_extension_dtype(idx, setup_path): + # GH 7775 + mi = MultiIndex.from_arrays([idx, idx]) + df = DataFrame(0, index=mi, columns=["a"]) + with ensure_clean_path(setup_path) as path: + with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): + df.to_hdf(path, "df") + + +def test_unsuppored_hdf_file_error(datapath): + # GH 9539 + data_path = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5") + message = ( + r"Dataset\(s\) incompatible with Pandas data types, " + "not table, or no datasets found in HDF5 file." 
+ ) + + with pytest.raises(ValueError, match=message): + pd.read_hdf(data_path) + + +def test_read_hdf_errors(setup_path): + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + + with ensure_clean_path(setup_path) as path: + msg = r"File [\S]* does not exist" + with pytest.raises(IOError, match=msg): + read_hdf(path, "key") + + df.to_hdf(path, "df") + store = HDFStore(path, mode="r") + store.close() + + msg = "The HDFStore must be open for reading." + with pytest.raises(IOError, match=msg): + read_hdf(store, "df") + + +def test_read_hdf_generic_buffer_errors(): + msg = "Support for generic buffers has not been implemented." + with pytest.raises(NotImplementedError, match=msg): + read_hdf(BytesIO(b""), "df") + + +@pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"]) +def test_maybe_adjust_name_bad_version_raises(bad_version): + msg = "Version is incorrect, expected sequence of 3 integers" + with pytest.raises(ValueError, match=msg): + _maybe_adjust_name("values_block_0", version=bad_version) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py new file mode 100644 index 0000000000000..e0e995e03064f --- /dev/null +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -0,0 +1,445 @@ +import os + +import numpy as np +import pytest + +from pandas.compat import is_platform_little_endian + +import pandas as pd +from pandas import DataFrame, HDFStore, Series, _testing as tm, read_hdf +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_path, + ensure_clean_store, + tables, +) + +from pandas.io import pytables as pytables +from pandas.io.pytables import ClosedFileError, PossibleDataLossError, Term + +pytestmark = pytest.mark.single + + +def test_mode(setup_path): + + df = tm.makeTimeDataFrame() + + def check(mode): + + msg = r"[\S]* does not exist" + with ensure_clean_path(setup_path) as path: + + # constructor + if mode in ["r", "r+"]: + with 
pytest.raises(IOError, match=msg): + HDFStore(path, mode=mode) + + else: + store = HDFStore(path, mode=mode) + assert store._handle.mode == mode + store.close() + + with ensure_clean_path(setup_path) as path: + + # context + if mode in ["r", "r+"]: + with pytest.raises(IOError, match=msg): + with HDFStore(path, mode=mode) as store: + pass + else: + with HDFStore(path, mode=mode) as store: + assert store._handle.mode == mode + + with ensure_clean_path(setup_path) as path: + + # conv write + if mode in ["r", "r+"]: + with pytest.raises(IOError, match=msg): + df.to_hdf(path, "df", mode=mode) + df.to_hdf(path, "df", mode="w") + else: + df.to_hdf(path, "df", mode=mode) + + # conv read + if mode in ["w"]: + msg = ( + "mode w is not allowed while performing a read. " + r"Allowed modes are r, r\+ and a." + ) + with pytest.raises(ValueError, match=msg): + read_hdf(path, "df", mode=mode) + else: + result = read_hdf(path, "df", mode=mode) + tm.assert_frame_equal(result, df) + + def check_default_mode(): + + # read_hdf uses default mode + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="w") + result = read_hdf(path, "df") + tm.assert_frame_equal(result, df) + + check("r") + check("r+") + check("a") + check("w") + check_default_mode() + + +def test_reopen_handle(setup_path): + + with ensure_clean_path(setup_path) as path: + + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() + + msg = ( + r"Re-opening the file \[[\S]*\] with mode \[a\] will delete the " + "current file!" 
+ ) + # invalid mode change + with pytest.raises(PossibleDataLossError, match=msg): + store.open("w") + + store.close() + assert not store.is_open + + # truncation ok here + store.open("w") + assert store.is_open + assert len(store) == 0 + store.close() + assert not store.is_open + + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() + + # reopen as read + store.open("r") + assert store.is_open + assert len(store) == 1 + assert store._mode == "r" + store.close() + assert not store.is_open + + # reopen as append + store.open("a") + assert store.is_open + assert len(store) == 1 + assert store._mode == "a" + store.close() + assert not store.is_open + + # reopen as append (again) + store.open("a") + assert store.is_open + assert len(store) == 1 + assert store._mode == "a" + store.close() + assert not store.is_open + + +def test_open_args(setup_path): + + with tm.ensure_clean(setup_path) as path: + + df = tm.makeDataFrame() + + # create an in memory store + store = HDFStore( + path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 + ) + store["df"] = df + store.append("df2", df) + + tm.assert_frame_equal(store["df"], df) + tm.assert_frame_equal(store["df2"], df) + + store.close() + + # the file should not have actually been written + assert not os.path.exists(path) + + +def test_flush(setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store.flush() + store.flush(fsync=True) + + +def test_complibs_default_settings(setup_path): + # GH15943 + df = tm.makeDataFrame() + + # Set complevel and check if complib is automatically set to + # default value + with ensure_clean_path(setup_path) as tmpfile: + df.to_hdf(tmpfile, "df", complevel=9) + result = pd.read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 9 + assert node.filters.complib == 
"zlib" + + # Set complib and check to see if compression is disabled + with ensure_clean_path(setup_path) as tmpfile: + df.to_hdf(tmpfile, "df", complib="zlib") + result = pd.read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 0 + assert node.filters.complib is None + + # Check if not setting complib or complevel results in no compression + with ensure_clean_path(setup_path) as tmpfile: + df.to_hdf(tmpfile, "df") + result = pd.read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 0 + assert node.filters.complib is None + + # Check if file-defaults can be overridden on a per table basis + with ensure_clean_path(setup_path) as tmpfile: + store = HDFStore(tmpfile) + store.append("dfc", df, complevel=9, complib="blosc") + store.append("df", df) + store.close() + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 0 + assert node.filters.complib is None + for node in h5file.walk_nodes(where="/dfc", classname="Leaf"): + assert node.filters.complevel == 9 + assert node.filters.complib == "blosc" + + +def test_complibs(setup_path): + # GH14478 + df = tm.makeDataFrame() + + # Building list of all complibs and complevels tuples + all_complibs = tables.filters.all_complibs + # Remove lzo if its not available on this platform + if not tables.which_lib_version("lzo"): + all_complibs.remove("lzo") + # Remove bzip2 if its not available on this platform + if not tables.which_lib_version("bzip2"): + all_complibs.remove("bzip2") + + all_levels = range(0, 10) + all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels] + + for (lib, lvl) in all_tests: + 
with ensure_clean_path(setup_path) as tmpfile: + gname = "foo" + + # Write and read file to see if data is consistent + df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) + result = pd.read_hdf(tmpfile, gname) + tm.assert_frame_equal(result, df) + + # Open file and check metadata + # for correct amount of compression + h5table = tables.open_file(tmpfile, mode="r") + for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"): + assert node.filters.complevel == lvl + if lvl == 0: + assert node.filters.complib is None + else: + assert node.filters.complib == lib + h5table.close() + + +@pytest.mark.skipif( + not is_platform_little_endian(), reason="reason platform is not little endian" +) +def test_encoding(setup_path): + + with ensure_clean_store(setup_path) as store: + df = DataFrame({"A": "foo", "B": "bar"}, index=range(5)) + df.loc[2, "A"] = np.nan + df.loc[3, "B"] = np.nan + _maybe_remove(store, "df") + store.append("df", df, encoding="ascii") + tm.assert_frame_equal(store["df"], df) + + expected = df.reindex(columns=["A"]) + result = store.select("df", Term("columns=A", encoding="ascii")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "val", + [ + [b"E\xc9, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"a", b"b", b"c"], + [b"EE, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], + [b"", b"a", b"b", b"c"], + [b"\xf8\xfc", b"a", b"b", b"c"], + [b"A\xf8\xfc", b"", b"a", b"b", b"c"], + [np.nan, b"", b"b", b"c"], + [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], + ], +) +@pytest.mark.parametrize("dtype", ["category", object]) +def test_latin_encoding(setup_path, dtype, val): + enc = "latin-1" + nan_rep = "" + key = "data" + + val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] + ser = Series(val, dtype=dtype) + + with ensure_clean_path(setup_path) as store: + ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) + retr = read_hdf(store, key) + + s_nan = ser.replace(nan_rep, 
np.nan) + + tm.assert_series_equal(s_nan, retr) + + +def test_multiple_open_close(setup_path): + # gh-4409: open & close multiple times + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path, "df", mode="w", format="table") + + # single + store = HDFStore(path) + assert "CLOSED" not in store.info() + assert store.is_open + + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + with ensure_clean_path(setup_path) as path: + + if pytables._table_file_open_policy_is_strict: + # multiples + store1 = HDFStore(path) + msg = ( + r"The file [\S]* is already opened\. Please close it before " + r"reopening in write mode\." + ) + with pytest.raises(ValueError, match=msg): + HDFStore(path) + + store1.close() + else: + + # multiples + store1 = HDFStore(path) + store2 = HDFStore(path) + + assert "CLOSED" not in store1.info() + assert "CLOSED" not in store2.info() + assert store1.is_open + assert store2.is_open + + store1.close() + assert "CLOSED" in store1.info() + assert not store1.is_open + assert "CLOSED" not in store2.info() + assert store2.is_open + + store2.close() + assert "CLOSED" in store1.info() + assert "CLOSED" in store2.info() + assert not store1.is_open + assert not store2.is_open + + # nested close + store = HDFStore(path, mode="w") + store.append("df", df) + + store2 = HDFStore(path) + store2.append("df2", df) + store2.close() + assert "CLOSED" in store2.info() + assert not store2.is_open + + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + # double closing + store = HDFStore(path, mode="w") + store.append("df", df) + + store2 = HDFStore(path) + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + store2.close() + assert "CLOSED" in store2.info() + assert not store2.is_open + + # ops on a closed store + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path, "df", mode="w", format="table") + + store = 
HDFStore(path) + store.close() + + msg = r"[\S]* file is not open!" + with pytest.raises(ClosedFileError, match=msg): + store.keys() + + with pytest.raises(ClosedFileError, match=msg): + "df" in store + + with pytest.raises(ClosedFileError, match=msg): + len(store) + + with pytest.raises(ClosedFileError, match=msg): + store["df"] + + with pytest.raises(ClosedFileError, match=msg): + store.select("df") + + with pytest.raises(ClosedFileError, match=msg): + store.get("df") + + with pytest.raises(ClosedFileError, match=msg): + store.append("df2", df) + + with pytest.raises(ClosedFileError, match=msg): + store.put("df3", df) + + with pytest.raises(ClosedFileError, match=msg): + store.get_storer("df2") + + with pytest.raises(ClosedFileError, match=msg): + store.remove("df2") + + with pytest.raises(ClosedFileError, match=msg): + store.select("df") + + msg = "'HDFStore' object has no attribute 'df'" + with pytest.raises(AttributeError, match=msg): + store.df + + +def test_fspath(): + with tm.ensure_clean("foo.h5") as path: + with HDFStore(path) as store: + assert os.fspath(store) == str(path) diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py new file mode 100644 index 0000000000000..4f939adeb4138 --- /dev/null +++ b/pandas/tests/io/pytables/test_keys.py @@ -0,0 +1,76 @@ +import pytest + +from pandas import DataFrame, HDFStore, _testing as tm +from pandas.tests.io.pytables.common import ( + ensure_clean_path, + ensure_clean_store, + tables, +) + +pytestmark = pytest.mark.single + + +def test_keys(setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() + + assert len(store) == 3 + expected = {"/a", "/b", "/c"} + assert set(store.keys()) == expected + assert set(store) == expected + + +def test_non_pandas_keys(setup_path): + class Table1(tables.IsDescription): + value1 = tables.Float32Col() + + class 
Table2(tables.IsDescription): + value2 = tables.Float32Col() + + class Table3(tables.IsDescription): + value3 = tables.Float32Col() + + with ensure_clean_path(setup_path) as path: + with tables.open_file(path, mode="w") as h5file: + group = h5file.create_group("/", "group") + h5file.create_table(group, "table1", Table1, "Table 1") + h5file.create_table(group, "table2", Table2, "Table 2") + h5file.create_table(group, "table3", Table3, "Table 3") + with HDFStore(path) as store: + assert len(store.keys(include="native")) == 3 + expected = {"/group/table1", "/group/table2", "/group/table3"} + assert set(store.keys(include="native")) == expected + assert set(store.keys(include="pandas")) == set() + for name in expected: + df = store.get(name) + assert len(df.columns) == 1 + + +def test_keys_illegal_include_keyword_value(setup_path): + with ensure_clean_store(setup_path) as store: + with pytest.raises( + ValueError, + match="`include` should be either 'pandas' or 'native' but is 'illegal'", + ): + store.keys(include="illegal") + + +def test_keys_ignore_hdf_softlink(setup_path): + + # GH 20523 + # Puts a softlink into HDF file and rereads + + with ensure_clean_store(setup_path) as store: + + df = DataFrame({"A": range(5), "B": range(5)}) + store.put("df", df) + + assert store.keys() == ["/df"] + + store._handle.create_soft_link(store._handle.root, "symlink", "df") + + # Should ignore the softlink + assert store.keys() == ["/df"] diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py new file mode 100644 index 0000000000000..5f6a39d46df97 --- /dev/null +++ b/pandas/tests/io/pytables/test_put.py @@ -0,0 +1,375 @@ +import datetime +import re +from warnings import catch_warnings, simplefilter + +import numpy as np +import pytest + +from pandas._libs.tslibs import Timestamp + +import pandas as pd +from pandas import ( + DataFrame, + HDFStore, + Index, + Int64Index, + MultiIndex, + RangeIndex, + Series, + _testing as tm, + concat, +) +from 
pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_path, + ensure_clean_store, +) +from pandas.util import _test_decorators as td + +pytestmark = pytest.mark.single + + +def test_format_type(setup_path): + df = DataFrame({"A": [1, 2]}) + with ensure_clean_path(setup_path) as path: + with HDFStore(path) as store: + store.put("a", df, format="fixed") + store.put("b", df, format="table") + + assert store.get_storer("a").format_type == "fixed" + assert store.get_storer("b").format_type == "table" + + +def test_format_kwarg_in_constructor(setup_path): + # GH 13291 + + msg = "format is not a defined argument for HDFStore" + + with tm.ensure_clean(setup_path) as path: + with pytest.raises(ValueError, match=msg): + HDFStore(path, format="table") + + +def test_api_default_format(setup_path): + + # default_format option + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + + pd.set_option("io.hdf.default_format", "fixed") + _maybe_remove(store, "df") + store.put("df", df) + assert not store.get_storer("df").is_table + + msg = "Can only append to Tables" + + with pytest.raises(ValueError, match=msg): + store.append("df2", df) + + pd.set_option("io.hdf.default_format", "table") + _maybe_remove(store, "df") + store.put("df", df) + assert store.get_storer("df").is_table + _maybe_remove(store, "df2") + store.append("df2", df) + assert store.get_storer("df").is_table + + pd.set_option("io.hdf.default_format", None) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + + pd.set_option("io.hdf.default_format", "fixed") + df.to_hdf(path, "df") + with HDFStore(path) as store: + assert not store.get_storer("df").is_table + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df2", append=True) + + pd.set_option("io.hdf.default_format", "table") + df.to_hdf(path, "df3") + with HDFStore(path) as store: + assert store.get_storer("df3").is_table + df.to_hdf(path, "df4", append=True) + with HDFStore(path) as store: 
+ assert store.get_storer("df4").is_table + + pd.set_option("io.hdf.default_format", None) + + +def test_put(setup_path): + + with ensure_clean_store(setup_path) as store: + + ts = tm.makeTimeSeries() + df = tm.makeTimeDataFrame() + store["a"] = ts + store["b"] = df[:10] + store["foo/bar/bah"] = df[:10] + store["foo"] = df[:10] + store["/foo"] = df[:10] + store.put("c", df[:10], format="table") + + # not OK, not a table + msg = "Can only append to Tables" + with pytest.raises(ValueError, match=msg): + store.put("b", df[10:], append=True) + + # node does not currently exist, test _is_table_type returns False + # in this case + _maybe_remove(store, "f") + with pytest.raises(ValueError, match=msg): + store.put("f", df[10:], append=True) + + # can't put to a table (use append instead) + with pytest.raises(ValueError, match=msg): + store.put("c", df[10:], append=True) + + # overwrite table + store.put("c", df[:10], format="table", append=False) + tm.assert_frame_equal(df[:10], store["c"]) + + +def test_put_string_index(setup_path): + + with ensure_clean_store(setup_path) as store: + + index = Index([f"I am a very long string index: {i}" for i in range(20)]) + s = Series(np.arange(20), index=index) + df = DataFrame({"A": s, "B": s}) + + store["a"] = s + tm.assert_series_equal(store["a"], s) + + store["b"] = df + tm.assert_frame_equal(store["b"], df) + + # mixed length + index = Index( + ["abcdefghijklmnopqrstuvwxyz1234567890"] + + [f"I am a very long string index: {i}" for i in range(20)] + ) + s = Series(np.arange(21), index=index) + df = DataFrame({"A": s, "B": s}) + store["a"] = s + tm.assert_series_equal(store["a"], s) + + store["b"] = df + tm.assert_frame_equal(store["b"], df) + + +def test_put_compression(setup_path): + + with ensure_clean_store(setup_path) as store: + df = tm.makeTimeDataFrame() + + store.put("c", df, format="table", complib="zlib") + tm.assert_frame_equal(store["c"], df) + + # can't compress if format='fixed' + msg = "Compression not supported on 
Fixed format stores" + with pytest.raises(ValueError, match=msg): + store.put("b", df, format="fixed", complib="zlib") + + +@td.skip_if_windows_python_3 +def test_put_compression_blosc(setup_path): + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + + # can't compress if format='fixed' + msg = "Compression not supported on Fixed format stores" + with pytest.raises(ValueError, match=msg): + store.put("b", df, format="fixed", complib="blosc") + + store.put("c", df, format="table", complib="blosc") + tm.assert_frame_equal(store["c"], df) + + +def test_put_mixed_type(setup_path): + df = tm.makeTimeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[df.index[3:6], ["obj1"]] = np.nan + df = df._consolidate()._convert(datetime=True) + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + # PerformanceWarning + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + store.put("df", df) + + expected = store.get("df") + tm.assert_frame_equal(expected, df) + + +def test_store_index_types(setup_path): + # GH5386 + # test storing various index types + + with ensure_clean_store(setup_path) as store: + + def check(format, index): + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) + df.index = index(len(df)) + + _maybe_remove(store, "df") + store.put("df", df, format=format) + tm.assert_frame_equal(df, store["df"]) + + for index in [ + tm.makeFloatIndex, + tm.makeStringIndex, + tm.makeIntIndex, + tm.makeDateIndex, + ]: + + check("table", index) + check("fixed", index) + + # period index currently broken for table + # seee GH7796 FIXME + check("fixed", 
tm.makePeriodIndex) + # check('table',tm.makePeriodIndex) + + # unicode + index = tm.makeUnicodeIndex + check("table", index) + check("fixed", index) + + +def test_column_multiindex(setup_path): + # GH 4710 + # recreate multi-indexes properly + + index = MultiIndex.from_tuples( + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"] + ) + df = DataFrame(np.arange(12).reshape(3, 4), columns=index) + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) + + with ensure_clean_store(setup_path) as store: + + store.put("df", df) + tm.assert_frame_equal( + store["df"], expected, check_index_type=True, check_column_type=True + ) + + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) + + msg = re.escape("cannot use a multi-index on axis [1] with data_columns ['A']") + with pytest.raises(ValueError, match=msg): + store.put("df2", df, format="table", data_columns=["A"]) + msg = re.escape("cannot use a multi-index on axis [1] with data_columns True") + with pytest.raises(ValueError, match=msg): + store.put("df3", df, format="table", data_columns=True) + + # appending multi-column on existing table (see GH 6167) + with ensure_clean_store(setup_path) as store: + store.append("df2", df) + store.append("df2", df) + + tm.assert_frame_equal(store["df2"], concat((df, df))) + + # non_index_axes name + df = DataFrame(np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo")) + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) + + with ensure_clean_store(setup_path) as store: + + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) + + +def test_store_multiindex(setup_path): + + # validate multi-index names + # GH 5527 + with ensure_clean_store(setup_path) as store: 
+ + def make_index(names=None): + return MultiIndex.from_tuples( + [ + (datetime.datetime(2013, 12, d), s, t) + for d in range(1, 3) + for s in range(2) + for t in range(3) + ], + names=names, + ) + + # no names + _maybe_remove(store, "df") + df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index()) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + # partial names + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", None, None]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + # series + _maybe_remove(store, "s") + s = Series(np.zeros(12), index=make_index(["date", None, None])) + store.append("s", s) + xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) + tm.assert_series_equal(store.select("s"), xp) + + # dup with column + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "a", "t"]), + ) + msg = "duplicate names/columns in the multi-index when storing as a table" + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + # dup within level + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "date", "date"]), + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + # fully names + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "s", "t"]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py new file mode 100644 index 0000000000000..5ca8960ae5604 --- /dev/null +++ b/pandas/tests/io/pytables/test_read.py @@ -0,0 +1,338 @@ +from pathlib import Path +import re + +import numpy as np +import pytest + +from pandas._libs.tslibs import Timestamp +from pandas.compat 
def test_read_missing_key_opened_store(setup_path):
    """Reading a missing key through an already-open store keeps it usable.

    Regression test for GH 28699.
    """
    missing_key_msg = "'No object named k2 in the file'"
    frame = DataFrame({"a": range(2), "b": range(2)})

    with ensure_clean_path(setup_path) as path:
        frame.to_hdf(path, "k1")

        with HDFStore(path, "r") as store:
            with pytest.raises(KeyError, match=missing_key_msg):
                pd.read_hdf(store, "k2")

            # A second read through the same handle only succeeds if the
            # KeyError above did not close the caller's store.
            pd.read_hdf(store, "k1")
def test_pytables_native_read(datapath, setup_path):
    """Open a natively written PyTables file and read a node as a DataFrame.

    The fixture path is passed to ``datapath`` one component at a time, like
    the other legacy-file tests in this module, instead of embedding a
    hard-coded ``/`` separator in a single component.
    """
    with ensure_clean_store(
        datapath("io", "data", "legacy_hdf", "pytables_native.h5"), mode="r"
    ) as store:
        d2 = store["detector/readout"]
        assert isinstance(d2, DataFrame)
def test_read_hdf_open_store(setup_path):
    """``read_hdf`` accepts an open ``HDFStore`` and leaves it open.

    GH10330: there was no check for a non-string ``path_or_buf`` and no test
    of passing an already-open store.
    """
    df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
    df.index.name = "letters"
    df = df.set_index(keys="E", append=True)

    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, "df", mode="w")
        direct = read_hdf(path, "df")

        # The context manager closes the handle even if an assertion below
        # fails, so a failing run does not leave the file open.
        with HDFStore(path, mode="r") as store:
            indirect = read_hdf(store, "df")
            tm.assert_frame_equal(direct, indirect)
            # read_hdf must not close a store it did not open itself.
            assert store.is_open
def test_read_nokey_table(setup_path):
    """``read_hdf`` without a key on table-format files (GH13231).

    With one dataset in the file the key may be omitted; once a second
    dataset exists the call must raise ``ValueError``.
    """
    frame = DataFrame(
        {"i": range(5), "c": Series(list("abacd"), dtype="category")}
    )
    write_opts = {"mode": "a", "format": "table"}
    ambiguous_msg = "key must be provided when HDF5 file contains multiple datasets."

    with ensure_clean_path(setup_path) as path:
        # Single dataset: the key can be inferred.
        frame.to_hdf(path, "df", **write_opts)
        tm.assert_frame_equal(frame, read_hdf(path))

        # Second dataset: omitting the key is now ambiguous.
        frame.to_hdf(path, "df2", **write_opts)
        with pytest.raises(ValueError, match=ambiguous_msg):
            read_hdf(path)
@pytest.mark.parametrize("format", ["fixed", "table"])
def test_read_hdf_series_mode_r(format, setup_path):
    """A Series written to HDF reads back unchanged when ``mode="r"`` is given.

    Regression test for GH 16583, run once per storage format.
    """
    expected = tm.makeFloatSeries()
    read_kwargs = {"key": "data", "mode": "r"}

    with ensure_clean_path(setup_path) as path:
        expected.to_hdf(path, key="data", format=format)
        actual = pd.read_hdf(path, **read_kwargs)

    tm.assert_series_equal(actual, expected)
def test_retain_index_attributes(setup_path):
    """Index attributes survive a table round trip (GH 3499).

    Also checks that appending rows with a different frequency resets the
    stored ``freq`` to ``None``, and that a fresh table accepts an irregular
    index followed by a regular one.
    """

    def _single_column_frame(index):
        # Every frame in this test is one column "A" holding 0, 1, 2.
        return DataFrame({"A": Series(range(3), index=index)})

    hourly = _single_column_frame(date_range("2000-1-1", periods=3, freq="H"))

    with ensure_clean_store(setup_path) as store:
        _maybe_remove(store, "data")
        store.put("data", hourly, format="table")

        stored = store.get("data")
        tm.assert_frame_equal(hourly, stored)

        for attr in ["freq", "tz", "name"]:
            for axis in ["index", "columns"]:
                before = getattr(getattr(hourly, axis), attr, None)
                after = getattr(getattr(stored, axis), attr, None)
                assert before == after

        # try to append a table with a different frequency
        with catch_warnings(record=True):
            daily = _single_column_frame(
                date_range("2002-1-1", periods=3, freq="D")
            )
            store.append("data", daily)

        assert store.get_storer("data").info["index"]["freq"] is None

        # this is ok
        _maybe_remove(store, "df2")
        irregular = _single_column_frame(
            [
                Timestamp("20010101"),
                Timestamp("20010102"),
                Timestamp("20020101"),
            ]
        )
        store.append("df2", irregular)
        regular = _single_column_frame(date_range("2002-1-1", periods=3, freq="D"))
        store.append("df2", regular)
def test_conv_read_write(setup_path):
    """Round-trip objects through the ``to_hdf`` / ``read_hdf`` helpers.

    The file comes from ``tm.ensure_clean()``; ``setup_path`` is accepted but
    not used by the body.
    """
    with tm.ensure_clean() as path:

        def _write_then_read(key, obj, **kwargs):
            obj.to_hdf(path, key, **kwargs)
            return read_hdf(path, key)

        # (key, factory, comparator), written and read back in this order.
        cases = [
            ("series", tm.makeTimeSeries, tm.assert_series_equal),
            ("string_series", tm.makeStringSeries, tm.assert_series_equal),
            ("frame", tm.makeDataFrame, tm.assert_frame_equal),
        ]
        for key, make, assert_equal in cases:
            original = make()
            assert_equal(original, _write_then_read(key, original))

        # table
        frame = DataFrame({"A": range(5), "B": range(5)})
        frame.to_hdf(path, "table", append=True)
        selected = read_hdf(path, "table", where=["index>2"])
        tm.assert_frame_equal(frame[frame.index > 2], selected)
tm.assert_frame_equal(read_hdf(path, "df"), df) + + with ensure_clean_store(setup_path) as store: + + df = tm.makeDataFrame() + + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=True, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + tm.assert_frame_equal(store.select("df"), df) + + # append to False + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + tm.assert_frame_equal(store.select("df"), df) + + # formats + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + tm.assert_frame_equal(store.select("df"), df) + + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format=None) + tm.assert_frame_equal(store.select("df"), df) + + with ensure_clean_path(setup_path) as path: + # Invalid. 
def test_table_values_dtypes_roundtrip(setup_path):
    """Column dtypes are preserved by table storage.

    Covers float64, int64 and float32 frames, the error raised when appending
    a float frame to an int table, and a frame mixing numeric, string, bool
    and datetime columns.
    """

    with ensure_clean_store(setup_path) as store:
        df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
        store.append("df_f8", df1)
        tm.assert_series_equal(df1.dtypes, store["df_f8"].dtypes)

        df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8")
        store.append("df_i8", df2)
        tm.assert_series_equal(df2.dtypes, store["df_i8"].dtypes)

        # incompatible dtype
        msg = re.escape(
            "invalid combination of [values_axes] on appending data "
            "[name->values_block_0,cname->values_block_0,"
            "dtype->float64,kind->float,shape->(1, 3)] vs "
            "current table [name->values_block_0,"
            "cname->values_block_0,dtype->int64,kind->integer,"
            "shape->None]"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df_i8", df1)

        # check creation/storage/retrieval of float32 (a bit hacky to
        # actually create them though)
        df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"])
        store.append("df_f4", df1)
        tm.assert_series_equal(df1.dtypes, store["df_f4"].dtypes)
        # ``dtypes`` is indexed by column label ("A"), so select the first
        # entry by position explicitly rather than via an integer key.
        assert df1.dtypes.iloc[0] == "float32"

        # check with mixed dtypes
        df1 = DataFrame(
            {
                c: Series(np.random.randint(5), dtype=c)
                for c in ["float32", "float64", "int32", "int64", "int16", "int8"]
            }
        )
        df1["string"] = "foo"
        df1["float322"] = 1.0
        df1["float322"] = df1["float322"].astype("float32")
        df1["bool"] = df1["float32"] > 0
        df1["time1"] = Timestamp("20130101")
        df1["time2"] = Timestamp("20130102")

        store.append("df_mixed_dtypes1", df1)
        result = store.select("df_mixed_dtypes1").dtypes.value_counts()
        # Compare on dtype names so the expected Series can use plain strings.
        result.index = [str(i) for i in result.index]
        expected = Series(
            {
                "float32": 2,
                "float64": 1,
                "int32": 1,
                "bool": 1,
                "int16": 1,
                "int8": 1,
                "int64": 1,
                "object": 1,
                "datetime64[ns]": 2,
            }
        )
        result = result.sort_index()
        expected = expected.sort_index()
        tm.assert_series_equal(result, expected)
@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
def test_index_types(setup_path):
    """Round-trip a two-element Series for a range of index types.

    Each case stores the Series and compares it with what is read back,
    including the index type. Cases that were verbatim repeats of earlier
    ones have been folded into a single entry.
    """
    values = np.random.randn(2)

    def assert_equal_with_index_type(left, right):
        tm.assert_series_equal(left, right, check_index_type=True)

    indexes = [
        [0, "y"],
        [datetime.datetime.today(), 0],
        ["y", 0],
        [datetime.date.today(), "a"],
        [1.23, "b"],
        [1, 1.53],
        [1, 5],
        [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)],
    ]

    for index in indexes:
        # A fresh recording context per case: warnings raised while storing
        # one index are captured here and do not affect the next case.
        with catch_warnings(record=True):
            ser = Series(values, index)
            _check_roundtrip(ser, assert_equal_with_index_type, path=setup_path)
@pytest.mark.parametrize(
    "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)]
)
def test_frame(compression, setup_path):
    """Round-trip DataFrames in table and default format.

    Covers a frame containing NaNs, a time-indexed frame, a frame that gained
    a column after construction (checked for consolidation on read), and an
    empty slice.
    """

    df = tm.makeDataFrame()

    # put in some NAs; assign through .iloc so the frame itself is modified
    # (writing into ``df.values`` only works when that array is a view)
    df.iloc[0, 0] = np.nan
    df.iloc[5, 3] = np.nan

    _check_roundtrip_table(
        df, tm.assert_frame_equal, path=setup_path, compression=compression
    )
    _check_roundtrip(
        df, tm.assert_frame_equal, path=setup_path, compression=compression
    )

    tdf = tm.makeTimeDataFrame()
    _check_roundtrip(
        tdf, tm.assert_frame_equal, path=setup_path, compression=compression
    )

    with ensure_clean_store(setup_path) as store:
        # not consolidated
        df["foo"] = np.random.randn(len(df))
        store["df"] = df
        recons = store["df"]
        assert recons._mgr.is_consolidated()

    # empty
    _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path)
@pytest.mark.parametrize(
    "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)]
)
def test_store_mixed(compression, setup_path):
    """Round-trip frames mixing float, object, bool and int columns.

    Whole frames are round-tripped and overwritten under one key; single
    columns are then round-tripped as Series with ``compression`` applied.
    """

    def _build_mixed_frame():
        frame = tm.makeDataFrame()
        frame["obj1"] = "foo"
        frame["obj2"] = "bar"
        frame["bool1"] = frame["A"] > 0
        frame["bool2"] = frame["B"] > 0
        frame["int1"] = 1
        frame["int2"] = 2
        return frame._consolidate()

    first = _build_mixed_frame()
    second = _build_mixed_frame()
    frames = (first, second)

    for frame in frames:
        _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)

    with ensure_clean_store(setup_path) as store:
        # The second assignment overwrites the first under the same key.
        for frame in frames:
            store["obj"] = frame
            tm.assert_frame_equal(store["obj"], frame)

    # check that can store Series of all of these types
    for column in ("obj1", "bool1", "int1"):
        _check_roundtrip(
            first[column],
            tm.assert_series_equal,
            path=setup_path,
            compression=compression,
        )
def _check_double_roundtrip(self, obj, comparator, path, compression=False, **kwargs):
    """Store ``obj``, read it back, store the copy and read that back too.

    Both retrieved objects are compared against ``obj`` with ``comparator``;
    extra keyword arguments are forwarded to ``comparator``.

    Parameters
    ----------
    self
        Unused. NOTE(review): looks like a leftover from a method-based
        version of this helper; kept so the positional signature is
        unchanged.
    obj
        Object written under the key ``"obj"``.
    comparator
        Callable invoked as ``comparator(retrieved, obj, **kwargs)``.
    path
        Passed to ``ensure_clean_store``.
    compression
        Falsy for no compression, ``True`` for ``_default_compressor``, or a
        string naming the compression library to use.
    """
    options = {}
    if compression:
        # A bare ``True`` is not a library name: map it to the default, as
        # the sibling round-trip helpers do. A string is passed through.
        options["complib"] = (
            compression if isinstance(compression, str) else _default_compressor
        )

    with ensure_clean_store(path, "w", **options) as store:
        store["obj"] = obj
        retrieved = store["obj"]
        comparator(retrieved, obj, **kwargs)
        # Second pass: what was read back must itself be storable unchanged.
        store["obj"] = retrieved
        again = store["obj"]
        comparator(again, obj, **kwargs)
def test_select_columns_in_where(setup_path):
    """Selecting columns via ``where`` keeps the MultiIndex (GH 6169).

    The column selection is given both as the ``columns`` keyword and inside
    the ``where`` expression, for a DataFrame and then for a Series.
    """
    multi = MultiIndex(
        levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["foo_name", "bar_name"],
    )

    # With a DataFrame
    frame = DataFrame(np.random.randn(10, 3), index=multi, columns=["A", "B", "C"])

    with ensure_clean_store(setup_path) as store:
        store.put("df", frame, format="table")
        only_a = frame[["A"]]

        # Keyword form first, then the same selection inside ``where``.
        for selection in ({"columns": ["A"]}, {"where": "columns=['A']"}):
            tm.assert_frame_equal(store.select("df", **selection), only_a)

    # With a Series
    series = Series(np.random.randn(10), index=multi, name="A")
    with ensure_clean_store(setup_path) as store:
        store.put("s", series, format="table")
        selected = store.select("s", where="columns=['A']")
        tm.assert_series_equal(selected, series)
def test_select(setup_path):
    """Select column subsets from a table, with and without data columns."""

    with ensure_clean_store(setup_path) as store:

        with catch_warnings(record=True):
            frame = tm.makeTimeDataFrame()
            ab_only = frame.reindex(columns=["A", "B"])

            # select with columns=
            _maybe_remove(store, "df")
            store.append("df", frame)
            tm.assert_frame_equal(ab_only, store.select("df", columns=["A", "B"]))

            # equivalently
            via_where = store.select("df", [("columns=['A', 'B']")])
            tm.assert_frame_equal(ab_only, via_where)

            # Filter on data column "A", re-creating the table each time:
            # (data_columns used when appending, columns requested back)
            scenarios = [
                (["A"], ["A", "B"]),  # with a data column
                (True, ["A", "B"]),  # all columns are data columns
                (["A"], ["C", "D"]),  # data column, but different columns
            ]
            for data_columns, wanted in scenarios:
                _maybe_remove(store, "df")
                store.append("df", frame, data_columns=data_columns)
                selected = store.select("df", ["A > 0"], columns=wanted)
                positive = frame[frame.A > 0].reindex(columns=wanted)
                tm.assert_frame_equal(positive, selected)
tm.assert_frame_equal(expected, result) + + expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa + for v in [False, "false", 0]: + result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) + tm.assert_frame_equal(expected, result) + + # integer index + df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) + _maybe_remove(store, "df_int") + store.append("df_int", df) + result = store.select("df_int", "index<10 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) + tm.assert_frame_equal(expected, result) + + # float index + df = DataFrame( + { + "A": np.random.rand(20), + "B": np.random.rand(20), + "index": np.arange(20, dtype="f8"), + } + ) + _maybe_remove(store, "df_float") + store.append("df_float", df) + result = store.select("df_float", "index<10.0 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) + tm.assert_frame_equal(expected, result) + + with ensure_clean_store(setup_path) as store: + + # floats w/o NaN + df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + + store.append("df1", df, data_columns=True) + result = store.select("df1", where="values>2.0") + expected = df[df["values"] > 2.0] + tm.assert_frame_equal(expected, result) + + # floats with NaN + df.iloc[0] = np.nan + expected = df[df["values"] > 2.0] + + store.append("df2", df, data_columns=True, index=False) + result = store.select("df2", where="values>2.0") + tm.assert_frame_equal(expected, result) + + # https://github.com/PyTables/PyTables/issues/282 + # bug in selection when 0th row has a np.nan and an index + # store.append('df3',df,data_columns=True) + # result = store.select( + # 'df3', where='values>2.0') + # tm.assert_frame_equal(expected, result) + + # not in first position float with NaN ok too + df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) 
+ + df.iloc[1] = np.nan + expected = df[df["values"] > 2.0] + + store.append("df4", df, data_columns=True) + result = store.select("df4", where="values>2.0") + tm.assert_frame_equal(expected, result) + + # test selection with comparison against numpy scalar + # GH 11283 + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + + expected = df[df["A"] > 0] + + store.append("df", df, data_columns=True) + np_zero = np.float64(0) # noqa + result = store.select("df", where=["A>np_zero"]) + tm.assert_frame_equal(expected, result) + + +def test_select_with_many_inputs(setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + { + "ts": bdate_range("2012-01-01", periods=300), + "A": np.random.randn(300), + "B": range(300), + "users": ["a"] * 50 + + ["b"] * 50 + + ["c"] * 100 + + [f"a{i:03d}" for i in range(100)], + } + ) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["ts", "A", "B", "users"]) + + # regular select + result = store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] + tm.assert_frame_equal(expected, result) + + # small selector + result = store.select("df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']") + expected = df[ + (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"]) + ] + tm.assert_frame_equal(expected, result) + + # big selector along the columns + selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)] + result = store.select("df", "ts>=Timestamp('2012-02-01') and users=selector") + expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] + tm.assert_frame_equal(expected, result) + + selector = range(100, 200) + result = store.select("df", "B=selector") + expected = df[df.B.isin(selector)] + tm.assert_frame_equal(expected, result) + assert len(result) == 100 + + # big selector along the index + selector = Index(df.ts[0:100].values) + result = store.select("df", "ts=selector") + expected = 
df[df.ts.isin(selector.values)] + tm.assert_frame_equal(expected, result) + assert len(result) == 100 + + +def test_select_iterator(setup_path): + + # single table + with ensure_clean_store(setup_path) as store: + + df = tm.makeTimeDataFrame(500) + _maybe_remove(store, "df") + store.append("df", df) + + expected = store.select("df") + + results = list(store.select("df", iterator=True)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + results = list(store.select("df", chunksize=100)) + assert len(results) == 5 + result = concat(results) + tm.assert_frame_equal(expected, result) + + results = list(store.select("df", chunksize=150)) + result = concat(results) + tm.assert_frame_equal(result, expected) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeTimeDataFrame(500) + df.to_hdf(path, "df_non_table") + + msg = "can only use an iterator or chunksize on a table" + with pytest.raises(TypeError, match=msg): + read_hdf(path, "df_non_table", chunksize=100) + + with pytest.raises(TypeError, match=msg): + read_hdf(path, "df_non_table", iterator=True) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeTimeDataFrame(500) + df.to_hdf(path, "df", format="table") + + results = list(read_hdf(path, "df", chunksize=100)) + result = concat(results) + + assert len(results) == 5 + tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, read_hdf(path, "df")) + + # multiple + + with ensure_clean_store(setup_path) as store: + + df1 = tm.makeTimeDataFrame(500) + store.append("df1", df1, data_columns=True) + df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format) + df2["foo"] = "bar" + store.append("df2", df2) + + df = concat([df1, df2], axis=1) + + # full selection + expected = store.select_as_multiple(["df1", "df2"], selector="df1") + results = list( + store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150) + ) + result = concat(results) + tm.assert_frame_equal(expected, result) + + +def 
test_select_iterator_complete_8014(setup_path): + + # GH 8014 + # using iterator and where clause + chunksize = 1e4 + + # no iterator + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[-1] + + # select w/o iteration and no where clause works + result = store.select("df") + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, single term, begin + # of range, works + where = f"index >= '{beg_dt}'" + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, single term, end + # of range, works + where = f"index <= '{end_dt}'" + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, inclusive range, + # works + where = f"index >= '{beg_dt}' & index <= '{end_dt}'" + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # with iterator, full range + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[-1] + + # select w/iterator and no where clause works + results = list(store.select("df", chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, single term, begin of range + where = f"index >= '{beg_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, single term, end of range + where = f"index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) 
+ + # select w/iterator and where clause, inclusive range + where = f"index >= '{beg_dt}' & index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + +def test_select_iterator_non_complete_8014(setup_path): + + # GH 8014 + # using iterator and where clause + chunksize = 1e4 + + # with iterator, non complete range + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[1] + end_dt = expected.index[-2] + + # select w/iterator and where clause, single term, begin of range + where = f"index >= '{beg_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[expected.index >= beg_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, single term, end of range + where = f"index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[expected.index <= end_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, inclusive range + where = f"index >= '{beg_dt}' & index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] + tm.assert_frame_equal(rexpected, result) + + # with iterator, empty where + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + end_dt = expected.index[-1] + + # select w/iterator and where clause, single term, begin of range + where = f"index > '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + assert 0 == len(results) + + +def 
test_select_iterator_many_empty_frames(setup_path): + + # GH 8014 + # using iterator and where clause can return many empty + # frames. + chunksize = 10_000 + + # with iterator, range limited to the first chunk + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100000, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[chunksize - 1] + + # select w/iterator and where clause, single term, begin of range + where = f"index >= '{beg_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[expected.index >= beg_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, single term, end of range + where = f"index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + + assert len(results) == 1 + result = concat(results) + rexpected = expected[expected.index <= end_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, inclusive range + where = f"index >= '{beg_dt}' & index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + + # should be 1, is 10 + assert len(results) == 1 + result = concat(results) + rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause which selects + # *nothing*. + # + # To be consistent with Python idiom I suggest this should + # return [] e.g. `for e in []: print True` never prints + # True. 
+ + where = f"index <= '{beg_dt}' & index >= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + + # should be [] + assert len(results) == 0 + + +def test_frame_select(setup_path): + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") + date = df.index[len(df) // 2] + + crit1 = Term("index>=date") + assert crit1.env.scope["date"] == date + + crit2 = "columns=['A', 'D']" + crit3 = "columns=A" + + result = store.select("frame", [crit1, crit2]) + expected = df.loc[date:, ["A", "D"]] + tm.assert_frame_equal(result, expected) + + result = store.select("frame", [crit3]) + expected = df.loc[:, ["A"]] + tm.assert_frame_equal(result, expected) + + # invalid terms + df = tm.makeTimeDataFrame() + store.append("df_time", df) + msg = "could not convert string to Timestamp" + with pytest.raises(ValueError, match=msg): + store.select("df_time", "index>0") + + # can't select if not written as table + # store['frame'] = df + # with pytest.raises(ValueError): + # store.select('frame', [crit1, crit2]) + + +def test_frame_select_complex(setup_path): + # select via complex criteria + + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df.loc[df.index[0:4], "string"] = "bar" + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table", data_columns=["string"]) + + # empty + result = store.select("df", 'index>df.index[3] & string="bar"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")] + tm.assert_frame_equal(result, expected) + + result = store.select("df", 'index>df.index[3] & string="foo"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")] + tm.assert_frame_equal(result, expected) + + # or + result = store.select("df", 'index>df.index[3] | string="bar"') + expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")] + tm.assert_frame_equal(result, expected) + + result = store.select( + "df", '(index>df.index[3] 
& index<=df.index[6]) | string="bar"' + ) + expected = df.loc[ + ((df.index > df.index[3]) & (df.index <= df.index[6])) + | (df.string == "bar") + ] + tm.assert_frame_equal(result, expected) + + # invert + result = store.select("df", 'string!="bar"') + expected = df.loc[df.string != "bar"] + tm.assert_frame_equal(result, expected) + + # invert not implemented in numexpr :( + msg = "cannot use an invert condition when passing to numexpr" + with pytest.raises(NotImplementedError, match=msg): + store.select("df", '~(string="bar")') + + # invert ok for filters + result = store.select("df", "~(columns=['A','B'])") + expected = df.loc[:, df.columns.difference(["A", "B"])] + tm.assert_frame_equal(result, expected) + + # in + result = store.select("df", "index>df.index[3] & columns in ['A','B']") + expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +def test_frame_select_complex2(setup_path): + + with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths: + + pp, hh = paths + + # use non-trivial selection criteria + parms = DataFrame({"A": [1, 1, 2, 2, 3]}) + parms.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"]) + + selection = read_hdf(pp, "df", where="A=[2,3]") + hist = DataFrame( + np.random.randn(25, 1), + columns=["data"], + index=MultiIndex.from_tuples( + [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"] + ), + ) + + hist.to_hdf(hh, "df", mode="w", format="table") + + expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") + + # scope with list like + l = selection.index.tolist() # noqa + store = HDFStore(hh) + result = store.select("df", where="l1=l") + tm.assert_frame_equal(result, expected) + store.close() + + result = read_hdf(hh, "df", where="l1=l") + tm.assert_frame_equal(result, expected) + + # index + index = selection.index # noqa + result = read_hdf(hh, "df", where="l1=index") + tm.assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", 
where="l1=selection.index") + tm.assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=selection.index.tolist()") + tm.assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=list(selection.index)") + tm.assert_frame_equal(result, expected) + + # scope with index + store = HDFStore(hh) + + result = store.select("df", where="l1=index") + tm.assert_frame_equal(result, expected) + + result = store.select("df", where="l1=selection.index") + tm.assert_frame_equal(result, expected) + + result = store.select("df", where="l1=selection.index.tolist()") + tm.assert_frame_equal(result, expected) + + result = store.select("df", where="l1=list(selection.index)") + tm.assert_frame_equal(result, expected) + + store.close() + + +def test_invalid_filtering(setup_path): + + # can't use more than one filter (atm) + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table") + + msg = "unable to collapse Joint Filters" + # not implemented + with pytest.raises(NotImplementedError, match=msg): + store.select("df", "columns=['A'] | columns=['B']") + + # in theory we could deal with this + with pytest.raises(NotImplementedError, match=msg): + store.select("df", "columns=['A','B'] & columns=['C']") + + +def test_string_select(setup_path): + # GH 2973 + with ensure_clean_store(setup_path) as store: + + df = tm.makeTimeDataFrame() + + # test string ==/!= + df["x"] = "none" + df.loc[df.index[2:7], "x"] = "" + + store.append("df", df, data_columns=["x"]) + + result = store.select("df", "x=none") + expected = df[df.x == "none"] + tm.assert_frame_equal(result, expected) + + result = store.select("df", "x!=none") + expected = df[df.x != "none"] + tm.assert_frame_equal(result, expected) + + df2 = df.copy() + df2.loc[df2.x == "", "x"] = np.nan + + store.append("df2", df2, data_columns=["x"]) + result = store.select("df2", "x!=none") + expected = df2[isna(df2.x)] + tm.assert_frame_equal(result, 
expected) + + # int ==/!= + df["int"] = 1 + df.loc[df.index[2:7], "int"] = 2 + + store.append("df3", df, data_columns=["int"]) + + result = store.select("df3", "int=2") + expected = df[df.int == 2] + tm.assert_frame_equal(result, expected) + + result = store.select("df3", "int!=2") + expected = df[df.int != 2] + tm.assert_frame_equal(result, expected) + + +def test_select_as_multiple(setup_path): + + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df2["foo"] = "bar" + + with ensure_clean_store(setup_path) as store: + + msg = "keys must be a list/tuple" + # no tables stored + with pytest.raises(TypeError, match=msg): + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") + + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) + + # exceptions + with pytest.raises(TypeError, match=msg): + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") + + with pytest.raises(TypeError, match=msg): + store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") + + msg = "'No object named df3 in the file'" + with pytest.raises(KeyError, match=msg): + store.select_as_multiple( + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) + + with pytest.raises(KeyError, match=msg): + store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") + + with pytest.raises(KeyError, match="'No object named df4 in the file'"): + store.select_as_multiple( + ["df1", "df2"], where=["A>0", "B>0"], selector="df4" + ) + + # default select + result = store.select("df1", ["A>0", "B>0"]) + expected = store.select_as_multiple( + ["df1"], where=["A>0", "B>0"], selector="df1" + ) + tm.assert_frame_equal(result, expected) + expected = store.select_as_multiple("df1", where=["A>0", "B>0"], selector="df1") + tm.assert_frame_equal(result, expected) + + # multiple + result = store.select_as_multiple( + ["df1", "df2"], where=["A>0", "B>0"], selector="df1" + ) + expected = concat([df1, df2], 
axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + # multiple (diff selector) + result = store.select_as_multiple( + ["df1", "df2"], where="index>df2.index[4]", selector="df2" + ) + expected = concat([df1, df2], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) + + # test exception for diff rows + store.append("df3", tm.makeTimeDataFrame(nper=50)) + msg = "all tables must have exactly the same nrows!" + with pytest.raises(ValueError, match=msg): + store.select_as_multiple( + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) + + +@pytest.mark.skipif( + LooseVersion(tables.__version__) < LooseVersion("3.1.0"), + reason=("tables version does not support fix for nan selection bug: GH 4858"), +) +def test_nan_selection_bug_4858(setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + df.iloc[0] = np.nan + + expected = DataFrame( + {"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]}, + index=[3, 4, 5], + ) + + # write w/o the index on that particular column + store.append("df", df, data_columns=True, index=["cols"]) + result = store.select("df", where="values>2.0") + tm.assert_frame_equal(result, expected) + + +def test_query_with_nested_special_character(setup_path): + df = DataFrame( + { + "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"], + "b": [1, 2, 3, 4, 5, 6, 7, 8], + } + ) + expected = df[df.a == "test & test"] + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + result = store.select("test", 'a = "test & test"') + tm.assert_frame_equal(expected, result) + + +def test_query_long_float_literal(setup_path): + # GH 14241 + df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) + + with ensure_clean_store(setup_path) as store: + store.append("test", df, 
format="table", data_columns=True) + + cutoff = 1000000000.0006 + result = store.select("test", f"A < {cutoff:.4f}") + assert result.empty + + cutoff = 1000000000.0010 + result = store.select("test", f"A > {cutoff:.4f}") + expected = df.loc[[1, 2], :] + tm.assert_frame_equal(expected, result) + + exact = 1000000000.0011 + result = store.select("test", f"A == {exact:.4f}") + expected = df.loc[[1], :] + tm.assert_frame_equal(expected, result) + + +def test_query_compare_column_type(setup_path): + # GH 15492 + df = DataFrame( + { + "date": ["2014-01-01", "2014-01-02"], + "real_date": date_range("2014-01-01", periods=2), + "float": [1.1, 1.2], + "int": [1, 2], + }, + columns=["date", "real_date", "float", "int"], + ) + + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + + ts = Timestamp("2014-01-01") # noqa + result = store.select("test", where="real_date > ts") + expected = df.loc[[1], :] + tm.assert_frame_equal(expected, result) + + for op in ["<", ">", "=="]: + # non strings to string column always fail + for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]: + query = f"date {op} v" + msg = f"Cannot compare {v} of type {type(v)} to string column" + with pytest.raises(TypeError, match=msg): + store.select("test", where=query) + + # strings to other columns must be convertible to type + v = "a" + for col in ["int", "float", "real_date"]: + query = f"{col} {op} v" + msg = "could not convert string to " + with pytest.raises(ValueError, match=msg): + store.select("test", where=query) + + for v, col in zip( + ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"] + ): + query = f"{col} {op} v" + result = store.select("test", where=query) + + if op == "==": + expected = df.loc[[0], :] + elif op == ">": + expected = df.loc[[1], :] + else: + expected = df.loc[[], :] + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) +def 
test_select_empty_where(where): + # GH26610 + + df = DataFrame([1, 2, 3]) + with ensure_clean_path("empty_where.h5") as path: + with HDFStore(path) as store: + store.put("df", df, "t") + result = pd.read_hdf(store, "df", where=where) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 131711a32d114..819f8ed9bc07f 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -1,36 +1,24 @@ import datetime -from datetime import timedelta -from distutils.version import LooseVersion import hashlib -from io import BytesIO import os -from pathlib import Path -import re import time from warnings import catch_warnings, simplefilter import numpy as np import pytest -from pandas.compat import is_platform_little_endian, is_platform_windows import pandas.util._test_decorators as td import pandas as pd from pandas import ( - Categorical, - CategoricalIndex, DataFrame, DatetimeIndex, Index, - Int64Index, MultiIndex, - RangeIndex, Series, Timestamp, - bdate_range, concat, date_range, - isna, timedelta_range, ) import pandas._testing as tm @@ -39,22 +27,8 @@ ensure_clean_path, ensure_clean_store, safe_close, - tables, ) -from pandas.io.pytables import ( - ClosedFileError, - HDFStore, - PossibleDataLossError, - Term, - _maybe_adjust_name, - read_hdf, -) - -from pandas.io import pytables as pytables # isort:skip -from pandas.io.pytables import TableIterator # isort:skip - - # TODO(ArrayManager) HDFStore relies on accessing the blocks pytestmark = td.skip_array_manager_not_yet_implemented @@ -64,4971 +38,972 @@ "ignore:object name:tables.exceptions.NaturalNameWarning" ) +from pandas.io.pytables import HDFStore, read_hdf -@pytest.mark.single -class TestHDFStore: - def test_format_type(self, setup_path): - df = DataFrame({"A": [1, 2]}) - with ensure_clean_path(setup_path) as path: - with HDFStore(path) as store: - store.put("a", df, format="fixed") - store.put("b", 
df, format="table") - - assert store.get_storer("a").format_type == "fixed" - assert store.get_storer("b").format_type == "table" - - def test_format_kwarg_in_constructor(self, setup_path): - # GH 13291 - - msg = "format is not a defined argument for HDFStore" +pytestmark = pytest.mark.single - with tm.ensure_clean(setup_path) as path: - with pytest.raises(ValueError, match=msg): - HDFStore(path, format="table") - def test_context(self, setup_path): - with tm.ensure_clean(setup_path) as path: - try: - with HDFStore(path) as tbl: - raise ValueError("blah") - except ValueError: - pass - with tm.ensure_clean(setup_path) as path: +def test_context(setup_path): + with tm.ensure_clean(setup_path) as path: + try: with HDFStore(path) as tbl: - tbl["a"] = tm.makeDataFrame() - assert len(tbl) == 1 - assert type(tbl["a"]) == DataFrame - - def test_conv_read_write(self, setup_path): - with tm.ensure_clean() as path: - - def roundtrip(key, obj, **kwargs): - obj.to_hdf(path, key, **kwargs) - return read_hdf(path, key) - - o = tm.makeTimeSeries() - tm.assert_series_equal(o, roundtrip("series", o)) - - o = tm.makeStringSeries() - tm.assert_series_equal(o, roundtrip("string_series", o)) - - o = tm.makeDataFrame() - tm.assert_frame_equal(o, roundtrip("frame", o)) - - # table - df = DataFrame({"A": range(5), "B": range(5)}) - df.to_hdf(path, "table", append=True) - result = read_hdf(path, "table", where=["index>2"]) - tm.assert_frame_equal(df[df.index > 2], result) - - def test_long_strings(self, setup_path): - - # GH6166 - df = DataFrame( - {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) - ) - - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=["a"]) - - result = store.select("df") - tm.assert_frame_equal(df, result) + raise ValueError("blah") + except ValueError: + pass + with tm.ensure_clean(setup_path) as path: + with HDFStore(path) as tbl: + tbl["a"] = tm.makeDataFrame() + assert len(tbl) == 1 + assert type(tbl["a"]) == 
DataFrame + + +def test_no_track_times(setup_path): + + # GH 32682 + # enables to set track_times (see `pytables` `create_table` documentation) + + def checksum(filename, hash_factory=hashlib.md5, chunk_num_blocks=128): + h = hash_factory() + with open(filename, "rb") as f: + for chunk in iter(lambda: f.read(chunk_num_blocks * h.block_size), b""): + h.update(chunk) + return h.digest() + + def create_h5_and_return_checksum(track_times): + with ensure_clean_path(setup_path) as path: + df = DataFrame({"a": [1]}) + + with HDFStore(path, mode="w") as hdf: + hdf.put( + "table", + df, + format="table", + data_columns=True, + index=None, + track_times=track_times, + ) - def test_api(self, setup_path): + return checksum(path) - # GH4584 - # API issue when to_hdf doesn't accept append AND format args - with ensure_clean_path(setup_path) as path: + checksum_0_tt_false = create_h5_and_return_checksum(track_times=False) + checksum_0_tt_true = create_h5_and_return_checksum(track_times=True) - df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, "df", append=True, format="table") - df.iloc[10:].to_hdf(path, "df", append=True, format="table") - tm.assert_frame_equal(read_hdf(path, "df"), df) + # sleep is necessary to create h5 with different creation time + time.sleep(1) - # append to False - df.iloc[:10].to_hdf(path, "df", append=False, format="table") - df.iloc[10:].to_hdf(path, "df", append=True, format="table") - tm.assert_frame_equal(read_hdf(path, "df"), df) + checksum_1_tt_false = create_h5_and_return_checksum(track_times=False) + checksum_1_tt_true = create_h5_and_return_checksum(track_times=True) - with ensure_clean_path(setup_path) as path: + # checksums are the same if track_time = False + assert checksum_0_tt_false == checksum_1_tt_false - df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, "df", append=True) - df.iloc[10:].to_hdf(path, "df", append=True, format="table") - tm.assert_frame_equal(read_hdf(path, "df"), df) + # checksums are NOT same if track_time = True + 
assert checksum_0_tt_true != checksum_1_tt_true - # append to False - df.iloc[:10].to_hdf(path, "df", append=False, format="table") - df.iloc[10:].to_hdf(path, "df", append=True) - tm.assert_frame_equal(read_hdf(path, "df"), df) - with ensure_clean_path(setup_path) as path: +def test_iter_empty(setup_path): - df = tm.makeDataFrame() - df.to_hdf(path, "df", append=False, format="fixed") - tm.assert_frame_equal(read_hdf(path, "df"), df) + with ensure_clean_store(setup_path) as store: + # GH 12221 + assert list(store) == [] - df.to_hdf(path, "df", append=False, format="f") - tm.assert_frame_equal(read_hdf(path, "df"), df) - df.to_hdf(path, "df", append=False) - tm.assert_frame_equal(read_hdf(path, "df"), df) +def test_repr(setup_path): - df.to_hdf(path, "df") - tm.assert_frame_equal(read_hdf(path, "df"), df) + with ensure_clean_store(setup_path) as store: + repr(store) + store.info() + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() - with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[df.index[3:6], ["obj1"]] = np.nan + df = df._consolidate()._convert(datetime=True) - path = store._path - df = tm.makeDataFrame() + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + store["df"] = df - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=True, format="table") - store.append("df", df.iloc[10:], append=True, format="table") - tm.assert_frame_equal(store.select("df"), df) + # make a random group in hdf space + store._handle.create_group(store._handle.root, "bah") - # append to False - 
_maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=False, format="table") - store.append("df", df.iloc[10:], append=True, format="table") - tm.assert_frame_equal(store.select("df"), df) + assert store.filename in repr(store) + assert store.filename in str(store) + store.info() - # formats - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=False, format="table") - store.append("df", df.iloc[10:], append=True, format="table") - tm.assert_frame_equal(store.select("df"), df) + # storers + with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=False, format="table") - store.append("df", df.iloc[10:], append=True, format=None) - tm.assert_frame_equal(store.select("df"), df) + df = tm.makeDataFrame() + store.append("df", df) - with ensure_clean_path(setup_path) as path: - # Invalid. - df = tm.makeDataFrame() + s = store.get_storer("df") + repr(s) + str(s) - msg = "Can only append to Tables" - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", append=True, format="f") +@pytest.mark.filterwarnings("ignore:object name:tables.exceptions.NaturalNameWarning") +def test_contains(setup_path): - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", append=True, format="fixed") + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() + store["foo/bar"] = tm.makeDataFrame() + assert "a" in store + assert "b" in store + assert "c" not in store + assert "foo/bar" in store + assert "/foo/bar" in store + assert "/foo/b" not in store + assert "bar" not in store - msg = r"invalid HDFStore format specified \[foo\]" + # gh-2694: tables.NaturalNameWarning + with catch_warnings(record=True): + store["node())"] = tm.makeDataFrame() + assert "node())" in store - with pytest.raises(TypeError, match=msg): - df.to_hdf(path, "df", append=True, format="foo") - with pytest.raises(TypeError, match=msg): - 
df.to_hdf(path, "df", append=False, format="foo") +def test_versioning(setup_path): - # File path doesn't exist - path = "" - msg = f"File {path} does not exist" + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() + df = tm.makeTimeDataFrame() + _maybe_remove(store, "df1") + store.append("df1", df[:10]) + store.append("df1", df[10:]) + assert store.root.a._v_attrs.pandas_version == "0.15.2" + assert store.root.b._v_attrs.pandas_version == "0.15.2" + assert store.root.df1._v_attrs.pandas_version == "0.15.2" - with pytest.raises(FileNotFoundError, match=msg): - read_hdf(path, "df") + # write a file and wipe its versioning + _maybe_remove(store, "df2") + store.append("df2", df) - def test_api_default_format(self, setup_path): + # this is an error because its table_type is appendable, but no + # version info + store.get_node("df2")._v_attrs.pandas_version = None - # default_format option - with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() + msg = "'NoneType' object has no attribute 'startswith'" - pd.set_option("io.hdf.default_format", "fixed") - _maybe_remove(store, "df") - store.put("df", df) - assert not store.get_storer("df").is_table + with pytest.raises(Exception, match=msg): + store.select("df2") - msg = "Can only append to Tables" - with pytest.raises(ValueError, match=msg): - store.append("df2", df) +@pytest.mark.parametrize( + "where, expected", + [ + ( + "/", + { + "": ({"first_group", "second_group"}, set()), + "/first_group": (set(), {"df1", "df2"}), + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ( + "/second_group", + { + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ], +) +def test_walk(where, expected, setup_path): + # GH10143 + objs = { + "df1": DataFrame([1, 2, 3]), + "df2": DataFrame([4, 5, 6]), + "df3": DataFrame([6, 7, 8]), + "df4": 
DataFrame([9, 10, 11]), + "s1": Series([10, 9, 8]), + # Next 3 items aren't pandas objects and should be ignored + "a1": np.array([[1, 2, 3], [4, 5, 6]]), + "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), + "tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"), + } + + with ensure_clean_store("walk_groups.hdf", mode="w") as store: + store.put("/first_group/df1", objs["df1"]) + store.put("/first_group/df2", objs["df2"]) + store.put("/second_group/df3", objs["df3"]) + store.put("/second_group/s1", objs["s1"]) + store.put("/second_group/third_group/df4", objs["df4"]) + # Create non-pandas objects + store._handle.create_array("/first_group", "a1", objs["a1"]) + store._handle.create_table("/first_group", "tb1", obj=objs["tb1"]) + store._handle.create_table("/second_group", "tb2", obj=objs["tb2"]) + + assert len(list(store.walk(where=where))) == len(expected) + for path, groups, leaves in store.walk(where=where): + assert path in expected + expected_groups, expected_frames = expected[path] + assert expected_groups == set(groups) + assert expected_frames == set(leaves) + for leaf in leaves: + frame_path = "/".join([path, leaf]) + obj = store.get(frame_path) + if "df" in leaf: + tm.assert_frame_equal(obj, objs[leaf]) + else: + tm.assert_series_equal(obj, objs[leaf]) - pd.set_option("io.hdf.default_format", "table") - _maybe_remove(store, "df") - store.put("df", df) - assert store.get_storer("df").is_table - _maybe_remove(store, "df2") - store.append("df2", df) - assert store.get_storer("df").is_table - pd.set_option("io.hdf.default_format", None) +def test_getattr(setup_path): - with ensure_clean_path(setup_path) as path: + with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() + s = tm.makeTimeSeries() + store["a"] = s - pd.set_option("io.hdf.default_format", "fixed") - df.to_hdf(path, "df") - with HDFStore(path) as store: - assert not store.get_storer("df").is_table - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df2", 
append=True) + # test attribute access + result = store.a + tm.assert_series_equal(result, s) + result = getattr(store, "a") + tm.assert_series_equal(result, s) - pd.set_option("io.hdf.default_format", "table") - df.to_hdf(path, "df3") - with HDFStore(path) as store: - assert store.get_storer("df3").is_table - df.to_hdf(path, "df4", append=True) - with HDFStore(path) as store: - assert store.get_storer("df4").is_table + df = tm.makeTimeDataFrame() + store["df"] = df + result = store.df + tm.assert_frame_equal(result, df) - pd.set_option("io.hdf.default_format", None) + # errors + for x in ["d", "mode", "path", "handle", "complib"]: + msg = f"'HDFStore' object has no attribute '{x}'" + with pytest.raises(AttributeError, match=msg): + getattr(store, x) - def test_keys(self, setup_path): + # not stores + for x in ["mode", "path", "handle", "complib"]: + getattr(store, f"_{x}") - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeStringSeries() - store["c"] = tm.makeDataFrame() - assert len(store) == 3 - expected = {"/a", "/b", "/c"} - assert set(store.keys()) == expected - assert set(store) == expected +def test_store_dropna(setup_path): + df_with_missing = DataFrame( + {"col1": [0.0, np.nan, 2.0], "col2": [1.0, np.nan, np.nan]}, + index=list("abc"), + ) + df_without_missing = DataFrame( + {"col1": [0.0, 2.0], "col2": [1.0, np.nan]}, index=list("ac") + ) - def test_no_track_times(self, setup_path): + # # Test to make sure defaults are to not drop. 
+ # # Corresponding to Issue 9382 + with ensure_clean_path(setup_path) as path: + df_with_missing.to_hdf(path, "df", format="table") + reloaded = read_hdf(path, "df") + tm.assert_frame_equal(df_with_missing, reloaded) - # GH 32682 - # enables to set track_times (see `pytables` `create_table` documentation) + with ensure_clean_path(setup_path) as path: + df_with_missing.to_hdf(path, "df", format="table", dropna=False) + reloaded = read_hdf(path, "df") + tm.assert_frame_equal(df_with_missing, reloaded) - def checksum(filename, hash_factory=hashlib.md5, chunk_num_blocks=128): - h = hash_factory() - with open(filename, "rb") as f: - for chunk in iter(lambda: f.read(chunk_num_blocks * h.block_size), b""): - h.update(chunk) - return h.digest() + with ensure_clean_path(setup_path) as path: + df_with_missing.to_hdf(path, "df", format="table", dropna=True) + reloaded = read_hdf(path, "df") + tm.assert_frame_equal(df_without_missing, reloaded) - def create_h5_and_return_checksum(track_times): - with ensure_clean_path(setup_path) as path: - df = DataFrame({"a": [1]}) - with HDFStore(path, mode="w") as hdf: - hdf.put( - "table", - df, - format="table", - data_columns=True, - index=None, - track_times=track_times, - ) +def test_to_hdf_with_min_itemsize(setup_path): - return checksum(path) + with ensure_clean_path(setup_path) as path: - checksum_0_tt_false = create_h5_and_return_checksum(track_times=False) - checksum_0_tt_true = create_h5_and_return_checksum(track_times=True) + # min_itemsize in index with to_hdf (GH 10381) + df = tm.makeMixedDataFrame().set_index("C") + df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6}) + # just make sure there is a longer string: + df2 = df.copy().reset_index().assign(C="longer").set_index("C") + df2.to_hdf(path, "ss3", append=True, format="table") + tm.assert_frame_equal(pd.read_hdf(path, "ss3"), pd.concat([df, df2])) - # sleep is necessary to create h5 with different creation time - time.sleep(1) + # same as above, with a 
Series + df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6}) + df2["B"].to_hdf(path, "ss4", append=True, format="table") + tm.assert_series_equal(pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]])) - checksum_1_tt_false = create_h5_and_return_checksum(track_times=False) - checksum_1_tt_true = create_h5_and_return_checksum(track_times=True) - # checksums are the same if track_time = False - assert checksum_0_tt_false == checksum_1_tt_false +@pytest.mark.parametrize("format", ["fixed", "table"]) +def test_to_hdf_errors(format, setup_path): - # checksums are NOT same if track_time = True - assert checksum_0_tt_true != checksum_1_tt_true + data = ["\ud800foo"] + ser = Series(data, index=Index(data)) + with ensure_clean_path(setup_path) as path: + # GH 20835 + ser.to_hdf(path, "table", format=format, errors="surrogatepass") - def test_non_pandas_keys(self, setup_path): - class Table1(tables.IsDescription): - value1 = tables.Float32Col() + result = pd.read_hdf(path, "table", errors="surrogatepass") + tm.assert_series_equal(result, ser) - class Table2(tables.IsDescription): - value2 = tables.Float32Col() - class Table3(tables.IsDescription): - value3 = tables.Float32Col() +def test_create_table_index(setup_path): - with ensure_clean_path(setup_path) as path: - with tables.open_file(path, mode="w") as h5file: - group = h5file.create_group("/", "group") - h5file.create_table(group, "table1", Table1, "Table 1") - h5file.create_table(group, "table2", Table2, "Table 2") - h5file.create_table(group, "table3", Table3, "Table 3") - with HDFStore(path) as store: - assert len(store.keys(include="native")) == 3 - expected = {"/group/table1", "/group/table2", "/group/table3"} - assert set(store.keys(include="native")) == expected - assert set(store.keys(include="pandas")) == set() - for name in expected: - df = store.get(name) - assert len(df.columns) == 1 - - def test_keys_illegal_include_keyword_value(self, setup_path): - with ensure_clean_store(setup_path) 
as store: - with pytest.raises( - ValueError, - match="`include` should be either 'pandas' or 'native' " - "but is 'illegal'", - ): - store.keys(include="illegal") - - def test_keys_ignore_hdf_softlink(self, setup_path): - - # GH 20523 - # Puts a softlink into HDF file and rereads - - with ensure_clean_store(setup_path) as store: - - df = DataFrame({"A": range(5), "B": range(5)}) - store.put("df", df) - - assert store.keys() == ["/df"] - - store._handle.create_soft_link(store._handle.root, "symlink", "df") - - # Should ignore the softlink - assert store.keys() == ["/df"] - - def test_iter_empty(self, setup_path): - - with ensure_clean_store(setup_path) as store: - # GH 12221 - assert list(store) == [] - - def test_repr(self, setup_path): - - with ensure_clean_store(setup_path) as store: - repr(store) - store.info() - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeStringSeries() - store["c"] = tm.makeDataFrame() - - df = tm.makeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["bool1"] = df["A"] > 0 - df["bool2"] = df["B"] > 0 - df["bool3"] = True - df["int1"] = 1 - df["int2"] = 2 - df["timestamp1"] = Timestamp("20010102") - df["timestamp2"] = Timestamp("20010103") - df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) - df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[df.index[3:6], ["obj1"]] = np.nan - df = df._consolidate()._convert(datetime=True) + with ensure_clean_store(setup_path) as store: - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - store["df"] = df - - # make a random group in hdf space - store._handle.create_group(store._handle.root, "bah") - - assert store.filename in repr(store) - assert store.filename in str(store) - store.info() - - # storers - with ensure_clean_store(setup_path) as store: - - df = tm.makeDataFrame() - store.append("df", df) - - s = store.get_storer("df") - repr(s) - str(s) - - @ignore_natural_naming_warning - def test_contains(self, setup_path): - - with 
ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeDataFrame() - store["foo/bar"] = tm.makeDataFrame() - assert "a" in store - assert "b" in store - assert "c" not in store - assert "foo/bar" in store - assert "/foo/bar" in store - assert "/foo/b" not in store - assert "bar" not in store - - # gh-2694: tables.NaturalNameWarning - with catch_warnings(record=True): - store["node())"] = tm.makeDataFrame() - assert "node())" in store + with catch_warnings(record=True): - def test_versioning(self, setup_path): + def col(t, column): + return getattr(store.get_storer(t).table.cols, column) - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeDataFrame() + # data columns df = tm.makeTimeDataFrame() - _maybe_remove(store, "df1") - store.append("df1", df[:10]) - store.append("df1", df[10:]) - assert store.root.a._v_attrs.pandas_version == "0.15.2" - assert store.root.b._v_attrs.pandas_version == "0.15.2" - assert store.root.df1._v_attrs.pandas_version == "0.15.2" - - # write a file and wipe its versioning - _maybe_remove(store, "df2") - store.append("df2", df) - - # this is an error because its table_type is appendable, but no - # version info - store.get_node("df2")._v_attrs.pandas_version = None - - msg = "'NoneType' object has no attribute 'startswith'" - - with pytest.raises(Exception, match=msg): - store.select("df2") - - def test_mode(self, setup_path): - - df = tm.makeTimeDataFrame() - - def check(mode): - - msg = r"[\S]* does not exist" - with ensure_clean_path(setup_path) as path: - - # constructor - if mode in ["r", "r+"]: - with pytest.raises(IOError, match=msg): - HDFStore(path, mode=mode) - - else: - store = HDFStore(path, mode=mode) - assert store._handle.mode == mode - store.close() - - with ensure_clean_path(setup_path) as path: - - # context - if mode in ["r", "r+"]: - with pytest.raises(IOError, match=msg): - with HDFStore(path, mode=mode) as store: - pass - 
else: - with HDFStore(path, mode=mode) as store: - assert store._handle.mode == mode - - with ensure_clean_path(setup_path) as path: - - # conv write - if mode in ["r", "r+"]: - with pytest.raises(IOError, match=msg): - df.to_hdf(path, "df", mode=mode) - df.to_hdf(path, "df", mode="w") - else: - df.to_hdf(path, "df", mode=mode) - - # conv read - if mode in ["w"]: - msg = ( - "mode w is not allowed while performing a read. " - r"Allowed modes are r, r\+ and a." - ) - with pytest.raises(ValueError, match=msg): - read_hdf(path, "df", mode=mode) - else: - result = read_hdf(path, "df", mode=mode) - tm.assert_frame_equal(result, df) - - def check_default_mode(): - - # read_hdf uses default mode - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", mode="w") - result = read_hdf(path, "df") - tm.assert_frame_equal(result, df) - - check("r") - check("r+") - check("a") - check("w") - check_default_mode() + df["string"] = "foo" + df["string2"] = "bar" + store.append("f", df, data_columns=["string", "string2"]) + assert col("f", "index").is_indexed is True + assert col("f", "string").is_indexed is True + assert col("f", "string2").is_indexed is True + + # specify index=columns + store.append("f2", df, index=["string"], data_columns=["string", "string2"]) + assert col("f2", "index").is_indexed is False + assert col("f2", "string").is_indexed is True + assert col("f2", "string2").is_indexed is False + + # try to index a non-table + _maybe_remove(store, "f2") + store.put("f2", df) + msg = "cannot create table index on a Fixed format store" + with pytest.raises(TypeError, match=msg): + store.create_table_index("f2") - def test_reopen_handle(self, setup_path): - with ensure_clean_path(setup_path) as path: +def test_create_table_index_data_columns_argument(setup_path): + # GH 28156 - store = HDFStore(path, mode="a") - store["a"] = tm.makeTimeSeries() + with ensure_clean_store(setup_path) as store: - msg = ( - r"Re-opening the file \[[\S]*\] with mode \[a\] will 
delete the " - "current file!" - ) - # invalid mode change - with pytest.raises(PossibleDataLossError, match=msg): - store.open("w") - - store.close() - assert not store.is_open - - # truncation ok here - store.open("w") - assert store.is_open - assert len(store) == 0 - store.close() - assert not store.is_open - - store = HDFStore(path, mode="a") - store["a"] = tm.makeTimeSeries() - - # reopen as read - store.open("r") - assert store.is_open - assert len(store) == 1 - assert store._mode == "r" - store.close() - assert not store.is_open - - # reopen as append - store.open("a") - assert store.is_open - assert len(store) == 1 - assert store._mode == "a" - store.close() - assert not store.is_open - - # reopen as append (again) - store.open("a") - assert store.is_open - assert len(store) == 1 - assert store._mode == "a" - store.close() - assert not store.is_open - - def test_open_args(self, setup_path): - - with tm.ensure_clean(setup_path) as path: - - df = tm.makeDataFrame() - - # create an in memory store - store = HDFStore( - path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 - ) - store["df"] = df - store.append("df2", df) - - tm.assert_frame_equal(store["df"], df) - tm.assert_frame_equal(store["df2"], df) - - store.close() - - # the file should not have actually been written - assert not os.path.exists(path) - - def test_flush(self, setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store.flush() - store.flush(fsync=True) - - def test_get(self, setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - left = store.get("a") - right = store["a"] - tm.assert_series_equal(left, right) - - left = store.get("/a") - right = store["/a"] - tm.assert_series_equal(left, right) - - with pytest.raises(KeyError, match="'No object named b in the file'"): - store.get("b") - - @pytest.mark.parametrize( - "where, expected", - [ - ( - "/", - { - "": ({"first_group", 
"second_group"}, set()), - "/first_group": (set(), {"df1", "df2"}), - "/second_group": ({"third_group"}, {"df3", "s1"}), - "/second_group/third_group": (set(), {"df4"}), - }, - ), - ( - "/second_group", - { - "/second_group": ({"third_group"}, {"df3", "s1"}), - "/second_group/third_group": (set(), {"df4"}), - }, - ), - ], - ) - def test_walk(self, where, expected, setup_path): - # GH10143 - objs = { - "df1": DataFrame([1, 2, 3]), - "df2": DataFrame([4, 5, 6]), - "df3": DataFrame([6, 7, 8]), - "df4": DataFrame([9, 10, 11]), - "s1": Series([10, 9, 8]), - # Next 3 items aren't pandas objects and should be ignored - "a1": np.array([[1, 2, 3], [4, 5, 6]]), - "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), - "tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"), - } + with catch_warnings(record=True): - with ensure_clean_store("walk_groups.hdf", mode="w") as store: - store.put("/first_group/df1", objs["df1"]) - store.put("/first_group/df2", objs["df2"]) - store.put("/second_group/df3", objs["df3"]) - store.put("/second_group/s1", objs["s1"]) - store.put("/second_group/third_group/df4", objs["df4"]) - # Create non-pandas objects - store._handle.create_array("/first_group", "a1", objs["a1"]) - store._handle.create_table("/first_group", "tb1", obj=objs["tb1"]) - store._handle.create_table("/second_group", "tb2", obj=objs["tb2"]) - - assert len(list(store.walk(where=where))) == len(expected) - for path, groups, leaves in store.walk(where=where): - assert path in expected - expected_groups, expected_frames = expected[path] - assert expected_groups == set(groups) - assert expected_frames == set(leaves) - for leaf in leaves: - frame_path = "/".join([path, leaf]) - obj = store.get(frame_path) - if "df" in leaf: - tm.assert_frame_equal(obj, objs[leaf]) - else: - tm.assert_series_equal(obj, objs[leaf]) - - def test_getattr(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - s = tm.makeTimeSeries() - store["a"] = s - - # test attribute access - 
result = store.a - tm.assert_series_equal(result, s) - result = getattr(store, "a") - tm.assert_series_equal(result, s) + def col(t, column): + return getattr(store.get_storer(t).table.cols, column) + # data columns df = tm.makeTimeDataFrame() - store["df"] = df - result = store.df - tm.assert_frame_equal(result, df) - - # errors - for x in ["d", "mode", "path", "handle", "complib"]: - msg = f"'HDFStore' object has no attribute '{x}'" - with pytest.raises(AttributeError, match=msg): - getattr(store, x) - - # not stores - for x in ["mode", "path", "handle", "complib"]: - getattr(store, f"_{x}") - - def test_put(self, setup_path): + df["string"] = "foo" + df["string2"] = "bar" + store.append("f", df, data_columns=["string"]) + assert col("f", "index").is_indexed is True + assert col("f", "string").is_indexed is True - with ensure_clean_store(setup_path) as store: + msg = "'Cols' object has no attribute 'string2'" + with pytest.raises(AttributeError, match=msg): + col("f", "string2").is_indexed - ts = tm.makeTimeSeries() - df = tm.makeTimeDataFrame() - store["a"] = ts - store["b"] = df[:10] - store["foo/bar/bah"] = df[:10] - store["foo"] = df[:10] - store["/foo"] = df[:10] - store.put("c", df[:10], format="table") - - # not OK, not a table - msg = "Can only append to Tables" - with pytest.raises(ValueError, match=msg): - store.put("b", df[10:], append=True) - - # node does not currently exist, test _is_table_type returns False - # in this case - _maybe_remove(store, "f") - with pytest.raises(ValueError, match=msg): - store.put("f", df[10:], append=True) - - # can't put to a table (use append instead) - with pytest.raises(ValueError, match=msg): - store.put("c", df[10:], append=True) - - # overwrite table - store.put("c", df[:10], format="table", append=False) - tm.assert_frame_equal(df[:10], store["c"]) - - def test_put_string_index(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - index = Index([f"I am a very long string index: {i}" for i in 
range(20)]) - s = Series(np.arange(20), index=index) - df = DataFrame({"A": s, "B": s}) - - store["a"] = s - tm.assert_series_equal(store["a"], s) - - store["b"] = df - tm.assert_frame_equal(store["b"], df) - - # mixed length - index = Index( - ["abcdefghijklmnopqrstuvwxyz1234567890"] - + [f"I am a very long string index: {i}" for i in range(20)] + # try to index a col which isn't a data_column + msg = ( + "column string2 is not a data_column.\n" + "In order to read column string2 you must reload the dataframe \n" + "into HDFStore and include string2 with the data_columns argument." ) - s = Series(np.arange(21), index=index) - df = DataFrame({"A": s, "B": s}) - store["a"] = s - tm.assert_series_equal(store["a"], s) - - store["b"] = df - tm.assert_frame_equal(store["b"], df) + with pytest.raises(AttributeError, match=msg): + store.create_table_index("f", columns=["string2"]) - def test_put_compression(self, setup_path): - with ensure_clean_store(setup_path) as store: - df = tm.makeTimeDataFrame() +def test_mi_data_columns(setup_path): + # GH 14435 + idx = MultiIndex.from_arrays( + [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] + ) + df = DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=True) + + actual = store.select("df", where="id == 1") + expected = df.iloc[[1], :] + tm.assert_frame_equal(actual, expected) + + +def test_table_mixed_dtypes(setup_path): + + # frame + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[df.index[3:6], ["obj1"]] = np.nan + df = df._consolidate()._convert(datetime=True) + + with 
ensure_clean_store(setup_path) as store: + store.append("df1_mixed", df) + tm.assert_frame_equal(store.select("df1_mixed"), df) + + +def test_calendar_roundtrip_issue(setup_path): + + # 8591 + # doc example from tseries holiday section + weekmask_egypt = "Sun Mon Tue Wed Thu" + holidays = [ + "2012-05-01", + datetime.datetime(2013, 5, 1), + np.datetime64("2014-05-01"), + ] + bday_egypt = pd.offsets.CustomBusinessDay( + holidays=holidays, weekmask=weekmask_egypt + ) + dt = datetime.datetime(2013, 4, 30) + dts = date_range(dt, periods=5, freq=bday_egypt) - store.put("c", df, format="table", complib="zlib") - tm.assert_frame_equal(store["c"], df) + s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split())) - # can't compress if format='fixed' - msg = "Compression not supported on Fixed format stores" - with pytest.raises(ValueError, match=msg): - store.put("b", df, format="fixed", complib="zlib") + with ensure_clean_store(setup_path) as store: - @td.skip_if_windows_python_3 - def test_put_compression_blosc(self, setup_path): - df = tm.makeTimeDataFrame() + store.put("fixed", s) + result = store.select("fixed") + tm.assert_series_equal(result, s) - with ensure_clean_store(setup_path) as store: + store.append("table", s) + result = store.select("table") + tm.assert_series_equal(result, s) - # can't compress if format='fixed' - msg = "Compression not supported on Fixed format stores" - with pytest.raises(ValueError, match=msg): - store.put("b", df, format="fixed", complib="blosc") - store.put("c", df, format="table", complib="blosc") - tm.assert_frame_equal(store["c"], df) +def test_remove(setup_path): - def test_complibs_default_settings(self, setup_path): - # GH15943 - df = tm.makeDataFrame() + with ensure_clean_store(setup_path) as store: - # Set complevel and check if complib is automatically set to - # default value - with ensure_clean_path(setup_path) as tmpfile: - df.to_hdf(tmpfile, "df", complevel=9) - result = pd.read_hdf(tmpfile, "df") - 
tm.assert_frame_equal(result, df) - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 9 - assert node.filters.complib == "zlib" - - # Set complib and check to see if compression is disabled - with ensure_clean_path(setup_path) as tmpfile: - df.to_hdf(tmpfile, "df", complib="zlib") - result = pd.read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 0 - assert node.filters.complib is None - - # Check if not setting complib or complevel results in no compression - with ensure_clean_path(setup_path) as tmpfile: - df.to_hdf(tmpfile, "df") - result = pd.read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 0 - assert node.filters.complib is None - - # Check if file-defaults can be overridden on a per table basis - with ensure_clean_path(setup_path) as tmpfile: - store = HDFStore(tmpfile) - store.append("dfc", df, complevel=9, complib="blosc") - store.append("df", df) - store.close() - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 0 - assert node.filters.complib is None - for node in h5file.walk_nodes(where="/dfc", classname="Leaf"): - assert node.filters.complevel == 9 - assert node.filters.complib == "blosc" - - def test_complibs(self, setup_path): - # GH14478 + ts = tm.makeTimeSeries() df = tm.makeDataFrame() + store["a"] = ts + store["b"] = df + _maybe_remove(store, "a") + assert len(store) == 1 + tm.assert_frame_equal(df, store["b"]) - # Building list of all complibs and complevels tuples - all_complibs = tables.filters.all_complibs - # Remove 
lzo if its not available on this platform - if not tables.which_lib_version("lzo"): - all_complibs.remove("lzo") - # Remove bzip2 if its not available on this platform - if not tables.which_lib_version("bzip2"): - all_complibs.remove("bzip2") - - all_levels = range(0, 10) - all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels] - - for (lib, lvl) in all_tests: - with ensure_clean_path(setup_path) as tmpfile: - gname = "foo" - - # Write and read file to see if data is consistent - df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) - result = pd.read_hdf(tmpfile, gname) - tm.assert_frame_equal(result, df) - - # Open file and check metadata - # for correct amount of compression - h5table = tables.open_file(tmpfile, mode="r") - for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"): - assert node.filters.complevel == lvl - if lvl == 0: - assert node.filters.complib is None - else: - assert node.filters.complib == lib - h5table.close() - - def test_put_integer(self, setup_path): - # non-date, non-string index - df = DataFrame(np.random.randn(50, 100)) - self._check_roundtrip(df, tm.assert_frame_equal, setup_path) - - def test_put_mixed_type(self, setup_path): - df = tm.makeTimeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["bool1"] = df["A"] > 0 - df["bool2"] = df["B"] > 0 - df["bool3"] = True - df["int1"] = 1 - df["int2"] = 2 - df["timestamp1"] = Timestamp("20010102") - df["timestamp2"] = Timestamp("20010103") - df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) - df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[df.index[3:6], ["obj1"]] = np.nan - df = df._consolidate()._convert(datetime=True) - - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df") - - # PerformanceWarning - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - store.put("df", df) - - expected = store.get("df") - tm.assert_frame_equal(expected, df) + _maybe_remove(store, "b") + 
assert len(store) == 0 - @pytest.mark.filterwarnings( - "ignore:object name:tables.exceptions.NaturalNameWarning" - ) - def test_append(self, setup_path): + # nonexistence + with pytest.raises( + KeyError, match="'No object named a_nonexistent_store in the file'" + ): + store.remove("a_nonexistent_store") - with ensure_clean_store(setup_path) as store: + # pathing + store["a"] = ts + store["b/foo"] = df + _maybe_remove(store, "foo") + _maybe_remove(store, "b/foo") + assert len(store) == 1 - # this is allowed by almost always don't want to do it - # tables.NaturalNameWarning): - with catch_warnings(record=True): + store["a"] = ts + store["b/foo"] = df + _maybe_remove(store, "b") + assert len(store) == 1 - df = tm.makeTimeDataFrame() - _maybe_remove(store, "df1") - store.append("df1", df[:10]) - store.append("df1", df[10:]) - tm.assert_frame_equal(store["df1"], df) - - _maybe_remove(store, "df2") - store.put("df2", df[:10], format="table") - store.append("df2", df[10:]) - tm.assert_frame_equal(store["df2"], df) - - _maybe_remove(store, "df3") - store.append("/df3", df[:10]) - store.append("/df3", df[10:]) - tm.assert_frame_equal(store["df3"], df) - - # this is allowed by almost always don't want to do it - # tables.NaturalNameWarning - _maybe_remove(store, "/df3 foo") - store.append("/df3 foo", df[:10]) - store.append("/df3 foo", df[10:]) - tm.assert_frame_equal(store["df3 foo"], df) - - # dtype issues - mizxed type in a single object column - df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) - df["mixed_column"] = "testing" - df.loc[2, "mixed_column"] = np.nan - _maybe_remove(store, "df") - store.append("df", df) - tm.assert_frame_equal(store["df"], df) - - # uints - test storage of uints - uint_data = DataFrame( - { - "u08": Series( - np.random.randint(0, high=255, size=5), dtype=np.uint8 - ), - "u16": Series( - np.random.randint(0, high=65535, size=5), dtype=np.uint16 - ), - "u32": Series( - np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32 - ), - 
"u64": Series( - [2 ** 58, 2 ** 59, 2 ** 60, 2 ** 61, 2 ** 62], - dtype=np.uint64, - ), - }, - index=np.arange(5), - ) - _maybe_remove(store, "uints") - store.append("uints", uint_data) - tm.assert_frame_equal(store["uints"], uint_data) - - # uints - test storage of uints in indexable columns - _maybe_remove(store, "uints") - # 64-bit indices not yet supported - store.append("uints", uint_data, data_columns=["u08", "u16", "u32"]) - tm.assert_frame_equal(store["uints"], uint_data) - - def test_append_series(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - # basic - ss = tm.makeStringSeries() - ts = tm.makeTimeSeries() - ns = Series(np.arange(100)) - - store.append("ss", ss) - result = store["ss"] - tm.assert_series_equal(result, ss) - assert result.name is None - - store.append("ts", ts) - result = store["ts"] - tm.assert_series_equal(result, ts) - assert result.name is None - - ns.name = "foo" - store.append("ns", ns) - result = store["ns"] - tm.assert_series_equal(result, ns) - assert result.name == ns.name - - # select on the values - expected = ns[ns > 60] - result = store.select("ns", "foo>60") - tm.assert_series_equal(result, expected) - - # select on the index and values - expected = ns[(ns > 70) & (ns.index < 90)] - result = store.select("ns", "foo>70 and index<90") - tm.assert_series_equal(result, expected) - - # multi-index - mi = DataFrame(np.random.randn(5, 1), columns=["A"]) - mi["B"] = np.arange(len(mi)) - mi["C"] = "foo" - mi.loc[3:5, "C"] = "bar" - mi.set_index(["C", "B"], inplace=True) - s = mi.stack() - s.index = s.index.droplevel(2) - store.append("mi", s) - tm.assert_series_equal(store["mi"], s) - - def test_store_index_types(self, setup_path): - # GH5386 - # test storing various index types - - with ensure_clean_store(setup_path) as store: - - def check(format, index): - df = DataFrame(np.random.randn(10, 2), columns=list("AB")) - df.index = index(len(df)) - - _maybe_remove(store, "df") - store.put("df", df, 
format=format) - tm.assert_frame_equal(df, store["df"]) - - for index in [ - tm.makeFloatIndex, - tm.makeStringIndex, - tm.makeIntIndex, - tm.makeDateIndex, - ]: - - check("table", index) - check("fixed", index) - - # period index currently broken for table - # seee GH7796 FIXME - check("fixed", tm.makePeriodIndex) - # check('table',tm.makePeriodIndex) - - # unicode - index = tm.makeUnicodeIndex - check("table", index) - check("fixed", index) - - @pytest.mark.skipif( - not is_platform_little_endian(), reason="reason platform is not little endian" - ) - def test_encoding(self, setup_path): - - with ensure_clean_store(setup_path) as store: - df = DataFrame({"A": "foo", "B": "bar"}, index=range(5)) - df.loc[2, "A"] = np.nan - df.loc[3, "B"] = np.nan - _maybe_remove(store, "df") - store.append("df", df, encoding="ascii") - tm.assert_frame_equal(store["df"], df) - - expected = df.reindex(columns=["A"]) - result = store.select("df", Term("columns=A", encoding="ascii")) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "val", - [ - [b"E\xc9, 17", b"", b"a", b"b", b"c"], - [b"E\xc9, 17", b"a", b"b", b"c"], - [b"EE, 17", b"", b"a", b"b", b"c"], - [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], - [b"", b"a", b"b", b"c"], - [b"\xf8\xfc", b"a", b"b", b"c"], - [b"A\xf8\xfc", b"", b"a", b"b", b"c"], - [np.nan, b"", b"b", b"c"], - [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], - ], - ) - @pytest.mark.parametrize("dtype", ["category", object]) - def test_latin_encoding(self, setup_path, dtype, val): - enc = "latin-1" - nan_rep = "" - key = "data" - - val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] - ser = Series(val, dtype=dtype) - - with ensure_clean_path(setup_path) as store: - ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) - retr = read_hdf(store, key) - - s_nan = ser.replace(nan_rep, np.nan) - - tm.assert_series_equal(s_nan, retr) - - def test_append_some_nans(self, setup_path): - - with ensure_clean_store(setup_path) 
as store: - df = DataFrame( - { - "A": Series(np.random.randn(20)).astype("int32"), - "A1": np.random.randn(20), - "A2": np.random.randn(20), - "B": "foo", - "C": "bar", - "D": Timestamp("20010101"), - "E": datetime.datetime(2001, 1, 2, 0, 0), - }, - index=np.arange(20), - ) - # some nans - _maybe_remove(store, "df1") - df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan - store.append("df1", df[:10]) - store.append("df1", df[10:]) - tm.assert_frame_equal(store["df1"], df) - - # first column - df1 = df.copy() - df1.loc[:, "A1"] = np.nan - _maybe_remove(store, "df1") - store.append("df1", df1[:10]) - store.append("df1", df1[10:]) - tm.assert_frame_equal(store["df1"], df1) - - # 2nd column - df2 = df.copy() - df2.loc[:, "A2"] = np.nan - _maybe_remove(store, "df2") - store.append("df2", df2[:10]) - store.append("df2", df2[10:]) - tm.assert_frame_equal(store["df2"], df2) - - # datetimes - df3 = df.copy() - df3.loc[:, "E"] = np.nan - _maybe_remove(store, "df3") - store.append("df3", df3[:10]) - store.append("df3", df3[10:]) - tm.assert_frame_equal(store["df3"], df3) - - def test_append_all_nans(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - df = DataFrame( - {"A1": np.random.randn(20), "A2": np.random.randn(20)}, - index=np.arange(20), - ) - df.loc[0:15, :] = np.nan - - # nan some entire rows (dropna=True) - _maybe_remove(store, "df") - store.append("df", df[:10], dropna=True) - store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df[-4:]) - - # nan some entire rows (dropna=False) - _maybe_remove(store, "df2") - store.append("df2", df[:10], dropna=False) - store.append("df2", df[10:], dropna=False) - tm.assert_frame_equal(store["df2"], df) - - # tests the option io.hdf.dropna_table - pd.set_option("io.hdf.dropna_table", False) - _maybe_remove(store, "df3") - store.append("df3", df[:10]) - store.append("df3", df[10:]) - tm.assert_frame_equal(store["df3"], df) - - pd.set_option("io.hdf.dropna_table", True) - _maybe_remove(store, 
"df4") - store.append("df4", df[:10]) - store.append("df4", df[10:]) - tm.assert_frame_equal(store["df4"], df[-4:]) - - # nan some entire rows (string are still written!) - df = DataFrame( - { - "A1": np.random.randn(20), - "A2": np.random.randn(20), - "B": "foo", - "C": "bar", - }, - index=np.arange(20), - ) + # __delitem__ + store["a"] = ts + store["b"] = df + del store["a"] + del store["b"] + assert len(store) == 0 - df.loc[0:15, :] = np.nan - - _maybe_remove(store, "df") - store.append("df", df[:10], dropna=True) - store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df) - - _maybe_remove(store, "df2") - store.append("df2", df[:10], dropna=False) - store.append("df2", df[10:], dropna=False) - tm.assert_frame_equal(store["df2"], df) - - # nan some entire rows (but since we have dates they are still - # written!) - df = DataFrame( - { - "A1": np.random.randn(20), - "A2": np.random.randn(20), - "B": "foo", - "C": "bar", - "D": Timestamp("20010101"), - "E": datetime.datetime(2001, 1, 2, 0, 0), - }, - index=np.arange(20), - ) - df.loc[0:15, :] = np.nan +def test_same_name_scoping(setup_path): - _maybe_remove(store, "df") - store.append("df", df[:10], dropna=True) - store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df) + with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df2") - store.append("df2", df[:10], dropna=False) - store.append("df2", df[10:], dropna=False) - tm.assert_frame_equal(store["df2"], df) + import pandas as pd - def test_store_dropna(self, setup_path): - df_with_missing = DataFrame( - {"col1": [0.0, np.nan, 2.0], "col2": [1.0, np.nan, np.nan]}, - index=list("abc"), - ) - df_without_missing = DataFrame( - {"col1": [0.0, 2.0], "col2": [1.0, np.nan]}, index=list("ac") + df = DataFrame( + np.random.randn(20, 2), index=pd.date_range("20130101", periods=20) ) + store.put("df", df, format="table") + expected = df[df.index > Timestamp("20130105")] - # # Test to make sure defaults are to 
not drop. - # # Corresponding to Issue 9382 - with ensure_clean_path(setup_path) as path: - df_with_missing.to_hdf(path, "df", format="table") - reloaded = read_hdf(path, "df") - tm.assert_frame_equal(df_with_missing, reloaded) - - with ensure_clean_path(setup_path) as path: - df_with_missing.to_hdf(path, "df", format="table", dropna=False) - reloaded = read_hdf(path, "df") - tm.assert_frame_equal(df_with_missing, reloaded) - - with ensure_clean_path(setup_path) as path: - df_with_missing.to_hdf(path, "df", format="table", dropna=True) - reloaded = read_hdf(path, "df") - tm.assert_frame_equal(df_without_missing, reloaded) - - def test_read_missing_key_close_store(self, setup_path): - # GH 25766 - with ensure_clean_path(setup_path) as path: - df = DataFrame({"a": range(2), "b": range(2)}) - df.to_hdf(path, "k1") - - with pytest.raises(KeyError, match="'No object named k2 in the file'"): - pd.read_hdf(path, "k2") - - # smoke test to test that file is properly closed after - # read with KeyError before another write - df.to_hdf(path, "k2") - - def test_read_missing_key_opened_store(self, setup_path): - # GH 28699 - with ensure_clean_path(setup_path) as path: - df = DataFrame({"a": range(2), "b": range(2)}) - df.to_hdf(path, "k1") - - with HDFStore(path, "r") as store: + result = store.select("df", "index>datetime.datetime(2013,1,5)") + tm.assert_frame_equal(result, expected) - with pytest.raises(KeyError, match="'No object named k2 in the file'"): - pd.read_hdf(store, "k2") + from datetime import datetime # noqa - # Test that the file is still open after a KeyError and that we can - # still read from it. 
- pd.read_hdf(store, "k1") + # technically an error, but allow it + result = store.select("df", "index>datetime.datetime(2013,1,5)") + tm.assert_frame_equal(result, expected) - def test_append_frame_column_oriented(self, setup_path): - with ensure_clean_store(setup_path) as store: + result = store.select("df", "index>datetime(2013,1,5)") + tm.assert_frame_equal(result, expected) - # column oriented - df = tm.makeTimeDataFrame() - df.index = df.index._with_freq(None) # freq doesnt round-trip - - _maybe_remove(store, "df1") - store.append("df1", df.iloc[:, :2], axes=["columns"]) - store.append("df1", df.iloc[:, 2:]) - tm.assert_frame_equal(store["df1"], df) - - result = store.select("df1", "columns=A") - expected = df.reindex(columns=["A"]) - tm.assert_frame_equal(expected, result) - - # selection on the non-indexable - result = store.select("df1", ("columns=A", "index=df.index[0:4]")) - expected = df.reindex(columns=["A"], index=df.index[0:4]) - tm.assert_frame_equal(expected, result) - - # this isn't supported - msg = re.escape( - "passing a filterable condition to a non-table indexer " - "[Filter: Not Initialized]" - ) - with pytest.raises(TypeError, match=msg): - store.select("df1", "columns=A and index>df.index[4]") - def test_append_with_different_block_ordering(self, setup_path): +def test_store_index_name(setup_path): + df = tm.makeDataFrame() + df.index.name = "foo" - # GH 4096; using same frames, but different block orderings - with ensure_clean_store(setup_path) as store: + with ensure_clean_store(setup_path) as store: + store["frame"] = df + recons = store["frame"] + tm.assert_frame_equal(recons, df) - for i in range(10): - df = DataFrame(np.random.randn(10, 2), columns=list("AB")) - df["index"] = range(10) - df["index"] += i * 10 - df["int64"] = Series([1] * len(df), dtype="int64") - df["int16"] = Series([1] * len(df), dtype="int16") +@pytest.mark.parametrize("table_format", ["table", "fixed"]) +def test_store_index_name_numpy_str(table_format, 
setup_path): + # GH #13492 + idx = Index( + pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]), + name="cols\u05d2", + ) + idx1 = Index( + pd.to_datetime([datetime.date(2010, 1, 1), datetime.date(2010, 1, 2)]), + name="rows\u05d0", + ) + df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) - if i % 2 == 0: - del df["int64"] - df["int64"] = Series([1] * len(df), dtype="int64") - if i % 3 == 0: - a = df.pop("A") - df["A"] = a + # This used to fail, returning numpy strings instead of python strings. + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format=table_format) + df2 = read_hdf(path, "df") - df.set_index("index", inplace=True) + tm.assert_frame_equal(df, df2, check_names=True) - store.append("df", df) + assert type(df2.index.name) == str + assert type(df2.columns.name) == str - # test a different ordering but with more fields (like invalid - # combinate) - with ensure_clean_store(setup_path) as store: - df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64") - df["int64"] = Series([1] * len(df), dtype="int64") - df["int16"] = Series([1] * len(df), dtype="int16") - store.append("df", df) +def test_store_series_name(setup_path): + df = tm.makeDataFrame() + series = df["A"] - # store additional fields in different blocks - df["int16_2"] = Series([1] * len(df), dtype="int16") - msg = re.escape( - "cannot match existing table structure for [int16] on appending data" - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df) + with ensure_clean_store(setup_path) as store: + store["series"] = series + recons = store["series"] + tm.assert_series_equal(recons, series) - # store multiple additional fields in different blocks - df["float_3"] = Series([1.0] * len(df), dtype="float64") - msg = re.escape( - "cannot match existing table structure for [A,B] on appending data" - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df) - def test_append_with_strings(self, 
setup_path): +@pytest.mark.filterwarnings("ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning") +def test_overwrite_node(setup_path): - with ensure_clean_store(setup_path) as store: - with catch_warnings(record=True): + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeDataFrame() + ts = tm.makeTimeSeries() + store["a"] = ts - def check_col(key, name, size): - assert ( - getattr(store.get_storer(key).table.description, name).itemsize - == size - ) - - # avoid truncation on elements - df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) - store.append("df_big", df) - tm.assert_frame_equal(store.select("df_big"), df) - check_col("df_big", "values_block_1", 15) - - # appending smaller string ok - df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]]) - store.append("df_big", df2) - expected = concat([df, df2]) - tm.assert_frame_equal(store.select("df_big"), expected) - check_col("df_big", "values_block_1", 15) - - # avoid truncation on elements - df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) - store.append("df_big2", df, min_itemsize={"values": 50}) - tm.assert_frame_equal(store.select("df_big2"), df) - check_col("df_big2", "values_block_1", 50) - - # bigger string on next append - store.append("df_new", df) - df_new = DataFrame( - [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]] - ) - msg = ( - r"Trying to store a string with len \[26\] in " - r"\[values_block_1\] column but\n" - r"this column has a limit of \[15\]!\n" - "Consider using min_itemsize to preset the sizes on these " - "columns" - ) - with pytest.raises(ValueError, match=msg): - store.append("df_new", df_new) + tm.assert_series_equal(store["a"], ts) - # min_itemsize on Series index (GH 11412) - df = tm.makeMixedDataFrame().set_index("C") - store.append("ss", df["B"], min_itemsize={"index": 4}) - tm.assert_series_equal(store.select("ss"), df["B"]) - # same as above, with data_columns=True - store.append( - "ss2", df["B"], data_columns=True, 
min_itemsize={"index": 4} - ) - tm.assert_series_equal(store.select("ss2"), df["B"]) - - # min_itemsize in index without appending (GH 10381) - store.put("ss3", df, format="table", min_itemsize={"index": 6}) - # just make sure there is a longer string: - df2 = df.copy().reset_index().assign(C="longer").set_index("C") - store.append("ss3", df2) - tm.assert_frame_equal(store.select("ss3"), pd.concat([df, df2])) - - # same as above, with a Series - store.put("ss4", df["B"], format="table", min_itemsize={"index": 6}) - store.append("ss4", df2["B"]) - tm.assert_series_equal( - store.select("ss4"), pd.concat([df["B"], df2["B"]]) - ) +@pytest.mark.filterwarnings( + "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" +) +def test_coordinates(setup_path): + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df") + store.append("df", df) + + # all + c = store.select_as_coordinates("df") + assert (c.values == np.arange(len(df.index))).all() + + # get coordinates back & test vs frame + _maybe_remove(store, "df") + + df = DataFrame({"A": range(5), "B": range(5)}) + store.append("df", df) + c = store.select_as_coordinates("df", ["index<3"]) + assert (c.values == np.arange(3)).all() + result = store.select("df", where=c) + expected = df.loc[0:2, :] + tm.assert_frame_equal(result, expected) + + c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) + assert (c.values == np.arange(2) + 3).all() + result = store.select("df", where=c) + expected = df.loc[3:4, :] + tm.assert_frame_equal(result, expected) + assert isinstance(c, Index) + + # multiple tables + _maybe_remove(store, "df1") + _maybe_remove(store, "df2") + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) - # with nans - _maybe_remove(store, "df") - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df.loc[df.index[1:4], "string"] = 
np.nan - df["string2"] = "bar" - df.loc[df.index[4:8], "string2"] = np.nan - df["string3"] = "bah" - df.loc[df.index[1:], "string3"] = np.nan - store.append("df", df) - result = store.select("df") - tm.assert_frame_equal(result, df) - - with ensure_clean_store(setup_path) as store: - - def check_col(key, name, size): - assert getattr( - store.get_storer(key).table.description, name - ).itemsize, size - - df = DataFrame({"A": "foo", "B": "bar"}, index=range(10)) - - # a min_itemsize that creates a data_column - _maybe_remove(store, "df") - store.append("df", df, min_itemsize={"A": 200}) - check_col("df", "A", 200) - assert store.get_storer("df").data_columns == ["A"] - - # a min_itemsize that creates a data_column2 - _maybe_remove(store, "df") - store.append("df", df, data_columns=["B"], min_itemsize={"A": 200}) - check_col("df", "A", 200) - assert store.get_storer("df").data_columns == ["B", "A"] - - # a min_itemsize that creates a data_column2 - _maybe_remove(store, "df") - store.append("df", df, data_columns=["B"], min_itemsize={"values": 200}) - check_col("df", "B", 200) - check_col("df", "values_block_0", 200) - assert store.get_storer("df").data_columns == ["B"] - - # infer the .typ on subsequent appends - _maybe_remove(store, "df") - store.append("df", df[:5], min_itemsize=200) - store.append("df", df[5:], min_itemsize=200) - tm.assert_frame_equal(store["df"], df) - - # invalid min_itemsize keys - df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"]) - _maybe_remove(store, "df") - msg = re.escape( - "min_itemsize has the key [foo] which is not an axis or data_column" - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df, min_itemsize={"foo": 20, "foobar": 20}) + c = store.select_as_coordinates("df1", ["A>0", "B>0"]) + df1_result = store.select("df1", c) + df2_result = store.select("df2", c) + result = concat([df1_result, df2_result], axis=1) - def test_append_with_empty_string(self, setup_path): + expected = 
concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) - with ensure_clean_store(setup_path) as store: + # pass array/mask as the coordinates + with ensure_clean_store(setup_path) as store: - # with all empty strings (GH 12242) - df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]}) - store.append("df", df[:-1], min_itemsize={"x": 1}) - store.append("df", df[-1:], min_itemsize={"x": 1}) - tm.assert_frame_equal(store.select("df"), df) + df = DataFrame( + np.random.randn(1000, 2), index=date_range("20000101", periods=1000) + ) + store.append("df", df) + c = store.select_column("df", "index") + where = c[DatetimeIndex(c).month == 5].index + expected = df.iloc[where] - def test_to_hdf_with_min_itemsize(self, setup_path): + # locations + result = store.select("df", where=where) + tm.assert_frame_equal(result, expected) - with ensure_clean_path(setup_path) as path: + # boolean + result = store.select("df", where=where) + tm.assert_frame_equal(result, expected) - # min_itemsize in index with to_hdf (GH 10381) - df = tm.makeMixedDataFrame().set_index("C") - df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6}) - # just make sure there is a longer string: - df2 = df.copy().reset_index().assign(C="longer").set_index("C") - df2.to_hdf(path, "ss3", append=True, format="table") - tm.assert_frame_equal(pd.read_hdf(path, "ss3"), pd.concat([df, df2])) - - # same as above, with a Series - df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6}) - df2["B"].to_hdf(path, "ss4", append=True, format="table") - tm.assert_series_equal( - pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]]) - ) + # invalid + msg = "cannot process expression" + with pytest.raises(ValueError, match=msg): + store.select("df", where=np.arange(len(df), dtype="float64")) - @pytest.mark.parametrize("format", ["fixed", "table"]) - def test_to_hdf_errors(self, format, setup_path): + with 
pytest.raises(ValueError, match=msg): + store.select("df", where=np.arange(len(df) + 1)) - data = ["\ud800foo"] - ser = Series(data, index=Index(data)) - with ensure_clean_path(setup_path) as path: - # GH 20835 - ser.to_hdf(path, "table", format=format, errors="surrogatepass") + with pytest.raises(ValueError, match=msg): + store.select("df", where=np.arange(len(df)), start=5) - result = pd.read_hdf(path, "table", errors="surrogatepass") - tm.assert_series_equal(result, ser) + with pytest.raises(ValueError, match=msg): + store.select("df", where=np.arange(len(df)), start=5, stop=10) - def test_append_with_data_columns(self, setup_path): + # selection with filter + selection = date_range("20000101", periods=500) + result = store.select("df", where="index in selection") + expected = df[df.index.isin(selection)] + tm.assert_frame_equal(result, expected) - with ensure_clean_store(setup_path) as store: - df = tm.makeTimeDataFrame() - df.iloc[0, df.columns.get_loc("B")] = 1.0 - _maybe_remove(store, "df") - store.append("df", df[:2], data_columns=["B"]) - store.append("df", df[2:]) - tm.assert_frame_equal(store["df"], df) - - # check that we have indices created - assert store._handle.root.df.table.cols.index.is_indexed is True - assert store._handle.root.df.table.cols.B.is_indexed is True - - # data column searching - result = store.select("df", "B>0") - expected = df[df.B > 0] - tm.assert_frame_equal(result, expected) - - # data column searching (with an indexable and a data_columns) - result = store.select("df", "B>0 and index>df.index[3]") - df_new = df.reindex(index=df.index[4:]) - expected = df_new[df_new.B > 0] - tm.assert_frame_equal(result, expected) - - # data column selection with a string data_column - df_new = df.copy() - df_new["string"] = "foo" - df_new.loc[df_new.index[1:4], "string"] = np.nan - df_new.loc[df_new.index[5:6], "string"] = "bar" - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["string"]) - result = store.select("df", 
"string='foo'") - expected = df_new[df_new.string == "foo"] - tm.assert_frame_equal(result, expected) - - # using min_itemsize and a data column - def check_col(key, name, size): - assert ( - getattr(store.get_storer(key).table.description, name).itemsize - == size - ) + # list + df = DataFrame(np.random.randn(10, 2)) + store.append("df2", df) + result = store.select("df2", where=[0, 3, 5]) + expected = df.iloc[[0, 3, 5]] + tm.assert_frame_equal(result, expected) - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df") - store.append( - "df", df_new, data_columns=["string"], min_itemsize={"string": 30} - ) - check_col("df", "string", 30) - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["string"], min_itemsize=30) - check_col("df", "string", 30) - _maybe_remove(store, "df") - store.append( - "df", df_new, data_columns=["string"], min_itemsize={"values": 30} - ) - check_col("df", "string", 30) - - with ensure_clean_store(setup_path) as store: - df_new["string2"] = "foobarbah" - df_new["string_block1"] = "foobarbah1" - df_new["string_block2"] = "foobarbah2" - _maybe_remove(store, "df") - store.append( - "df", - df_new, - data_columns=["string", "string2"], - min_itemsize={"string": 30, "string2": 40, "values": 50}, - ) - check_col("df", "string", 30) - check_col("df", "string2", 40) - check_col("df", "values_block_1", 50) - - with ensure_clean_store(setup_path) as store: - # multiple data columns - df_new = df.copy() - df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 - df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 - df_new["string"] = "foo" - - sl = df_new.columns.get_loc("string") - df_new.iloc[1:4, sl] = np.nan - df_new.iloc[5:6, sl] = "bar" - - df_new["string2"] = "foo" - sl = df_new.columns.get_loc("string2") - df_new.iloc[2:5, sl] = np.nan - df_new.iloc[7:8, sl] = "bar" - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) - result = store.select( - "df", 
"string='foo' and string2='foo' and A>0 and B<0" - ) - expected = df_new[ - (df_new.string == "foo") - & (df_new.string2 == "foo") - & (df_new.A > 0) - & (df_new.B < 0) - ] - tm.assert_frame_equal(result, expected, check_freq=False) - # FIXME: 2020-05-07 freq check randomly fails in the CI - - # yield an empty frame - result = store.select("df", "string='foo' and string2='cool'") - expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] - tm.assert_frame_equal(result, expected) - - with ensure_clean_store(setup_path) as store: - # doc example - df_dc = df.copy() - df_dc["string"] = "foo" - df_dc.loc[df_dc.index[4:6], "string"] = np.nan - df_dc.loc[df_dc.index[7:9], "string"] = "bar" - df_dc["string2"] = "cool" - df_dc["datetime"] = Timestamp("20010102") - df_dc = df_dc._convert(datetime=True) - df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan - - _maybe_remove(store, "df_dc") - store.append( - "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] - ) - result = store.select("df_dc", "B>0") - - expected = df_dc[df_dc.B > 0] - tm.assert_frame_equal(result, expected) - - result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] - tm.assert_frame_equal(result, expected, check_freq=False) - # FIXME: 2020-12-07 intermittent build failures here with freq of - # None instead of BDay(4) - - with ensure_clean_store(setup_path) as store: - # doc example part 2 - np.random.seed(1234) - index = date_range("1/1/2000", periods=8) - df_dc = DataFrame( - np.random.randn(8, 3), index=index, columns=["A", "B", "C"] - ) - df_dc["string"] = "foo" - df_dc.loc[df_dc.index[4:6], "string"] = np.nan - df_dc.loc[df_dc.index[7:9], "string"] = "bar" - df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() - df_dc["string2"] = "cool" + # boolean + where = [True] * 10 + where[-2] = False + result = store.select("df2", where=where) + expected = df.loc[where] + 
tm.assert_frame_equal(result, expected) - # on-disk operations - store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) + # start/stop + result = store.select("df2", start=5, stop=10) + expected = df[5:10] + tm.assert_frame_equal(result, expected) - result = store.select("df_dc", "B>0") - expected = df_dc[df_dc.B > 0] - tm.assert_frame_equal(result, expected) - result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] - tm.assert_frame_equal(result, expected) +def test_start_stop_table(setup_path): - def test_create_table_index(self, setup_path): + with ensure_clean_store(setup_path) as store: - with ensure_clean_store(setup_path) as store: + # table + df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) + store.append("df", df) - with catch_warnings(record=True): + result = store.select("df", "columns=['A']", start=0, stop=5) + expected = df.loc[0:4, ["A"]] + tm.assert_frame_equal(result, expected) - def col(t, column): - return getattr(store.get_storer(t).table.cols, column) - - # data columns - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df["string2"] = "bar" - store.append("f", df, data_columns=["string", "string2"]) - assert col("f", "index").is_indexed is True - assert col("f", "string").is_indexed is True - assert col("f", "string2").is_indexed is True - - # specify index=columns - store.append( - "f2", df, index=["string"], data_columns=["string", "string2"] - ) - assert col("f2", "index").is_indexed is False - assert col("f2", "string").is_indexed is True - assert col("f2", "string2").is_indexed is False + # out of range + result = store.select("df", "columns=['A']", start=30, stop=40) + assert len(result) == 0 + expected = df.loc[30:40, ["A"]] + tm.assert_frame_equal(result, expected) - # try to index a non-table - _maybe_remove(store, "f2") - store.put("f2", df) - msg = "cannot create table index on a Fixed format store" - 
with pytest.raises(TypeError, match=msg): - store.create_table_index("f2") - def test_create_table_index_data_columns_argument(self, setup_path): - # GH 28156 +def test_start_stop_multiple(setup_path): - with ensure_clean_store(setup_path) as store: + # GH 16209 + with ensure_clean_store(setup_path) as store: - with catch_warnings(record=True): + df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) - def col(t, column): - return getattr(store.get_storer(t).table.cols, column) - - # data columns - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df["string2"] = "bar" - store.append("f", df, data_columns=["string"]) - assert col("f", "index").is_indexed is True - assert col("f", "string").is_indexed is True - - msg = "'Cols' object has no attribute 'string2'" - with pytest.raises(AttributeError, match=msg): - col("f", "string2").is_indexed - - # try to index a col which isn't a data_column - msg = ( - "column string2 is not a data_column.\n" - "In order to read column string2 you must reload the dataframe \n" - "into HDFStore and include string2 with the data_columns argument." 
- ) - with pytest.raises(AttributeError, match=msg): - store.create_table_index("f", columns=["string2"]) - - def test_append_hierarchical(self, setup_path): - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["foo", "bar"], + store.append_to_multiple( + {"selector": ["foo"], "data": None}, df, selector="selector" ) - df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) - - with ensure_clean_store(setup_path) as store: - store.append("mi", df) - result = store.select("mi") - tm.assert_frame_equal(result, df) - - # GH 3748 - result = store.select("mi", columns=["A", "B"]) - expected = df.reindex(columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - with ensure_clean_path("test.hdf") as path: - df.to_hdf(path, "df", format="table") - result = read_hdf(path, "df", columns=["A", "B"]) - expected = df.reindex(columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - def test_column_multiindex(self, setup_path): - # GH 4710 - # recreate multi-indexes properly - - index = MultiIndex.from_tuples( - [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"] + result = store.select_as_multiple( + ["selector", "data"], selector="selector", start=0, stop=1 ) - df = DataFrame(np.arange(12).reshape(3, 4), columns=index) - expected = df.copy() - if isinstance(expected.index, RangeIndex): - expected.index = Int64Index(expected.index) - - with ensure_clean_store(setup_path) as store: - - store.put("df", df) - tm.assert_frame_equal( - store["df"], expected, check_index_type=True, check_column_type=True - ) - - store.put("df1", df, format="table") - tm.assert_frame_equal( - store["df1"], expected, check_index_type=True, check_column_type=True - ) + expected = df.loc[[0], ["foo", "bar"]] + tm.assert_frame_equal(result, expected) - msg = re.escape( - "cannot use a multi-index on axis [1] with data_columns 
['A']" - ) - with pytest.raises(ValueError, match=msg): - store.put("df2", df, format="table", data_columns=["A"]) - msg = re.escape( - "cannot use a multi-index on axis [1] with data_columns True" - ) - with pytest.raises(ValueError, match=msg): - store.put("df3", df, format="table", data_columns=True) - # appending multi-column on existing table (see GH 6167) - with ensure_clean_store(setup_path) as store: - store.append("df2", df) - store.append("df2", df) +def test_start_stop_fixed(setup_path): - tm.assert_frame_equal(store["df2"], concat((df, df))) + with ensure_clean_store(setup_path) as store: - # non_index_axes name + # fixed, GH 8287 df = DataFrame( - np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo") + {"A": np.random.rand(20), "B": np.random.rand(20)}, + index=pd.date_range("20130101", periods=20), ) - expected = df.copy() - if isinstance(expected.index, RangeIndex): - expected.index = Int64Index(expected.index) + store.put("df", df) - with ensure_clean_store(setup_path) as store: + result = store.select("df", start=0, stop=5) + expected = df.iloc[0:5, :] + tm.assert_frame_equal(result, expected) - store.put("df1", df, format="table") - tm.assert_frame_equal( - store["df1"], expected, check_index_type=True, check_column_type=True - ) + result = store.select("df", start=5, stop=10) + expected = df.iloc[5:10, :] + tm.assert_frame_equal(result, expected) - def test_store_multiindex(self, setup_path): - - # validate multi-index names - # GH 5527 - with ensure_clean_store(setup_path) as store: - - def make_index(names=None): - return MultiIndex.from_tuples( - [ - (datetime.datetime(2013, 12, d), s, t) - for d in range(1, 3) - for s in range(2) - for t in range(3) - ], - names=names, - ) + # out of range + result = store.select("df", start=30, stop=40) + expected = df.iloc[30:40, :] + tm.assert_frame_equal(result, expected) - # no names - _maybe_remove(store, "df") - df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index()) - 
store.append("df", df) - tm.assert_frame_equal(store.select("df"), df) - - # partial names - _maybe_remove(store, "df") - df = DataFrame( - np.zeros((12, 2)), - columns=["a", "b"], - index=make_index(["date", None, None]), - ) - store.append("df", df) - tm.assert_frame_equal(store.select("df"), df) - - # series - _maybe_remove(store, "s") - s = Series(np.zeros(12), index=make_index(["date", None, None])) - store.append("s", s) - xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) - tm.assert_series_equal(store.select("s"), xp) - - # dup with column - _maybe_remove(store, "df") - df = DataFrame( - np.zeros((12, 2)), - columns=["a", "b"], - index=make_index(["date", "a", "t"]), - ) - msg = "duplicate names/columns in the multi-index when storing as a table" - with pytest.raises(ValueError, match=msg): - store.append("df", df) - - # dup within level - _maybe_remove(store, "df") - df = DataFrame( - np.zeros((12, 2)), - columns=["a", "b"], - index=make_index(["date", "date", "date"]), - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df) - - # fully names - _maybe_remove(store, "df") - df = DataFrame( - np.zeros((12, 2)), - columns=["a", "b"], - index=make_index(["date", "s", "t"]), - ) - store.append("df", df) - tm.assert_frame_equal(store.select("df"), df) - - def test_select_columns_in_where(self, setup_path): - - # GH 6169 - # recreate multi-indexes when columns is passed - # in the `where` argument - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["foo_name", "bar_name"], - ) + # series + s = df.A + store.put("s", s) + result = store.select("s", start=0, stop=5) + expected = s.iloc[0:5] + tm.assert_series_equal(result, expected) - # With a DataFrame - df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) + result = store.select("s", start=5, stop=10) + expected = s.iloc[5:10] + 
tm.assert_series_equal(result, expected) - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table") - expected = df[["A"]] + # sparse; not implemented + df = tm.makeDataFrame() + df.iloc[3:5, 1:3] = np.nan + df.iloc[8:10, -2] = np.nan - tm.assert_frame_equal(store.select("df", columns=["A"]), expected) - tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected) +def test_select_filter_corner(setup_path): - # With a Series - s = Series(np.random.randn(10), index=index, name="A") - with ensure_clean_store(setup_path) as store: - store.put("s", s, format="table") - tm.assert_series_equal(store.select("s", where="columns=['A']"), s) + df = DataFrame(np.random.randn(50, 100)) + df.index = [f"{c:3d}" for c in df.index] + df.columns = [f"{c:3d}" for c in df.columns] - def test_mi_data_columns(self, setup_path): - # GH 14435 - idx = MultiIndex.from_arrays( - [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] - ) - df = DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=True) + crit = "columns=df.columns[:75]" + result = store.select("frame", [crit]) + tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) - actual = store.select("df", where="id == 1") - expected = df.iloc[[1], :] - tm.assert_frame_equal(actual, expected) + crit = "columns=df.columns[:75:2]" + result = store.select("frame", [crit]) + tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) - def test_pass_spec_to_storer(self, setup_path): - df = tm.makeDataFrame() +def test_path_pathlib(setup_path): + df = tm.makeDataFrame() - with ensure_clean_store(setup_path) as store: - store.put("df", df) - msg = ( - "cannot pass a column specification when reading a Fixed format " - "store. 
this store must be selected in its entirety" - ) - with pytest.raises(TypeError, match=msg): - store.select("df", columns=["A"]) - msg = ( - "cannot pass a where specification when reading from a Fixed " - "format store. this store must be selected in its entirety" - ) - with pytest.raises(TypeError, match=msg): - store.select("df", where=[("columns=A")]) + result = tm.round_trip_pathlib( + lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + ) + tm.assert_frame_equal(df, result) - def test_append_misc(self, setup_path): - with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() - store.append("df", df, chunksize=1) - result = store.select("df") - tm.assert_frame_equal(result, df) +@pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)]) +def test_contiguous_mixed_data_table(start, stop, setup_path): + # GH 17021 + df = DataFrame( + { + "a": Series([20111010, 20111011, 20111012]), + "b": Series(["ab", "cd", "ab"]), + } + ) - store.append("df1", df, expectedrows=10) - result = store.select("df1") - tm.assert_frame_equal(result, df) + with ensure_clean_store(setup_path) as store: + store.append("test_dataset", df) - # more chunksize in append tests - def check(obj, comparator): - for c in [10, 200, 1000]: - with ensure_clean_store(setup_path, mode="w") as store: - store.append("obj", obj, chunksize=c) - result = store.select("obj") - comparator(result, obj) + result = store.select("test_dataset", start=start, stop=stop) + tm.assert_frame_equal(df[start:stop], result) - df = tm.makeDataFrame() - df["string"] = "foo" - df["float322"] = 1.0 - df["float322"] = df["float322"].astype("float32") - df["bool"] = df["float322"] > 0 - df["time1"] = Timestamp("20130101") - df["time2"] = Timestamp("20130102") - check(df, tm.assert_frame_equal) - - # empty frame, GH4273 - with ensure_clean_store(setup_path) as store: - - # 0 len - df_empty = DataFrame(columns=list("ABC")) - store.append("df", df_empty) - with pytest.raises(KeyError, 
match="'No object named df in the file'"): - store.select("df") - - # repeated append of 0/non-zero frames - df = DataFrame(np.random.rand(10, 3), columns=list("ABC")) - store.append("df", df) - tm.assert_frame_equal(store.select("df"), df) - store.append("df", df_empty) - tm.assert_frame_equal(store.select("df"), df) - - # store - df = DataFrame(columns=list("ABC")) - store.put("df2", df) - tm.assert_frame_equal(store.select("df2"), df) - - def test_append_raise(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - # test append with invalid input to get good error messages - - # list in column - df = tm.makeDataFrame() - df["invalid"] = [["a"]] * len(df) - assert df.dtypes["invalid"] == np.object_ - msg = re.escape( - """Cannot serialize the column [invalid] -because its data contents are not [string] but [mixed] object dtype""" - ) - with pytest.raises(TypeError, match=msg): - store.append("df", df) - # multiple invalid columns - df["invalid2"] = [["a"]] * len(df) - df["invalid3"] = [["a"]] * len(df) - with pytest.raises(TypeError, match=msg): - store.append("df", df) - - # datetime with embedded nans as object - df = tm.makeDataFrame() - s = Series(datetime.datetime(2001, 1, 2), index=df.index) - s = s.astype(object) - s[0:5] = np.nan - df["invalid"] = s - assert df.dtypes["invalid"] == np.object_ - msg = "too many timezones in this block, create separate data columns" - with pytest.raises(TypeError, match=msg): - store.append("df", df) +def test_path_pathlib_hdfstore(setup_path): + df = tm.makeDataFrame() - # directly ndarray - msg = "value must be None, Series, or DataFrame" - with pytest.raises(TypeError, match=msg): - store.append("df", np.arange(10)) + def writer(path): + with HDFStore(path) as store: + df.to_hdf(store, "df") - # series directly - msg = re.escape( - "cannot properly create the storer for: " - "[group->df,value->]" - ) - with pytest.raises(TypeError, match=msg): - store.append("df", Series(np.arange(10))) + def 
reader(path): + with HDFStore(path) as store: + return pd.read_hdf(store, "df") - # appending an incompatible table - df = tm.makeDataFrame() - store.append("df", df) + result = tm.round_trip_pathlib(writer, reader) + tm.assert_frame_equal(df, result) - df["foo"] = "foo" - msg = re.escape( - "invalid combination of [non_index_axes] on appending data " - "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table " - "[(1, ['A', 'B', 'C', 'D'])]" - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df) - def test_table_index_incompatible_dtypes(self, setup_path): - df1 = DataFrame({"a": [1, 2, 3]}) - df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3)) +def test_pickle_path_localpath(setup_path): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib( + lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + ) + tm.assert_frame_equal(df, result) - with ensure_clean_store(setup_path) as store: - store.put("frame", df1, format="table") - msg = re.escape("incompatible kind in col [integer - datetime64]") - with pytest.raises(TypeError, match=msg): - store.put("frame", df2, format="table", append=True) - - def test_table_values_dtypes_roundtrip(self, setup_path): - - with ensure_clean_store(setup_path) as store: - df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") - store.append("df_f8", df1) - tm.assert_series_equal(df1.dtypes, store["df_f8"].dtypes) - - df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8") - store.append("df_i8", df2) - tm.assert_series_equal(df2.dtypes, store["df_i8"].dtypes) - - # incompatible dtype - msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_0,cname->values_block_0," - "dtype->float64,kind->float,shape->(1, 3)] vs " - "current table [name->values_block_0," - "cname->values_block_0,dtype->int64,kind->integer," - "shape->None]" - ) - with pytest.raises(ValueError, match=msg): - store.append("df_i8", df1) - - # check creation/storage/retrieval of float32 (a bit 
hacky to - # actually create them thought) - df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"]) - store.append("df_f4", df1) - tm.assert_series_equal(df1.dtypes, store["df_f4"].dtypes) - assert df1.dtypes[0] == "float32" - - # check with mixed dtypes - df1 = DataFrame( - { - c: Series(np.random.randint(5), dtype=c) - for c in ["float32", "float64", "int32", "int64", "int16", "int8"] - } - ) - df1["string"] = "foo" - df1["float322"] = 1.0 - df1["float322"] = df1["float322"].astype("float32") - df1["bool"] = df1["float32"] > 0 - df1["time1"] = Timestamp("20130101") - df1["time2"] = Timestamp("20130102") - - store.append("df_mixed_dtypes1", df1) - result = store.select("df_mixed_dtypes1").dtypes.value_counts() - result.index = [str(i) for i in result.index] - expected = Series( - { - "float32": 2, - "float64": 1, - "int32": 1, - "bool": 1, - "int16": 1, - "int8": 1, - "int64": 1, - "object": 1, - "datetime64[ns]": 2, - } - ) - result = result.sort_index() - expected = expected.sort_index() - tm.assert_series_equal(result, expected) - def test_table_mixed_dtypes(self, setup_path): +def test_path_localpath_hdfstore(setup_path): + df = tm.makeDataFrame() - # frame - df = tm.makeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["bool1"] = df["A"] > 0 - df["bool2"] = df["B"] > 0 - df["bool3"] = True - df["int1"] = 1 - df["int2"] = 2 - df["timestamp1"] = Timestamp("20010102") - df["timestamp2"] = Timestamp("20010103") - df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) - df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[df.index[3:6], ["obj1"]] = np.nan - df = df._consolidate()._convert(datetime=True) + def writer(path): + with HDFStore(path) as store: + df.to_hdf(store, "df") - with ensure_clean_store(setup_path) as store: - store.append("df1_mixed", df) - tm.assert_frame_equal(store.select("df1_mixed"), df) + def reader(path): + with HDFStore(path) as store: + return pd.read_hdf(store, "df") - def 
test_unimplemented_dtypes_table_columns(self, setup_path): + result = tm.round_trip_localpath(writer, reader) + tm.assert_frame_equal(df, result) - with ensure_clean_store(setup_path) as store: - dtypes = [("date", datetime.date(2001, 1, 2))] +def test_copy(setup_path): - # currently not supported dtypes #### - for n, f in dtypes: - df = tm.makeDataFrame() - df[n] = f - msg = re.escape(f"[{n}] is not implemented as a table column") - with pytest.raises(TypeError, match=msg): - store.append(f"df1_{n}", df) + with catch_warnings(record=True): - # frame - df = tm.makeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["datetime1"] = datetime.date(2001, 1, 2) - df = df._consolidate()._convert(datetime=True) + def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): + try: + store = HDFStore(f, "r") - with ensure_clean_store(setup_path) as store: - # this fails because we have a date in the object block...... - msg = re.escape( - """Cannot serialize the column [datetime1] -because its data contents are not [string] but [date] object dtype""" - ) - with pytest.raises(TypeError, match=msg): - store.append("df_unimplemented", df) - - def test_calendar_roundtrip_issue(self, setup_path): - - # 8591 - # doc example from tseries holiday section - weekmask_egypt = "Sun Mon Tue Wed Thu" - holidays = [ - "2012-05-01", - datetime.datetime(2013, 5, 1), - np.datetime64("2014-05-01"), - ] - bday_egypt = pd.offsets.CustomBusinessDay( - holidays=holidays, weekmask=weekmask_egypt - ) - dt = datetime.datetime(2013, 4, 30) - dts = date_range(dt, periods=5, freq=bday_egypt) + if new_f is None: + import tempfile - s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split())) + fd, new_f = tempfile.mkstemp() + tstore = store.copy(new_f, keys=keys, propindexes=propindexes, **kwargs) - with ensure_clean_store(setup_path) as store: + # check keys + if keys is None: + keys = store.keys() + assert set(keys) == set(tstore.keys()) - store.put("fixed", s) - result 
= store.select("fixed") - tm.assert_series_equal(result, s) + # check indices & nrows + for k in tstore.keys(): + if tstore.get_storer(k).is_table: + new_t = tstore.get_storer(k) + orig_t = store.get_storer(k) - store.append("table", s) - result = store.select("table") - tm.assert_series_equal(result, s) + assert orig_t.nrows == new_t.nrows - def test_append_with_timedelta(self, setup_path): - # GH 3577 - # append timedelta + # check propindixes + if propindexes: + for a in orig_t.axes: + if a.is_indexed: + assert new_t[a.name].is_indexed - df = DataFrame( - { - "A": Timestamp("20130101"), - "B": [ - Timestamp("20130101") + timedelta(days=i, seconds=10) - for i in range(10) - ], - } - ) - df["C"] = df["A"] - df["B"] - df.loc[3:5, "C"] = np.nan + finally: + safe_close(store) + safe_close(tstore) + try: + os.close(fd) + except (OSError, ValueError): + pass + os.remove(new_f) - with ensure_clean_store(setup_path) as store: + # new table + df = tm.makeDataFrame() - # table - _maybe_remove(store, "df") - store.append("df", df, data_columns=True) - result = store.select("df") - tm.assert_frame_equal(result, df) + with tm.ensure_clean() as path: + st = HDFStore(path) + st.append("df", df, data_columns=["A"]) + st.close() + do_copy(f=path) + do_copy(f=path, propindexes=False) - result = store.select("df", where="C<100000") - tm.assert_frame_equal(result, df) - result = store.select("df", where="C") - - # from the docs - with ensure_clean_path(setup_path) as path: - dfq = DataFrame( - np.random.randn(10, 4), - columns=list("ABCD"), - index=date_range("20130101", periods=10), - ) - dfq.to_hdf(path, "dfq", format="table", data_columns=True) - - # check ok - read_hdf( - path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']" - ) - read_hdf(path, "dfq", where="A>0 or C>0") - - # catch the invalid reference - with ensure_clean_path(setup_path) as path: - dfq = DataFrame( - np.random.randn(10, 4), - columns=list("ABCD"), - index=date_range("20130101", periods=10), - 
) - dfq.to_hdf(path, "dfq", format="table") - - msg = ( - r"The passed where expression: A>0 or C>0\n\s*" - r"contains an invalid variable reference\n\s*" - r"all of the variable references must be a reference to\n\s*" - r"an axis \(e.g. 'index' or 'columns'\), or a data_column\n\s*" - r"The currently defined references are: index,columns\n" - ) - with pytest.raises(ValueError, match=msg): - read_hdf(path, "dfq", where="A>0 or C>0") - - def test_same_name_scoping(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - import pandas as pd - - df = DataFrame( - np.random.randn(20, 2), index=pd.date_range("20130101", periods=20) - ) - store.put("df", df, format="table") - expected = df[df.index > Timestamp("20130105")] - - import datetime - - result = store.select("df", "index>datetime.datetime(2013,1,5)") - tm.assert_frame_equal(result, expected) - - from datetime import datetime # noqa - - # technically an error, but allow it - result = store.select("df", "index>datetime.datetime(2013,1,5)") - tm.assert_frame_equal(result, expected) - - result = store.select("df", "index>datetime(2013,1,5)") - tm.assert_frame_equal(result, expected) - - def test_series(self, setup_path): - - s = tm.makeStringSeries() - self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) - - ts = tm.makeTimeSeries() - self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path) - - ts2 = Series(ts.index, Index(ts.index, dtype=object)) - self._check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) - - ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) - self._check_roundtrip( - ts3, tm.assert_series_equal, path=setup_path, check_index_type=False - ) - - def test_float_index(self, setup_path): - - # GH #454 - index = np.random.randn(10) - s = Series(np.random.randn(10), index=index) - self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) - - def test_tuple_index(self, setup_path): - - # GH #492 - col = np.arange(10) - 
idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)] - data = np.random.randn(30).reshape((3, 10)) - DF = DataFrame(data, index=idx, columns=col) - - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - self._check_roundtrip(DF, tm.assert_frame_equal, path=setup_path) - - @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") - def test_index_types(self, setup_path): - - with catch_warnings(record=True): - values = np.random.randn(2) - - func = lambda l, r: tm.assert_series_equal(l, r, check_index_type=True) - - with catch_warnings(record=True): - ser = Series(values, [0, "y"]) - self._check_roundtrip(ser, func, path=setup_path) - - with catch_warnings(record=True): - ser = Series(values, [datetime.datetime.today(), 0]) - self._check_roundtrip(ser, func, path=setup_path) - - with catch_warnings(record=True): - ser = Series(values, ["y", 0]) - self._check_roundtrip(ser, func, path=setup_path) - - with catch_warnings(record=True): - ser = Series(values, [datetime.date.today(), "a"]) - self._check_roundtrip(ser, func, path=setup_path) - - with catch_warnings(record=True): - - ser = Series(values, [0, "y"]) - self._check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, [datetime.datetime.today(), 0]) - self._check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, ["y", 0]) - self._check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, [datetime.date.today(), "a"]) - self._check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, [1.23, "b"]) - self._check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, [1, 1.53]) - self._check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, [1, 5]) - self._check_roundtrip(ser, func, path=setup_path) - - ser = Series( - values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)] - ) - self._check_roundtrip(ser, func, path=setup_path) - - def test_timeseries_preepoch(self, setup_path): - - dr = 
bdate_range("1/1/1940", "1/1/1960") - ts = Series(np.random.randn(len(dr)), index=dr) - try: - self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path) - except OverflowError: - pytest.skip("known failer on some windows platforms") - - @pytest.mark.parametrize( - "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] - ) - def test_frame(self, compression, setup_path): - - df = tm.makeDataFrame() - - # put in some random NAs - df.values[0, 0] = np.nan - df.values[5, 3] = np.nan - - self._check_roundtrip_table( - df, tm.assert_frame_equal, path=setup_path, compression=compression - ) - self._check_roundtrip( - df, tm.assert_frame_equal, path=setup_path, compression=compression - ) - - tdf = tm.makeTimeDataFrame() - self._check_roundtrip( - tdf, tm.assert_frame_equal, path=setup_path, compression=compression - ) - - with ensure_clean_store(setup_path) as store: - # not consolidated - df["foo"] = np.random.randn(len(df)) - store["df"] = df - recons = store["df"] - assert recons._mgr.is_consolidated() - - # empty - self._check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) - - def test_empty_series_frame(self, setup_path): - s0 = Series(dtype=object) - s1 = Series(name="myseries", dtype=object) - df0 = DataFrame() - df1 = DataFrame(index=["a", "b", "c"]) - df2 = DataFrame(columns=["d", "e", "f"]) - - self._check_roundtrip(s0, tm.assert_series_equal, path=setup_path) - self._check_roundtrip(s1, tm.assert_series_equal, path=setup_path) - self._check_roundtrip(df0, tm.assert_frame_equal, path=setup_path) - self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) - self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) - - @pytest.mark.parametrize( - "dtype", [np.int64, np.float64, object, "m8[ns]", "M8[ns]"] - ) - def test_empty_series(self, dtype, setup_path): - s = Series(dtype=dtype) - self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) - - def test_can_serialize_dates(self, setup_path): - 
- rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")] - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - - self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) - - def test_store_hierarchical(self, setup_path): - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["foo", "bar"], - ) - frame = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) - - self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) - self._check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) - self._check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) - - # check that the names are stored - with ensure_clean_store(setup_path) as store: - store["frame"] = frame - recons = store["frame"] - tm.assert_frame_equal(recons, frame) - - def test_store_index_name(self, setup_path): - df = tm.makeDataFrame() - df.index.name = "foo" - - with ensure_clean_store(setup_path) as store: - store["frame"] = df - recons = store["frame"] - tm.assert_frame_equal(recons, df) - - @pytest.mark.parametrize("table_format", ["table", "fixed"]) - def test_store_index_name_numpy_str(self, table_format, setup_path): - # GH #13492 - idx = Index( - pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]), - name="cols\u05d2", - ) - idx1 = Index( - pd.to_datetime([datetime.date(2010, 1, 1), datetime.date(2010, 1, 2)]), - name="rows\u05d0", - ) - df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) - - # This used to fail, returning numpy strings instead of python strings. 
- with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format=table_format) - df2 = read_hdf(path, "df") - - tm.assert_frame_equal(df, df2, check_names=True) - - assert type(df2.index.name) == str - assert type(df2.columns.name) == str - - def test_store_series_name(self, setup_path): - df = tm.makeDataFrame() - series = df["A"] - - with ensure_clean_store(setup_path) as store: - store["series"] = series - recons = store["series"] - tm.assert_series_equal(recons, series) - - @pytest.mark.parametrize( - "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] - ) - def test_store_mixed(self, compression, setup_path): - def _make_one(): - df = tm.makeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["bool1"] = df["A"] > 0 - df["bool2"] = df["B"] > 0 - df["int1"] = 1 - df["int2"] = 2 - return df._consolidate() - - df1 = _make_one() - df2 = _make_one() - - self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) - self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) - - with ensure_clean_store(setup_path) as store: - store["obj"] = df1 - tm.assert_frame_equal(store["obj"], df1) - store["obj"] = df2 - tm.assert_frame_equal(store["obj"], df2) - - # check that can store Series of all of these types - self._check_roundtrip( - df1["obj1"], - tm.assert_series_equal, - path=setup_path, - compression=compression, - ) - self._check_roundtrip( - df1["bool1"], - tm.assert_series_equal, - path=setup_path, - compression=compression, - ) - self._check_roundtrip( - df1["int1"], - tm.assert_series_equal, - path=setup_path, - compression=compression, - ) - - @pytest.mark.filterwarnings( - "ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning" - ) - def test_select_with_dups(self, setup_path): - - # single dtypes - df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) - df.index = date_range("20130101 9:30", periods=10, freq="T") - - with ensure_clean_store(setup_path) as store: - store.append("df", 
df) - - result = store.select("df") - expected = df - tm.assert_frame_equal(result, expected, by_blocks=True) - - result = store.select("df", columns=df.columns) - expected = df - tm.assert_frame_equal(result, expected, by_blocks=True) - - result = store.select("df", columns=["A"]) - expected = df.loc[:, ["A"]] - tm.assert_frame_equal(result, expected) - - # dups across dtypes - df = concat( - [ - DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), - DataFrame( - np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] - ), - ], - axis=1, - ) - df.index = date_range("20130101 9:30", periods=10, freq="T") - - with ensure_clean_store(setup_path) as store: - store.append("df", df) - - result = store.select("df") - expected = df - tm.assert_frame_equal(result, expected, by_blocks=True) - - result = store.select("df", columns=df.columns) - expected = df - tm.assert_frame_equal(result, expected, by_blocks=True) - - expected = df.loc[:, ["A"]] - result = store.select("df", columns=["A"]) - tm.assert_frame_equal(result, expected, by_blocks=True) - - expected = df.loc[:, ["B", "A"]] - result = store.select("df", columns=["B", "A"]) - tm.assert_frame_equal(result, expected, by_blocks=True) - - # duplicates on both index and columns - with ensure_clean_store(setup_path) as store: - store.append("df", df) - store.append("df", df) - - expected = df.loc[:, ["B", "A"]] - expected = concat([expected, expected]) - result = store.select("df", columns=["B", "A"]) - tm.assert_frame_equal(result, expected, by_blocks=True) - - def test_overwrite_node(self, setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeDataFrame() - ts = tm.makeTimeSeries() - store["a"] = ts - - tm.assert_series_equal(store["a"], ts) - - def test_select(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - with catch_warnings(record=True): - - # select with columns= - df = tm.makeTimeDataFrame() - _maybe_remove(store, "df") - 
store.append("df", df) - result = store.select("df", columns=["A", "B"]) - expected = df.reindex(columns=["A", "B"]) - tm.assert_frame_equal(expected, result) - - # equivalently - result = store.select("df", [("columns=['A', 'B']")]) - expected = df.reindex(columns=["A", "B"]) - tm.assert_frame_equal(expected, result) - - # with a data column - _maybe_remove(store, "df") - store.append("df", df, data_columns=["A"]) - result = store.select("df", ["A > 0"], columns=["A", "B"]) - expected = df[df.A > 0].reindex(columns=["A", "B"]) - tm.assert_frame_equal(expected, result) - - # all a data columns - _maybe_remove(store, "df") - store.append("df", df, data_columns=True) - result = store.select("df", ["A > 0"], columns=["A", "B"]) - expected = df[df.A > 0].reindex(columns=["A", "B"]) - tm.assert_frame_equal(expected, result) - - # with a data column, but different columns - _maybe_remove(store, "df") - store.append("df", df, data_columns=["A"]) - result = store.select("df", ["A > 0"], columns=["C", "D"]) - expected = df[df.A > 0].reindex(columns=["C", "D"]) - tm.assert_frame_equal(expected, result) - - def test_select_dtypes(self, setup_path): - - with ensure_clean_store(setup_path) as store: - # with a Timestamp data column (GH #2637) - df = DataFrame( - { - "ts": bdate_range("2012-01-01", periods=300), - "A": np.random.randn(300), - } - ) - _maybe_remove(store, "df") - store.append("df", df, data_columns=["ts", "A"]) - - result = store.select("df", "ts>=Timestamp('2012-02-01')") - expected = df[df.ts >= Timestamp("2012-02-01")] - tm.assert_frame_equal(expected, result) - - # bool columns (GH #2849) - df = DataFrame(np.random.randn(5, 2), columns=["A", "B"]) - df["object"] = "foo" - df.loc[4:5, "object"] = "bar" - df["boolv"] = df["A"] > 0 - _maybe_remove(store, "df") - store.append("df", df, data_columns=True) - - expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa - for v in [True, "true", 1]: - result = store.select("df", f"boolv == {v}", 
columns=["A", "boolv"]) - tm.assert_frame_equal(expected, result) - - expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa - for v in [False, "false", 0]: - result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) - tm.assert_frame_equal(expected, result) - - # integer index - df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) - _maybe_remove(store, "df_int") - store.append("df_int", df) - result = store.select("df_int", "index<10 and columns=['A']") - expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) - tm.assert_frame_equal(expected, result) - - # float index - df = DataFrame( - { - "A": np.random.rand(20), - "B": np.random.rand(20), - "index": np.arange(20, dtype="f8"), - } - ) - _maybe_remove(store, "df_float") - store.append("df_float", df) - result = store.select("df_float", "index<10.0 and columns=['A']") - expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) - tm.assert_frame_equal(expected, result) - - with ensure_clean_store(setup_path) as store: - - # floats w/o NaN - df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") - df["cols"] = (df["cols"] + 10).apply(str) - - store.append("df1", df, data_columns=True) - result = store.select("df1", where="values>2.0") - expected = df[df["values"] > 2.0] - tm.assert_frame_equal(expected, result) - - # floats with NaN - df.iloc[0] = np.nan - expected = df[df["values"] > 2.0] - - store.append("df2", df, data_columns=True, index=False) - result = store.select("df2", where="values>2.0") - tm.assert_frame_equal(expected, result) - - # https://github.com/PyTables/PyTables/issues/282 - # bug in selection when 0th row has a np.nan and an index - # store.append('df3',df,data_columns=True) - # result = store.select( - # 'df3', where='values>2.0') - # tm.assert_frame_equal(expected, result) - - # not in first position float with NaN ok too - df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") - df["cols"] = 
(df["cols"] + 10).apply(str) - - df.iloc[1] = np.nan - expected = df[df["values"] > 2.0] - - store.append("df4", df, data_columns=True) - result = store.select("df4", where="values>2.0") - tm.assert_frame_equal(expected, result) - - # test selection with comparison against numpy scalar - # GH 11283 - with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() - - expected = df[df["A"] > 0] - - store.append("df", df, data_columns=True) - np_zero = np.float64(0) # noqa - result = store.select("df", where=["A>np_zero"]) - tm.assert_frame_equal(expected, result) - - def test_select_with_many_inputs(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - df = DataFrame( - { - "ts": bdate_range("2012-01-01", periods=300), - "A": np.random.randn(300), - "B": range(300), - "users": ["a"] * 50 - + ["b"] * 50 - + ["c"] * 100 - + [f"a{i:03d}" for i in range(100)], - } - ) - _maybe_remove(store, "df") - store.append("df", df, data_columns=["ts", "A", "B", "users"]) - - # regular select - result = store.select("df", "ts>=Timestamp('2012-02-01')") - expected = df[df.ts >= Timestamp("2012-02-01")] - tm.assert_frame_equal(expected, result) - - # small selector - result = store.select( - "df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']" - ) - expected = df[ - (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"]) - ] - tm.assert_frame_equal(expected, result) - - # big selector along the columns - selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)] - result = store.select( - "df", "ts>=Timestamp('2012-02-01') and users=selector" - ) - expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] - tm.assert_frame_equal(expected, result) - - selector = range(100, 200) - result = store.select("df", "B=selector") - expected = df[df.B.isin(selector)] - tm.assert_frame_equal(expected, result) - assert len(result) == 100 - - # big selector along the index - selector = Index(df.ts[0:100].values) - result = 
store.select("df", "ts=selector") - expected = df[df.ts.isin(selector.values)] - tm.assert_frame_equal(expected, result) - assert len(result) == 100 - - def test_select_iterator(self, setup_path): - - # single table - with ensure_clean_store(setup_path) as store: - - df = tm.makeTimeDataFrame(500) - _maybe_remove(store, "df") - store.append("df", df) - - expected = store.select("df") - - results = list(store.select("df", iterator=True)) - result = concat(results) - tm.assert_frame_equal(expected, result) - - results = list(store.select("df", chunksize=100)) - assert len(results) == 5 - result = concat(results) - tm.assert_frame_equal(expected, result) - - results = list(store.select("df", chunksize=150)) - result = concat(results) - tm.assert_frame_equal(result, expected) - - with ensure_clean_path(setup_path) as path: - - df = tm.makeTimeDataFrame(500) - df.to_hdf(path, "df_non_table") - - msg = "can only use an iterator or chunksize on a table" - with pytest.raises(TypeError, match=msg): - read_hdf(path, "df_non_table", chunksize=100) - - with pytest.raises(TypeError, match=msg): - read_hdf(path, "df_non_table", iterator=True) - - with ensure_clean_path(setup_path) as path: - - df = tm.makeTimeDataFrame(500) - df.to_hdf(path, "df", format="table") - - results = list(read_hdf(path, "df", chunksize=100)) - result = concat(results) - - assert len(results) == 5 - tm.assert_frame_equal(result, df) - tm.assert_frame_equal(result, read_hdf(path, "df")) - - # multiple - - with ensure_clean_store(setup_path) as store: - - df1 = tm.makeTimeDataFrame(500) - store.append("df1", df1, data_columns=True) - df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format) - df2["foo"] = "bar" - store.append("df2", df2) - - df = concat([df1, df2], axis=1) - - # full selection - expected = store.select_as_multiple(["df1", "df2"], selector="df1") - results = list( - store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150) - ) - result = concat(results) - 
tm.assert_frame_equal(expected, result) - - def test_select_iterator_complete_8014(self, setup_path): - - # GH 8014 - # using iterator and where clause - chunksize = 1e4 - - # no iterator - with ensure_clean_store(setup_path) as store: - - expected = tm.makeTimeDataFrame(100064, "S") - _maybe_remove(store, "df") - store.append("df", expected) - - beg_dt = expected.index[0] - end_dt = expected.index[-1] - - # select w/o iteration and no where clause works - result = store.select("df") - tm.assert_frame_equal(expected, result) - - # select w/o iterator and where clause, single term, begin - # of range, works - where = f"index >= '{beg_dt}'" - result = store.select("df", where=where) - tm.assert_frame_equal(expected, result) - - # select w/o iterator and where clause, single term, end - # of range, works - where = f"index <= '{end_dt}'" - result = store.select("df", where=where) - tm.assert_frame_equal(expected, result) - - # select w/o iterator and where clause, inclusive range, - # works - where = f"index >= '{beg_dt}' & index <= '{end_dt}'" - result = store.select("df", where=where) - tm.assert_frame_equal(expected, result) - - # with iterator, full range - with ensure_clean_store(setup_path) as store: - - expected = tm.makeTimeDataFrame(100064, "S") - _maybe_remove(store, "df") - store.append("df", expected) - - beg_dt = expected.index[0] - end_dt = expected.index[-1] - - # select w/iterator and no where clause works - results = list(store.select("df", chunksize=chunksize)) - result = concat(results) - tm.assert_frame_equal(expected, result) - - # select w/iterator and where clause, single term, begin of range - where = f"index >= '{beg_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - tm.assert_frame_equal(expected, result) - - # select w/iterator and where clause, single term, end of range - where = f"index <= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = 
concat(results) - tm.assert_frame_equal(expected, result) - - # select w/iterator and where clause, inclusive range - where = f"index >= '{beg_dt}' & index <= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - tm.assert_frame_equal(expected, result) - - def test_select_iterator_non_complete_8014(self, setup_path): - - # GH 8014 - # using iterator and where clause - chunksize = 1e4 - - # with iterator, non complete range - with ensure_clean_store(setup_path) as store: - - expected = tm.makeTimeDataFrame(100064, "S") - _maybe_remove(store, "df") - store.append("df", expected) - - beg_dt = expected.index[1] - end_dt = expected.index[-2] - - # select w/iterator and where clause, single term, begin of range - where = f"index >= '{beg_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - rexpected = expected[expected.index >= beg_dt] - tm.assert_frame_equal(rexpected, result) - - # select w/iterator and where clause, single term, end of range - where = f"index <= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - rexpected = expected[expected.index <= end_dt] - tm.assert_frame_equal(rexpected, result) - - # select w/iterator and where clause, inclusive range - where = f"index >= '{beg_dt}' & index <= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - rexpected = expected[ - (expected.index >= beg_dt) & (expected.index <= end_dt) - ] - tm.assert_frame_equal(rexpected, result) - - # with iterator, empty where - with ensure_clean_store(setup_path) as store: - - expected = tm.makeTimeDataFrame(100064, "S") - _maybe_remove(store, "df") - store.append("df", expected) - - end_dt = expected.index[-1] - - # select w/iterator and where clause, single term, begin of range - where = f"index > '{end_dt}'" - results = list(store.select("df", where=where, 
chunksize=chunksize)) - assert 0 == len(results) - - def test_select_iterator_many_empty_frames(self, setup_path): - - # GH 8014 - # using iterator and where clause can return many empty - # frames. - chunksize = 10_000 - - # with iterator, range limited to the first chunk - with ensure_clean_store(setup_path) as store: - - expected = tm.makeTimeDataFrame(100000, "S") - _maybe_remove(store, "df") - store.append("df", expected) - - beg_dt = expected.index[0] - end_dt = expected.index[chunksize - 1] - - # select w/iterator and where clause, single term, begin of range - where = f"index >= '{beg_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - rexpected = expected[expected.index >= beg_dt] - tm.assert_frame_equal(rexpected, result) - - # select w/iterator and where clause, single term, end of range - where = f"index <= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - - assert len(results) == 1 - result = concat(results) - rexpected = expected[expected.index <= end_dt] - tm.assert_frame_equal(rexpected, result) - - # select w/iterator and where clause, inclusive range - where = f"index >= '{beg_dt}' & index <= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - - # should be 1, is 10 - assert len(results) == 1 - result = concat(results) - rexpected = expected[ - (expected.index >= beg_dt) & (expected.index <= end_dt) - ] - tm.assert_frame_equal(rexpected, result) - - # select w/iterator and where clause which selects - # *nothing*. - # - # To be consistent with Python idiom I suggest this should - # return [] e.g. `for e in []: print True` never prints - # True. 
- - where = f"index <= '{beg_dt}' & index >= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - - # should be [] - assert len(results) == 0 - - @pytest.mark.filterwarnings( - "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" - ) - def test_retain_index_attributes(self, setup_path): - - # GH 3499, losing frequency info on index recreation - df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} - ) - - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "data") - store.put("data", df, format="table") - - result = store.get("data") - tm.assert_frame_equal(df, result) - - for attr in ["freq", "tz", "name"]: - for idx in ["index", "columns"]: - assert getattr(getattr(df, idx), attr, None) == getattr( - getattr(result, idx), attr, None - ) - - # try to append a table with a different frequency - with catch_warnings(record=True): - df2 = DataFrame( - { - "A": Series( - range(3), index=date_range("2002-1-1", periods=3, freq="D") - ) - } - ) - store.append("data", df2) - - assert store.get_storer("data").info["index"]["freq"] is None - - # this is ok - _maybe_remove(store, "df2") - df2 = DataFrame( - { - "A": Series( - range(3), - index=[ - Timestamp("20010101"), - Timestamp("20010102"), - Timestamp("20020101"), - ], - ) - } - ) - store.append("df2", df2) - df3 = DataFrame( - { - "A": Series( - range(3), index=date_range("2002-1-1", periods=3, freq="D") - ) - } - ) - store.append("df2", df3) - - @pytest.mark.filterwarnings( - "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" - ) - def test_retain_index_attributes2(self, setup_path): - with ensure_clean_path(setup_path) as path: - - with catch_warnings(record=True): - - df = DataFrame( - { - "A": Series( - range(3), index=date_range("2000-1-1", periods=3, freq="H") - ) - } - ) - df.to_hdf(path, "data", mode="w", append=True) - df2 = DataFrame( - { - "A": Series( - range(3), index=date_range("2002-1-1", periods=3, 
freq="D") - ) - } - ) - - df2.to_hdf(path, "data", append=True) - - idx = date_range("2000-1-1", periods=3, freq="H") - idx.name = "foo" - df = DataFrame({"A": Series(range(3), index=idx)}) - df.to_hdf(path, "data", mode="w", append=True) - - assert read_hdf(path, "data").index.name == "foo" - - with catch_warnings(record=True): - - idx2 = date_range("2001-1-1", periods=3, freq="H") - idx2.name = "bar" - df2 = DataFrame({"A": Series(range(3), index=idx2)}) - df2.to_hdf(path, "data", append=True) - - assert read_hdf(path, "data").index.name is None - - def test_frame_select(self, setup_path): - - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - store.put("frame", df, format="table") - date = df.index[len(df) // 2] - - crit1 = Term("index>=date") - assert crit1.env.scope["date"] == date - - crit2 = "columns=['A', 'D']" - crit3 = "columns=A" - - result = store.select("frame", [crit1, crit2]) - expected = df.loc[date:, ["A", "D"]] - tm.assert_frame_equal(result, expected) - - result = store.select("frame", [crit3]) - expected = df.loc[:, ["A"]] - tm.assert_frame_equal(result, expected) - - # invalid terms - df = tm.makeTimeDataFrame() - store.append("df_time", df) - msg = "could not convert string to Timestamp" - with pytest.raises(ValueError, match=msg): - store.select("df_time", "index>0") - - # can't select if not written as table - # store['frame'] = df - # with pytest.raises(ValueError): - # store.select('frame', [crit1, crit2]) - - def test_frame_select_complex(self, setup_path): - # select via complex criteria - - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df.loc[df.index[0:4], "string"] = "bar" - - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table", data_columns=["string"]) - - # empty - result = store.select("df", 'index>df.index[3] & string="bar"') - expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")] - tm.assert_frame_equal(result, expected) - - result = 
store.select("df", 'index>df.index[3] & string="foo"') - expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")] - tm.assert_frame_equal(result, expected) - - # or - result = store.select("df", 'index>df.index[3] | string="bar"') - expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")] - tm.assert_frame_equal(result, expected) - - result = store.select( - "df", '(index>df.index[3] & index<=df.index[6]) | string="bar"' - ) - expected = df.loc[ - ((df.index > df.index[3]) & (df.index <= df.index[6])) - | (df.string == "bar") - ] - tm.assert_frame_equal(result, expected) - - # invert - result = store.select("df", 'string!="bar"') - expected = df.loc[df.string != "bar"] - tm.assert_frame_equal(result, expected) - - # invert not implemented in numexpr :( - msg = "cannot use an invert condition when passing to numexpr" - with pytest.raises(NotImplementedError, match=msg): - store.select("df", '~(string="bar")') - - # invert ok for filters - result = store.select("df", "~(columns=['A','B'])") - expected = df.loc[:, df.columns.difference(["A", "B"])] - tm.assert_frame_equal(result, expected) - - # in - result = store.select("df", "index>df.index[3] & columns in ['A','B']") - expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - def test_frame_select_complex2(self, setup_path): - - with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths: - - pp, hh = paths - - # use non-trivial selection criteria - parms = DataFrame({"A": [1, 1, 2, 2, 3]}) - parms.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"]) - - selection = read_hdf(pp, "df", where="A=[2,3]") - hist = DataFrame( - np.random.randn(25, 1), - columns=["data"], - index=MultiIndex.from_tuples( - [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"] - ), - ) - - hist.to_hdf(hh, "df", mode="w", format="table") - - expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") - - # scope with list like - l = 
selection.index.tolist() # noqa - store = HDFStore(hh) - result = store.select("df", where="l1=l") - tm.assert_frame_equal(result, expected) - store.close() - - result = read_hdf(hh, "df", where="l1=l") - tm.assert_frame_equal(result, expected) - - # index - index = selection.index # noqa - result = read_hdf(hh, "df", where="l1=index") - tm.assert_frame_equal(result, expected) - - result = read_hdf(hh, "df", where="l1=selection.index") - tm.assert_frame_equal(result, expected) - - result = read_hdf(hh, "df", where="l1=selection.index.tolist()") - tm.assert_frame_equal(result, expected) - - result = read_hdf(hh, "df", where="l1=list(selection.index)") - tm.assert_frame_equal(result, expected) - - # scope with index - store = HDFStore(hh) - - result = store.select("df", where="l1=index") - tm.assert_frame_equal(result, expected) - - result = store.select("df", where="l1=selection.index") - tm.assert_frame_equal(result, expected) - - result = store.select("df", where="l1=selection.index.tolist()") - tm.assert_frame_equal(result, expected) - - result = store.select("df", where="l1=list(selection.index)") - tm.assert_frame_equal(result, expected) - - store.close() - - def test_invalid_filtering(self, setup_path): - - # can't use more than one filter (atm) - - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table") - - msg = "unable to collapse Joint Filters" - # not implemented - with pytest.raises(NotImplementedError, match=msg): - store.select("df", "columns=['A'] | columns=['B']") - - # in theory we could deal with this - with pytest.raises(NotImplementedError, match=msg): - store.select("df", "columns=['A','B'] & columns=['C']") - - def test_string_select(self, setup_path): - # GH 2973 - with ensure_clean_store(setup_path) as store: - - df = tm.makeTimeDataFrame() - - # test string ==/!= - df["x"] = "none" - df.loc[df.index[2:7], "x"] = "" - - store.append("df", df, data_columns=["x"]) - - result = 
store.select("df", "x=none") - expected = df[df.x == "none"] - tm.assert_frame_equal(result, expected) - - result = store.select("df", "x!=none") - expected = df[df.x != "none"] - tm.assert_frame_equal(result, expected) - - df2 = df.copy() - df2.loc[df2.x == "", "x"] = np.nan - - store.append("df2", df2, data_columns=["x"]) - result = store.select("df2", "x!=none") - expected = df2[isna(df2.x)] - tm.assert_frame_equal(result, expected) - - # int ==/!= - df["int"] = 1 - df.loc[df.index[2:7], "int"] = 2 - - store.append("df3", df, data_columns=["int"]) - - result = store.select("df3", "int=2") - expected = df[df.int == 2] - tm.assert_frame_equal(result, expected) - - result = store.select("df3", "int!=2") - expected = df[df.int != 2] - tm.assert_frame_equal(result, expected) - - def test_read_column(self, setup_path): - - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df") - - # GH 17912 - # HDFStore.select_column should raise a KeyError - # exception if the key is not a valid store - with pytest.raises(KeyError, match="No object named df in the file"): - store.select_column("df", "index") - - store.append("df", df) - # error - with pytest.raises( - KeyError, match=re.escape("'column [foo] not found in the table'") - ): - store.select_column("df", "foo") - - msg = re.escape( - "select_column() got an unexpected keyword argument 'where'" - ) - with pytest.raises(TypeError, match=msg): - store.select_column("df", "index", where=["index>5"]) - - # valid - result = store.select_column("df", "index") - tm.assert_almost_equal(result.values, Series(df.index).values) - assert isinstance(result, Series) - - # not a data indexable column - msg = re.escape( - "column [values_block_0] can not be extracted individually; " - "it is not data indexable" - ) - with pytest.raises(ValueError, match=msg): - store.select_column("df", "values_block_0") - - # a data column - df2 = df.copy() - df2["string"] = "foo" - store.append("df2", 
df2, data_columns=["string"]) - result = store.select_column("df2", "string") - tm.assert_almost_equal(result.values, df2["string"].values) - - # a data column with NaNs, result excludes the NaNs - df3 = df.copy() - df3["string"] = "foo" - df3.loc[df3.index[4:6], "string"] = np.nan - store.append("df3", df3, data_columns=["string"]) - result = store.select_column("df3", "string") - tm.assert_almost_equal(result.values, df3["string"].values) - - # start/stop - result = store.select_column("df3", "string", start=2) - tm.assert_almost_equal(result.values, df3["string"].values[2:]) - - result = store.select_column("df3", "string", start=-2) - tm.assert_almost_equal(result.values, df3["string"].values[-2:]) - - result = store.select_column("df3", "string", stop=2) - tm.assert_almost_equal(result.values, df3["string"].values[:2]) - - result = store.select_column("df3", "string", stop=-2) - tm.assert_almost_equal(result.values, df3["string"].values[:-2]) - - result = store.select_column("df3", "string", start=2, stop=-2) - tm.assert_almost_equal(result.values, df3["string"].values[2:-2]) - - result = store.select_column("df3", "string", start=-2, stop=2) - tm.assert_almost_equal(result.values, df3["string"].values[-2:2]) - - # GH 10392 - make sure column name is preserved - df4 = DataFrame({"A": np.random.randn(10), "B": "foo"}) - store.append("df4", df4, data_columns=True) - expected = df4["B"] - result = store.select_column("df4", "B") - tm.assert_series_equal(result, expected) - - def test_coordinates(self, setup_path): - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - - _maybe_remove(store, "df") - store.append("df", df) - - # all - c = store.select_as_coordinates("df") - assert (c.values == np.arange(len(df.index))).all() - - # get coordinates back & test vs frame - _maybe_remove(store, "df") - - df = DataFrame({"A": range(5), "B": range(5)}) - store.append("df", df) - c = store.select_as_coordinates("df", ["index<3"]) - assert 
(c.values == np.arange(3)).all() - result = store.select("df", where=c) - expected = df.loc[0:2, :] - tm.assert_frame_equal(result, expected) - - c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) - assert (c.values == np.arange(2) + 3).all() - result = store.select("df", where=c) - expected = df.loc[3:4, :] - tm.assert_frame_equal(result, expected) - assert isinstance(c, Index) - - # multiple tables - _maybe_remove(store, "df1") - _maybe_remove(store, "df2") - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - store.append("df1", df1, data_columns=["A", "B"]) - store.append("df2", df2) - - c = store.select_as_coordinates("df1", ["A>0", "B>0"]) - df1_result = store.select("df1", c) - df2_result = store.select("df2", c) - result = concat([df1_result, df2_result], axis=1) - - expected = concat([df1, df2], axis=1) - expected = expected[(expected.A > 0) & (expected.B > 0)] - tm.assert_frame_equal(result, expected) - - # pass array/mask as the coordinates - with ensure_clean_store(setup_path) as store: - - df = DataFrame( - np.random.randn(1000, 2), index=date_range("20000101", periods=1000) - ) - store.append("df", df) - c = store.select_column("df", "index") - where = c[DatetimeIndex(c).month == 5].index - expected = df.iloc[where] - - # locations - result = store.select("df", where=where) - tm.assert_frame_equal(result, expected) - - # boolean - result = store.select("df", where=where) - tm.assert_frame_equal(result, expected) - - # invalid - msg = "cannot process expression" - with pytest.raises(ValueError, match=msg): - store.select("df", where=np.arange(len(df), dtype="float64")) - - with pytest.raises(ValueError, match=msg): - store.select("df", where=np.arange(len(df) + 1)) - - with pytest.raises(ValueError, match=msg): - store.select("df", where=np.arange(len(df)), start=5) - - with pytest.raises(ValueError, match=msg): - store.select("df", where=np.arange(len(df)), start=5, stop=10) - - # selection with 
filter - selection = date_range("20000101", periods=500) - result = store.select("df", where="index in selection") - expected = df[df.index.isin(selection)] - tm.assert_frame_equal(result, expected) - - # list - df = DataFrame(np.random.randn(10, 2)) - store.append("df2", df) - result = store.select("df2", where=[0, 3, 5]) - expected = df.iloc[[0, 3, 5]] - tm.assert_frame_equal(result, expected) - - # boolean - where = [True] * 10 - where[-2] = False - result = store.select("df2", where=where) - expected = df.loc[where] - tm.assert_frame_equal(result, expected) - - # start/stop - result = store.select("df2", start=5, stop=10) - expected = df[5:10] - tm.assert_frame_equal(result, expected) - - def test_append_to_multiple(self, setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df2["foo"] = "bar" - df = concat([df1, df2], axis=1) - - with ensure_clean_store(setup_path) as store: - - # exceptions - msg = "append_to_multiple requires a selector that is in passed dict" - with pytest.raises(ValueError, match=msg): - store.append_to_multiple( - {"df1": ["A", "B"], "df2": None}, df, selector="df3" - ) - - with pytest.raises(ValueError, match=msg): - store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3") - - msg = ( - "append_to_multiple must have a dictionary specified as the way to " - "split the value" - ) - with pytest.raises(ValueError, match=msg): - store.append_to_multiple("df1", df, "df1") - - # regular operation - store.append_to_multiple( - {"df1": ["A", "B"], "df2": None}, df, selector="df1" - ) - result = store.select_as_multiple( - ["df1", "df2"], where=["A>0", "B>0"], selector="df1" - ) - expected = df[(df.A > 0) & (df.B > 0)] - tm.assert_frame_equal(result, expected) - - def test_append_to_multiple_dropna(self, setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan - df = 
concat([df1, df2], axis=1) - - with ensure_clean_store(setup_path) as store: - - # dropna=True should guarantee rows are synchronized - store.append_to_multiple( - {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True - ) - result = store.select_as_multiple(["df1", "df2"]) - expected = df.dropna() - tm.assert_frame_equal(result, expected) - tm.assert_index_equal(store.select("df1").index, store.select("df2").index) - - @pytest.mark.xfail( - run=False, reason="append_to_multiple_dropna_false is not raising as failed" - ) - def test_append_to_multiple_dropna_false(self, setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan - df = concat([df1, df2], axis=1) - - with ensure_clean_store(setup_path) as store: - - # dropna=False shouldn't synchronize row indexes - store.append_to_multiple( - {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False - ) - - # TODO Update error message to desired message for this case - msg = "Cannot select as multiple after appending with dropna=False" - with pytest.raises(ValueError, match=msg): - store.select_as_multiple(["df1a", "df2a"]) - - assert not store.select("df1a").index.equals(store.select("df2a").index) - - def test_append_to_multiple_min_itemsize(self, setup_path): - # GH 11238 - df = DataFrame( - { - "IX": np.arange(1, 21), - "Num": np.arange(1, 21), - "BigNum": np.arange(1, 21) * 88, - "Str": ["a" for _ in range(20)], - "LongStr": ["abcde" for _ in range(20)], - } - ) - expected = df.iloc[[0]] - - with ensure_clean_store(setup_path) as store: - store.append_to_multiple( - { - "index": ["IX"], - "nums": ["Num", "BigNum"], - "strs": ["Str", "LongStr"], - }, - df.iloc[[0]], - "index", - min_itemsize={"Str": 10, "LongStr": 100, "Num": 2}, - ) - result = store.select_as_multiple(["index", "nums", "strs"]) - tm.assert_frame_equal(result, expected) - - def test_select_as_multiple(self, setup_path): 
- - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df2["foo"] = "bar" - - with ensure_clean_store(setup_path) as store: - - msg = "keys must be a list/tuple" - # no tables stored - with pytest.raises(TypeError, match=msg): - store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") - - store.append("df1", df1, data_columns=["A", "B"]) - store.append("df2", df2) - - # exceptions - with pytest.raises(TypeError, match=msg): - store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") - - with pytest.raises(TypeError, match=msg): - store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") - - msg = "'No object named df3 in the file'" - with pytest.raises(KeyError, match=msg): - store.select_as_multiple( - ["df1", "df3"], where=["A>0", "B>0"], selector="df1" - ) - - with pytest.raises(KeyError, match=msg): - store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") - - with pytest.raises(KeyError, match="'No object named df4 in the file'"): - store.select_as_multiple( - ["df1", "df2"], where=["A>0", "B>0"], selector="df4" - ) - - # default select - result = store.select("df1", ["A>0", "B>0"]) - expected = store.select_as_multiple( - ["df1"], where=["A>0", "B>0"], selector="df1" - ) - tm.assert_frame_equal(result, expected) - expected = store.select_as_multiple( - "df1", where=["A>0", "B>0"], selector="df1" - ) - tm.assert_frame_equal(result, expected) - - # multiple - result = store.select_as_multiple( - ["df1", "df2"], where=["A>0", "B>0"], selector="df1" - ) - expected = concat([df1, df2], axis=1) - expected = expected[(expected.A > 0) & (expected.B > 0)] - tm.assert_frame_equal(result, expected) - - # multiple (diff selector) - result = store.select_as_multiple( - ["df1", "df2"], where="index>df2.index[4]", selector="df2" - ) - expected = concat([df1, df2], axis=1) - expected = expected[5:] - tm.assert_frame_equal(result, expected) - - # test exception for diff rows - 
store.append("df3", tm.makeTimeDataFrame(nper=50)) - msg = "all tables must have exactly the same nrows!" - with pytest.raises(ValueError, match=msg): - store.select_as_multiple( - ["df1", "df3"], where=["A>0", "B>0"], selector="df1" - ) - - @pytest.mark.skipif( - LooseVersion(tables.__version__) < LooseVersion("3.1.0"), - reason=("tables version does not support fix for nan selection bug: GH 4858"), - ) - def test_nan_selection_bug_4858(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64") - df["cols"] = (df["cols"] + 10).apply(str) - df.iloc[0] = np.nan - - expected = DataFrame( - {"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]}, - index=[3, 4, 5], - ) - - # write w/o the index on that particular column - store.append("df", df, data_columns=True, index=["cols"]) - result = store.select("df", where="values>2.0") - tm.assert_frame_equal(result, expected) - - def test_start_stop_table(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - # table - df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) - store.append("df", df) - - result = store.select("df", "columns=['A']", start=0, stop=5) - expected = df.loc[0:4, ["A"]] - tm.assert_frame_equal(result, expected) - - # out of range - result = store.select("df", "columns=['A']", start=30, stop=40) - assert len(result) == 0 - expected = df.loc[30:40, ["A"]] - tm.assert_frame_equal(result, expected) - - def test_start_stop_multiple(self, setup_path): - - # GH 16209 - with ensure_clean_store(setup_path) as store: - - df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) - - store.append_to_multiple( - {"selector": ["foo"], "data": None}, df, selector="selector" - ) - result = store.select_as_multiple( - ["selector", "data"], selector="selector", start=0, stop=1 - ) - expected = df.loc[[0], ["foo", "bar"]] - tm.assert_frame_equal(result, expected) + types_should_fail = [ + tm.makeIntIndex, + 
tm.makeFloatIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ] + types_should_run = [ + tm.makeStringIndex, + tm.makeCategoricalIndex, + tm.makeUnicodeIndex, + ] - def test_start_stop_fixed(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - # fixed, GH 8287 - df = DataFrame( - {"A": np.random.rand(20), "B": np.random.rand(20)}, - index=pd.date_range("20130101", periods=20), - ) - store.put("df", df) - - result = store.select("df", start=0, stop=5) - expected = df.iloc[0:5, :] - tm.assert_frame_equal(result, expected) - - result = store.select("df", start=5, stop=10) - expected = df.iloc[5:10, :] - tm.assert_frame_equal(result, expected) - - # out of range - result = store.select("df", start=30, stop=40) - expected = df.iloc[30:40, :] - tm.assert_frame_equal(result, expected) - - # series - s = df.A - store.put("s", s) - result = store.select("s", start=0, stop=5) - expected = s.iloc[0:5] - tm.assert_series_equal(result, expected) - - result = store.select("s", start=5, stop=10) - expected = s.iloc[5:10] - tm.assert_series_equal(result, expected) - - # sparse; not implemented - df = tm.makeDataFrame() - df.iloc[3:5, 1:3] = np.nan - df.iloc[8:10, -2] = np.nan - - def test_select_filter_corner(self, setup_path): - - df = DataFrame(np.random.randn(50, 100)) - df.index = [f"{c:3d}" for c in df.index] - df.columns = [f"{c:3d}" for c in df.columns] - - with ensure_clean_store(setup_path) as store: - store.put("frame", df, format="table") - - crit = "columns=df.columns[:75]" - result = store.select("frame", [crit]) - tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) - - crit = "columns=df.columns[:75:2]" - result = store.select("frame", [crit]) - tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) - - def test_path_pathlib(self, setup_path): - df = tm.makeDataFrame() - - result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") - ) - tm.assert_frame_equal(df, result) - - 
@pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)]) - def test_contiguous_mixed_data_table(self, start, stop, setup_path): - # GH 17021 - # ValueError when reading a contiguous mixed-data table ft. VLArray - df = DataFrame( - { - "a": Series([20111010, 20111011, 20111012]), - "b": Series(["ab", "cd", "ab"]), - } - ) - - with ensure_clean_store(setup_path) as store: - store.append("test_dataset", df) - - result = store.select("test_dataset", start=start, stop=stop) - tm.assert_frame_equal(df[start:stop], result) - - def test_path_pathlib_hdfstore(self, setup_path): - df = tm.makeDataFrame() - - def writer(path): - with HDFStore(path) as store: - df.to_hdf(store, "df") - - def reader(path): - with HDFStore(path) as store: - return pd.read_hdf(store, "df") - - result = tm.round_trip_pathlib(writer, reader) - tm.assert_frame_equal(df, result) - - def test_pickle_path_localpath(self, setup_path): - df = tm.makeDataFrame() - result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") - ) - tm.assert_frame_equal(df, result) - - def test_path_localpath_hdfstore(self, setup_path): - df = tm.makeDataFrame() - - def writer(path): - with HDFStore(path) as store: - df.to_hdf(store, "df") - - def reader(path): - with HDFStore(path) as store: - return pd.read_hdf(store, "df") - - result = tm.round_trip_localpath(writer, reader) - tm.assert_frame_equal(df, result) - - def _check_roundtrip(self, obj, comparator, path, compression=False, **kwargs): - - options = {} - if compression: - options["complib"] = _default_compressor - - with ensure_clean_store(path, "w", **options) as store: - store["obj"] = obj - retrieved = store["obj"] - comparator(retrieved, obj, **kwargs) - - def _check_double_roundtrip( - self, obj, comparator, path, compression=False, **kwargs - ): - options = {} - if compression: - options["complib"] = compression or _default_compressor - - with ensure_clean_store(path, "w", **options) as store: - store["obj"] = 
obj - retrieved = store["obj"] - comparator(retrieved, obj, **kwargs) - store["obj"] = retrieved - again = store["obj"] - comparator(again, obj, **kwargs) - - def _check_roundtrip_table(self, obj, comparator, path, compression=False): - options = {} - if compression: - options["complib"] = _default_compressor - - with ensure_clean_store(path, "w", **options) as store: - store.put("obj", obj, format="table") - retrieved = store["obj"] - - comparator(retrieved, obj) - - def test_multiple_open_close(self, setup_path): - # gh-4409: open & close multiple times - - with ensure_clean_path(setup_path) as path: - - df = tm.makeDataFrame() - df.to_hdf(path, "df", mode="w", format="table") - - # single - store = HDFStore(path) - assert "CLOSED" not in store.info() - assert store.is_open - - store.close() - assert "CLOSED" in store.info() - assert not store.is_open - - with ensure_clean_path(setup_path) as path: - - if pytables._table_file_open_policy_is_strict: - # multiples - store1 = HDFStore(path) - msg = ( - r"The file [\S]* is already opened\. Please close it before " - r"reopening in write mode\." 
- ) - with pytest.raises(ValueError, match=msg): - HDFStore(path) - - store1.close() - else: - - # multiples - store1 = HDFStore(path) - store2 = HDFStore(path) - - assert "CLOSED" not in store1.info() - assert "CLOSED" not in store2.info() - assert store1.is_open - assert store2.is_open - - store1.close() - assert "CLOSED" in store1.info() - assert not store1.is_open - assert "CLOSED" not in store2.info() - assert store2.is_open - - store2.close() - assert "CLOSED" in store1.info() - assert "CLOSED" in store2.info() - assert not store1.is_open - assert not store2.is_open - - # nested close - store = HDFStore(path, mode="w") - store.append("df", df) - - store2 = HDFStore(path) - store2.append("df2", df) - store2.close() - assert "CLOSED" in store2.info() - assert not store2.is_open - - store.close() - assert "CLOSED" in store.info() - assert not store.is_open - - # double closing - store = HDFStore(path, mode="w") - store.append("df", df) - - store2 = HDFStore(path) - store.close() - assert "CLOSED" in store.info() - assert not store.is_open - - store2.close() - assert "CLOSED" in store2.info() - assert not store2.is_open - - # ops on a closed store + for index in types_should_fail: + df = DataFrame(np.random.randn(10, 2), columns=index(2)) with ensure_clean_path(setup_path) as path: - - df = tm.makeDataFrame() - df.to_hdf(path, "df", mode="w", format="table") - - store = HDFStore(path) - store.close() - - msg = r"[\S]* file is not open!" 
- with pytest.raises(ClosedFileError, match=msg): - store.keys() - - with pytest.raises(ClosedFileError, match=msg): - "df" in store - - with pytest.raises(ClosedFileError, match=msg): - len(store) - - with pytest.raises(ClosedFileError, match=msg): - store["df"] - - with pytest.raises(ClosedFileError, match=msg): - store.select("df") - - with pytest.raises(ClosedFileError, match=msg): - store.get("df") - - with pytest.raises(ClosedFileError, match=msg): - store.append("df2", df) - - with pytest.raises(ClosedFileError, match=msg): - store.put("df3", df) - - with pytest.raises(ClosedFileError, match=msg): - store.get_storer("df2") - - with pytest.raises(ClosedFileError, match=msg): - store.remove("df2") - - with pytest.raises(ClosedFileError, match=msg): - store.select("df") - - msg = "'HDFStore' object has no attribute 'df'" - with pytest.raises(AttributeError, match=msg): - store.df - - def test_pytables_native_read(self, datapath, setup_path): - with ensure_clean_store( - datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" - ) as store: - d2 = store["detector/readout"] - assert isinstance(d2, DataFrame) - - @pytest.mark.skipif( - is_platform_windows(), reason="native2 read fails oddly on windows" - ) - def test_pytables_native2_read(self, datapath, setup_path): - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r" - ) as store: - str(store) - d1 = store["detector"] - assert isinstance(d1, DataFrame) - - def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path): - # GH 24510 - # legacy table with fixed format written in Python 2 - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" - ) as store: - result = store.select("df") - expected = DataFrame( - [[1, 2, 3, "D"]], - columns=["A", "B", "C", "D"], - index=Index(["ABC"], name="INDEX_NAME"), - ) - tm.assert_frame_equal(expected, result) - - def 
test_legacy_table_fixed_format_read_datetime_py2(self, datapath, setup_path): - # GH 31750 - # legacy table with fixed format and datetime64 column written in Python 2 - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"), - mode="r", - ) as store: - result = store.select("df") - expected = DataFrame( - [[Timestamp("2020-02-06T18:00")]], - columns=["A"], - index=Index(["date"]), - ) - tm.assert_frame_equal(expected, result) - - def test_legacy_table_read_py2(self, datapath, setup_path): - # issue: 24925 - # legacy table written in Python 2 - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r" - ) as store: - result = store.select("table") - - expected = DataFrame({"a": ["a", "b"], "b": [2, 3]}) - tm.assert_frame_equal(expected, result) - - def test_copy(self, setup_path): - - with catch_warnings(record=True): - - def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): - try: - store = HDFStore(f, "r") - - if new_f is None: - import tempfile - - fd, new_f = tempfile.mkstemp() - tstore = store.copy( - new_f, keys=keys, propindexes=propindexes, **kwargs - ) - - # check keys - if keys is None: - keys = store.keys() - assert set(keys) == set(tstore.keys()) - - # check indices & nrows - for k in tstore.keys(): - if tstore.get_storer(k).is_table: - new_t = tstore.get_storer(k) - orig_t = store.get_storer(k) - - assert orig_t.nrows == new_t.nrows - - # check propindixes - if propindexes: - for a in orig_t.axes: - if a.is_indexed: - assert new_t[a.name].is_indexed - - finally: - safe_close(store) - safe_close(tstore) - try: - os.close(fd) - except (OSError, ValueError): - pass - os.remove(new_f) - - # new table - df = tm.makeDataFrame() - - with tm.ensure_clean() as path: - st = HDFStore(path) - st.append("df", df, data_columns=["A"]) - st.close() - do_copy(f=path) - do_copy(f=path, propindexes=False) - - def test_store_datetime_fractional_secs(self, setup_path): - - 
with ensure_clean_store(setup_path) as store: - dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) - series = Series([0], [dt]) - store["a"] = series - assert store["a"].index[0] == dt - - def test_tseries_indices_series(self, setup_path): - - with ensure_clean_store(setup_path) as store: - idx = tm.makeDateIndex(10) - ser = Series(np.random.randn(len(idx)), idx) - store["a"] = ser - result = store["a"] - - tm.assert_series_equal(result, ser) - assert result.index.freq == ser.index.freq - tm.assert_class_equal(result.index, ser.index, obj="series index") - - idx = tm.makePeriodIndex(10) - ser = Series(np.random.randn(len(idx)), idx) - store["a"] = ser - result = store["a"] - - tm.assert_series_equal(result, ser) - assert result.index.freq == ser.index.freq - tm.assert_class_equal(result.index, ser.index, obj="series index") - - def test_tseries_indices_frame(self, setup_path): - - with ensure_clean_store(setup_path) as store: - idx = tm.makeDateIndex(10) - df = DataFrame(np.random.randn(len(idx), 3), index=idx) - store["a"] = df - result = store["a"] - - tm.assert_frame_equal(result, df) - assert result.index.freq == df.index.freq - tm.assert_class_equal(result.index, df.index, obj="dataframe index") - - idx = tm.makePeriodIndex(10) - df = DataFrame(np.random.randn(len(idx), 3), idx) - store["a"] = df - result = store["a"] - - tm.assert_frame_equal(result, df) - assert result.index.freq == df.index.freq - tm.assert_class_equal(result.index, df.index, obj="dataframe index") - - def test_unicode_index(self, setup_path): - - unicode_values = ["\u03c3", "\u03c3\u03c3"] - - # PerformanceWarning - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - s = Series(np.random.randn(len(unicode_values)), unicode_values) - self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) - - def test_unicode_longer_encoded(self, setup_path): - # GH 11234 - char = "\u0394" - df = DataFrame({"A": [char]}) - with ensure_clean_store(setup_path) 
as store: - store.put("df", df, format="table", encoding="utf-8") - result = store.get("df") - tm.assert_frame_equal(result, df) - - df = DataFrame({"A": ["a", char], "B": ["b", "b"]}) - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table", encoding="utf-8") - result = store.get("df") - tm.assert_frame_equal(result, df) - - def test_store_datetime_mixed(self, setup_path): - - df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) - ts = tm.makeTimeSeries() - df["d"] = ts.index[:3] - self._check_roundtrip(df, tm.assert_frame_equal, path=setup_path) - - # FIXME: don't leave commented-out code - # def test_cant_write_multiindex_table(self): - # # for now, #1848 - # df = DataFrame(np.random.randn(10, 4), - # index=[np.arange(5).repeat(2), - # np.tile(np.arange(2), 5)]) - # - # with pytest.raises(Exception): - # store.put('foo', df, format='table') - - def test_append_with_diff_col_name_types_raises_value_error(self, setup_path): - df = DataFrame(np.random.randn(10, 1)) - df2 = DataFrame({"a": np.random.randn(10)}) - df3 = DataFrame({(1, 2): np.random.randn(10)}) - df4 = DataFrame({("1", 2): np.random.randn(10)}) - df5 = DataFrame({("1", 2, object): np.random.randn(10)}) - - with ensure_clean_store(setup_path) as store: - name = f"df_{tm.rands(10)}" - store.append(name, df) - - for d in (df2, df3, df4, df5): - msg = re.escape( - "cannot match existing table structure for [0] on appending data" - ) + with catch_warnings(record=True): + msg = "cannot have non-object label DataIndexableCol" with pytest.raises(ValueError, match=msg): - store.append(name, d) - - def test_query_with_nested_special_character(self, setup_path): - df = DataFrame( - { - "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"], - "b": [1, 2, 3, 4, 5, 6, 7, 8], - } - ) - expected = df[df.a == "test & test"] - with ensure_clean_store(setup_path) as store: - store.append("test", df, format="table", data_columns=True) - result = store.select("test", 
'a = "test & test"') - tm.assert_frame_equal(expected, result) - - def test_categorical(self, setup_path): - - with ensure_clean_store(setup_path) as store: - - # Basic - _maybe_remove(store, "s") - s = Series( - Categorical( - ["a", "b", "b", "a", "a", "c"], - categories=["a", "b", "c", "d"], - ordered=False, - ) - ) - store.append("s", s, format="table") - result = store.select("s") - tm.assert_series_equal(s, result) - - _maybe_remove(store, "s_ordered") - s = Series( - Categorical( - ["a", "b", "b", "a", "a", "c"], - categories=["a", "b", "c", "d"], - ordered=True, - ) - ) - store.append("s_ordered", s, format="table") - result = store.select("s_ordered") - tm.assert_series_equal(s, result) - - _maybe_remove(store, "df") - df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) - store.append("df", df, format="table") - result = store.select("df") - tm.assert_frame_equal(result, df) - - # Dtypes - _maybe_remove(store, "si") - s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category") - store.append("si", s) - result = store.select("si") - tm.assert_series_equal(result, s) - - _maybe_remove(store, "si2") - s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category") - store.append("si2", s) - result = store.select("si2") - tm.assert_series_equal(result, s) - - # Multiple - _maybe_remove(store, "df2") - df2 = df.copy() - df2["s2"] = Series(list("abcdefg")).astype("category") - store.append("df2", df2) - result = store.select("df2") - tm.assert_frame_equal(result, df2) - - # Make sure the metadata is OK - info = store.info() - assert "/df2 " in info - # assert '/df2/meta/values_block_0/meta' in info - assert "/df2/meta/values_block_1/meta" in info - - # unordered - _maybe_remove(store, "s2") - s = Series( - Categorical( - ["a", "b", "b", "a", "a", "c"], - categories=["a", "b", "c", "d"], - ordered=False, - ) - ) - store.append("s2", s, format="table") - result = store.select("s2") - tm.assert_series_equal(result, s) - - # Query - _maybe_remove(store, "df3") - 
store.append("df3", df, data_columns=["s"]) - expected = df[df.s.isin(["b", "c"])] - result = store.select("df3", where=['s in ["b","c"]']) - tm.assert_frame_equal(result, expected) - - expected = df[df.s.isin(["b", "c"])] - result = store.select("df3", where=['s = ["b","c"]']) - tm.assert_frame_equal(result, expected) - - expected = df[df.s.isin(["d"])] - result = store.select("df3", where=['s in ["d"]']) - tm.assert_frame_equal(result, expected) - - expected = df[df.s.isin(["f"])] - result = store.select("df3", where=['s in ["f"]']) - tm.assert_frame_equal(result, expected) - - # Appending with same categories is ok - store.append("df3", df) - - df = concat([df, df]) - expected = df[df.s.isin(["b", "c"])] - result = store.select("df3", where=['s in ["b","c"]']) - tm.assert_frame_equal(result, expected) - - # Appending must have the same categories - df3 = df.copy() - df3["s"] = df3["s"].cat.remove_unused_categories() - - msg = ( - "cannot append a categorical with different categories to the existing" - ) - with pytest.raises(ValueError, match=msg): - store.append("df3", df3) - - # Remove, and make sure meta data is removed (its a recursive - # removal so should be). - result = store.select("df3/meta/s/meta") - assert result is not None - store.remove("df3") - - with pytest.raises( - KeyError, match="'No object named df3/meta/s/meta in the file'" - ): - store.select("df3/meta/s/meta") - - def test_categorical_conversion(self, setup_path): - - # GH13322 - # Check that read_hdf with categorical columns doesn't return rows if - # where criteria isn't met. 
- obsids = ["ESP_012345_6789", "ESP_987654_3210"] - imgids = ["APF00006np", "APF0001imm"] - data = [4.3, 9.8] - - # Test without categories - df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data}) - - # We are expecting an empty DataFrame matching types of df - expected = df.iloc[[], :] - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df", where="obsids=B") - tm.assert_frame_equal(result, expected) - - # Test with categories - df.obsids = df.obsids.astype("category") - df.imgids = df.imgids.astype("category") - - # We are expecting an empty DataFrame matching types of df - expected = df.iloc[[], :] - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df", where="obsids=B") - tm.assert_frame_equal(result, expected) - - def test_categorical_nan_only_columns(self, setup_path): - # GH18413 - # Check that read_hdf with categorical columns with NaN-only values can - # be read back. 
- df = DataFrame( - { - "a": ["a", "b", "c", np.nan], - "b": [np.nan, np.nan, np.nan, np.nan], - "c": [1, 2, 3, 4], - "d": Series([None] * 4, dtype=object), - } - ) - df["a"] = df.a.astype("category") - df["b"] = df.b.astype("category") - df["d"] = df.b.astype("category") - expected = df - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df") - tm.assert_frame_equal(result, expected) - - def test_duplicate_column_name(self, setup_path): - df = DataFrame(columns=["a", "a"], data=[[0, 0]]) - - with ensure_clean_path(setup_path) as path: - msg = "Columns index has to be unique for fixed format" - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", format="fixed") - - df.to_hdf(path, "df", format="table") - other = read_hdf(path, "df") - - tm.assert_frame_equal(df, other) - assert df.equals(other) - assert other.equals(df) - - def test_round_trip_equals(self, setup_path): - # GH 9330 - df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", format="table") - other = read_hdf(path, "df") - tm.assert_frame_equal(df, other) - assert df.equals(other) - assert other.equals(df) - - def test_preserve_timedeltaindex_type(self, setup_path): - # GH9635 - # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve - # the type of the index. - df = DataFrame(np.random.normal(size=(10, 5))) - df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example") - - with ensure_clean_store(setup_path) as store: - - store["df"] = df - tm.assert_frame_equal(store["df"], df) - - def test_columns_multiindex_modified(self, setup_path): - # BUG: 7212 - # read_hdf store.select modified the passed columns parameters - # when multi-indexed. 
- - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - df.index.name = "letters" - df = df.set_index(keys="E", append=True) - - data_columns = df.index.names + df.columns.tolist() - with ensure_clean_path(setup_path) as path: - df.to_hdf( - path, - "df", - mode="a", - append=True, - data_columns=data_columns, - index=False, - ) - cols2load = list("BCD") - cols2load_original = list(cols2load) - df_loaded = read_hdf(path, "df", columns=cols2load) # noqa - assert cols2load_original == cols2load - - @ignore_natural_naming_warning - def test_to_hdf_with_object_column_names(self, setup_path): - # GH9057 - # Writing HDF5 table format should only work for string-like - # column types - - types_should_fail = [ - tm.makeIntIndex, - tm.makeFloatIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, - ] - types_should_run = [ - tm.makeStringIndex, - tm.makeCategoricalIndex, - tm.makeUnicodeIndex, - ] - - for index in types_should_fail: - df = DataFrame(np.random.randn(10, 2), columns=index(2)) - with ensure_clean_path(setup_path) as path: - with catch_warnings(record=True): - msg = "cannot have non-object label DataIndexableCol" - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", format="table", data_columns=True) - - for index in types_should_run: - df = DataFrame(np.random.randn(10, 2), columns=index(2)) - with ensure_clean_path(setup_path) as path: - with catch_warnings(record=True): df.to_hdf(path, "df", format="table", data_columns=True) - result = pd.read_hdf(path, "df", where=f"index = [{df.index[0]}]") - assert len(result) - - def test_read_hdf_open_store(self, setup_path): - # GH10330 - # No check for non-string path_or-buf, and no test of open store - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - df.index.name = "letters" - df = df.set_index(keys="E", append=True) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", mode="w") - direct = 
read_hdf(path, "df") - store = HDFStore(path, mode="r") - indirect = read_hdf(store, "df") - tm.assert_frame_equal(direct, indirect) - assert store.is_open - store.close() - - def test_read_hdf_iterator(self, setup_path): - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - df.index.name = "letters" - df = df.set_index(keys="E", append=True) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", mode="w", format="t") - direct = read_hdf(path, "df") - iterator = read_hdf(path, "df", iterator=True) - assert isinstance(iterator, TableIterator) - indirect = next(iterator.__iter__()) - tm.assert_frame_equal(direct, indirect) - iterator.store.close() - - def test_read_hdf_errors(self, setup_path): - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - - with ensure_clean_path(setup_path) as path: - msg = r"File [\S]* does not exist" - with pytest.raises(IOError, match=msg): - read_hdf(path, "key") - - df.to_hdf(path, "df") - store = HDFStore(path, mode="r") - store.close() - - msg = "The HDFStore must be open for reading." - with pytest.raises(IOError, match=msg): - read_hdf(store, "df") - - def test_read_hdf_generic_buffer_errors(self): - msg = "Support for generic buffers has not been implemented." - with pytest.raises(NotImplementedError, match=msg): - read_hdf(BytesIO(b""), "df") - - def test_invalid_complib(self, setup_path): - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - with tm.ensure_clean(setup_path) as path: - msg = r"complib only supports \[.*\] compression." - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", complib="foolib") - - # GH10443 - - def test_read_nokey(self, setup_path): - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - - # Categorical dtype not supported for "fixed" format. So no need - # to test with that dtype in the dataframe here. 
- with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", mode="a") - reread = read_hdf(path) - tm.assert_frame_equal(df, reread) - df.to_hdf(path, "df2", mode="a") - - msg = "key must be provided when HDF5 file contains multiple datasets." - with pytest.raises(ValueError, match=msg): - read_hdf(path) - - def test_read_nokey_table(self, setup_path): - # GH13231 - df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")}) - - with ensure_clean_path(setup_path) as path: - df.to_hdf(path, "df", mode="a", format="table") - reread = read_hdf(path) - tm.assert_frame_equal(df, reread) - df.to_hdf(path, "df2", mode="a", format="table") - - msg = "key must be provided when HDF5 file contains multiple datasets." - with pytest.raises(ValueError, match=msg): - read_hdf(path) - - def test_read_nokey_empty(self, setup_path): - with ensure_clean_path(setup_path) as path: - store = HDFStore(path) - store.close() - msg = re.escape( - "Dataset(s) incompatible with Pandas data types, not table, or no " - "datasets found in HDF5 file." 
- ) - with pytest.raises(ValueError, match=msg): - read_hdf(path) - - def test_read_from_pathlib_path(self, setup_path): - - # GH11773 - expected = DataFrame( - np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") - ) - with ensure_clean_path(setup_path) as filename: - path_obj = Path(filename) - expected.to_hdf(path_obj, "df", mode="a") - actual = read_hdf(path_obj, "df") - - tm.assert_frame_equal(expected, actual) - - @td.skip_if_no("py.path") - def test_read_from_py_localpath(self, setup_path): - - # GH11773 - from py.path import local as LocalPath - - expected = DataFrame( - np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") - ) - with ensure_clean_path(setup_path) as filename: - path_obj = LocalPath(filename) - - expected.to_hdf(path_obj, "df", mode="a") - actual = read_hdf(path_obj, "df") - - tm.assert_frame_equal(expected, actual) - - def test_query_long_float_literal(self, setup_path): - # GH 14241 - df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) - - with ensure_clean_store(setup_path) as store: - store.append("test", df, format="table", data_columns=True) - - cutoff = 1000000000.0006 - result = store.select("test", f"A < {cutoff:.4f}") - assert result.empty - - cutoff = 1000000000.0010 - result = store.select("test", f"A > {cutoff:.4f}") - expected = df.loc[[1, 2], :] - tm.assert_frame_equal(expected, result) - - exact = 1000000000.0011 - result = store.select("test", f"A == {exact:.4f}") - expected = df.loc[[1], :] - tm.assert_frame_equal(expected, result) - - def test_query_compare_column_type(self, setup_path): - # GH 15492 - df = DataFrame( - { - "date": ["2014-01-01", "2014-01-02"], - "real_date": date_range("2014-01-01", periods=2), - "float": [1.1, 1.2], - "int": [1, 2], - }, - columns=["date", "real_date", "float", "int"], - ) - - with ensure_clean_store(setup_path) as store: - store.append("test", df, format="table", data_columns=True) - - ts = Timestamp("2014-01-01") # noqa - result = 
store.select("test", where="real_date > ts") - expected = df.loc[[1], :] - tm.assert_frame_equal(expected, result) - - for op in ["<", ">", "=="]: - # non strings to string column always fail - for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]: - query = f"date {op} v" - msg = f"Cannot compare {v} of type {type(v)} to string column" - with pytest.raises(TypeError, match=msg): - store.select("test", where=query) - - # strings to other columns must be convertible to type - v = "a" - for col in ["int", "float", "real_date"]: - query = f"{col} {op} v" - msg = "could not convert string to " - with pytest.raises(ValueError, match=msg): - store.select("test", where=query) - - for v, col in zip( - ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"] - ): - query = f"{col} {op} v" - result = store.select("test", where=query) - - if op == "==": - expected = df.loc[[0], :] - elif op == ">": - expected = df.loc[[1], :] - else: - expected = df.loc[[], :] - tm.assert_frame_equal(expected, result) - - @pytest.mark.parametrize("format", ["fixed", "table"]) - def test_read_hdf_series_mode_r(self, format, setup_path): - # GH 16583 - # Tests that reading a Series saved to an HDF file - # still works if a mode='r' argument is supplied - series = tm.makeFloatSeries() - with ensure_clean_path(setup_path) as path: - series.to_hdf(path, key="data", format=format) - result = pd.read_hdf(path, key="data", mode="r") - tm.assert_series_equal(result, series) - - def test_fspath(self): - with tm.ensure_clean("foo.h5") as path: - with HDFStore(path) as store: - assert os.fspath(store) == str(path) - - def test_read_py2_hdf_file_in_py3(self, datapath): - # GH 16781 - - # tests reading a PeriodIndex DataFrame written in Python2 in Python3 - - # the file was generated in Python 2.7 like so: - # - # df = DataFrame([1.,2,3], index=pd.PeriodIndex( - # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) - # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') - - 
expected = DataFrame( - [1.0, 2, 3], - index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), - ) - - with ensure_clean_store( - datapath( - "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" - ), - mode="r", - ) as store: - result = store["p"] - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) - def test_select_empty_where(self, where): - # GH26610 - - # Using keyword `where` as '' or (), or [None], etc - # while reading from HDF store raises - # "SyntaxError: only a single expression is allowed" - - df = DataFrame([1, 2, 3]) - with ensure_clean_path("empty_where.h5") as path: - with HDFStore(path) as store: - store.put("df", df, "t") - result = pd.read_hdf(store, "df", where=where) - tm.assert_frame_equal(result, df) - - @pytest.mark.parametrize( - "idx", - [ - date_range("2019", freq="D", periods=3, tz="UTC"), - CategoricalIndex(list("abc")), - ], - ) - def test_to_hdf_multiindex_extension_dtype(self, idx, setup_path): - # GH 7775 - mi = MultiIndex.from_arrays([idx, idx]) - df = DataFrame(0, index=mi, columns=["a"]) + for index in types_should_run: + df = DataFrame(np.random.randn(10, 2), columns=index(2)) with ensure_clean_path(setup_path) as path: - with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): - df.to_hdf(path, "df") - - def test_unsuppored_hdf_file_error(self, datapath): - # GH 9539 - data_path = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5") - message = ( - r"Dataset\(s\) incompatible with Pandas data types, " - "not table, or no datasets found in HDF5 file." 
- ) - - with pytest.raises(ValueError, match=message): - pd.read_hdf(data_path) - - -@pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"]) -def test_maybe_adjust_name_bad_version_raises(bad_version): - msg = "Version is incorrect, expected sequence of 3 integers" - with pytest.raises(ValueError, match=msg): - _maybe_adjust_name("values_block_0", version=bad_version) + with catch_warnings(record=True): + df.to_hdf(path, "df", format="table", data_columns=True) + result = pd.read_hdf(path, "df", where=f"index = [{df.index[0]}]") + assert len(result) diff --git a/pandas/tests/io/pytables/test_time_series.py b/pandas/tests/io/pytables/test_time_series.py new file mode 100644 index 0000000000000..d98ae7c599c52 --- /dev/null +++ b/pandas/tests/io/pytables/test_time_series.py @@ -0,0 +1,62 @@ +import datetime + +import numpy as np +import pytest + +from pandas import DataFrame, Series, _testing as tm +from pandas.tests.io.pytables.common import ensure_clean_store + +pytestmark = pytest.mark.single + + +def test_store_datetime_fractional_secs(setup_path): + + with ensure_clean_store(setup_path) as store: + dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) + series = Series([0], [dt]) + store["a"] = series + assert store["a"].index[0] == dt + + +def test_tseries_indices_series(setup_path): + + with ensure_clean_store(setup_path) as store: + idx = tm.makeDateIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store["a"] = ser + result = store["a"] + + tm.assert_series_equal(result, ser) + assert result.index.freq == ser.index.freq + tm.assert_class_equal(result.index, ser.index, obj="series index") + + idx = tm.makePeriodIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store["a"] = ser + result = store["a"] + + tm.assert_series_equal(result, ser) + assert result.index.freq == ser.index.freq + tm.assert_class_equal(result.index, ser.index, obj="series index") + + +def test_tseries_indices_frame(setup_path): + + with 
ensure_clean_store(setup_path) as store: + idx = tm.makeDateIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + store["a"] = df + result = store["a"] + + tm.assert_frame_equal(result, df) + assert result.index.freq == df.index.freq + tm.assert_class_equal(result.index, df.index, obj="dataframe index") + + idx = tm.makePeriodIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), idx) + store["a"] = df + result = store["a"] + + tm.assert_frame_equal(result, df) + assert result.index.freq == df.index.freq + tm.assert_class_equal(result.index, df.index, obj="dataframe index")