From 54cff9d9741364bd2a1606298be2ba3f64093ee3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 12 Jan 2023 10:35:39 +0100 Subject: [PATCH 01/14] CI: Try regular 3.11 pipeline --- .github/workflows/macos-windows.yml | 2 +- .github/workflows/ubuntu.yml | 2 +- ci/deps/actions-311.yaml | 56 +++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 ci/deps/actions-311.yaml diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml index d762e20db196a..b960098d36508 100644 --- a/.github/workflows/macos-windows.yml +++ b/.github/workflows/macos-windows.yml @@ -31,7 +31,7 @@ jobs: strategy: matrix: os: [macos-latest, windows-latest] - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml] + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yam] fail-fast: false runs-on: ${{ matrix.os }} name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 9c93725ea15ec..824a95ea59967 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -27,7 +27,7 @@ jobs: timeout-minutes: 180 strategy: matrix: - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml] + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] pattern: ["not single_cpu", "single_cpu"] pyarrow_version: ["7", "8", "9", "10"] include: diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml new file mode 100644 index 0000000000000..9b5a3d473f601 --- /dev/null +++ b/ci/deps/actions-311.yaml @@ -0,0 +1,56 @@ +name: pandas-dev-test +channels: + - conda-forge +dependencies: + - python=3.11 + + # build dependencies + - versioneer[toml] + - cython>=0.29.32 + + # test dependencies + - pytest>=6.0 + - pytest-cov + - pytest-xdist>=1.31 + - psutil + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy<1.24 + - pytz + + # optional dependencies + - beautifulsoup4 + - blosc + - bottleneck + - brotlipy + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1 + - numexpr + - openpyxl + - odfpy + - pandas-gbq + - psycopg2 + - pymysql + - pytables + - pyarrow + - pyreadstat + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy<1.4.46 + - tabulate + - tzdata>=2022a + - xarray + - xlrd + - xlsxwriter + - zstandard From b4cf4fbc353a8ca22434208bf97e29c1f93d466f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 12 Jan 2023 10:38:56 +0100 Subject: [PATCH 02/14] Avoid pyarrow test on all versions --- .github/workflows/ubuntu.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 824a95ea59967..f7bd5980439e3 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -92,6 +92,12 @@ jobs: pyarrow_version: "8" - env_file: actions-39.yaml pyarrow_version: "9" + - env_file: actions-311.yaml + pyarrow_version: "7" + - env_file: actions-311.yaml + pyarrow_version: "8" + - env_file: actions-311.yaml + pyarrow_version: "9" fail-fast: false name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }} env: From 88462e256012d7d7c8f6db871cdcf12dd3f0a50b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 12 Jan 2023 10:44:16 +0100 Subject: [PATCH 03/14] Disable dev workflow --- .github/workflows/python-dev.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 
deletions(-) diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index 220c1e464742e..3d954737fdf8a 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -23,12 +23,14 @@ name: Python Dev on: push: branches: - - main - - 1.5.x +# - main +# - 1.5.x + - None pull_request: branches: - - main - - 1.5.x +# - main +# - 1.5.x + - None paths-ignore: - "doc/**" From 7bdb1d91c4baf33f0c0091a4121d2316b640d99a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 12 Jan 2023 11:34:02 +0100 Subject: [PATCH 04/14] Fix typo --- .github/workflows/macos-windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml index b960098d36508..ac5fd4aa7d4a4 100644 --- a/.github/workflows/macos-windows.yml +++ b/.github/workflows/macos-windows.yml @@ -31,7 +31,7 @@ jobs: strategy: matrix: os: [macos-latest, windows-latest] - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yam] + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] fail-fast: false runs-on: ${{ matrix.os }} name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} From febe6e43c8ec210e41549e3d702f4612e27499c5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 23:21:00 +0100 Subject: [PATCH 05/14] Fix test --- pandas/tests/extension/test_arrow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0a7303ea239ed..1a4664715aa4e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -26,6 +26,7 @@ import pytest from pandas.compat import ( + PY311, is_ci_environment, is_platform_windows, pa_version_under6p0, @@ -49,7 +50,7 @@ ) from pandas.tests.extension import base -pa = pytest.importorskip("pyarrow", minversion="1.0.1") +pa = pytest.importorskip("pyarrow", minversion="6.0.0") from pandas.core.arrays.arrow.array import ArrowExtensionArray @@ -287,7 +288,7 @@ def test_from_sequence_pa_array_notimplemented(self, request): def test_from_sequence_of_strings_pa_array(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]"): + if pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]") and not PY311: request.node.add_marker( pytest.mark.xfail( reason="Nanosecond time parsing not supported.", From 43642523fd4113ce7d494b1cbc8be635265d2537 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 14 Jan 2023 23:22:12 +0100 Subject: [PATCH 06/14] Update deps --- ci/deps/actions-311.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 9b5a3d473f601..7d374dbb20e8b 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -9,9 +9,9 @@ dependencies: - cython>=0.29.32 # test dependencies - - pytest>=6.0 + - pytest>=7.0 - pytest-cov - - pytest-xdist>=1.31 + - pytest-xdist>=2.2.0 - psutil - pytest-asyncio>=0.17 - boto3 From e84c458b5161143b0b7d3aed9254d801ac9e7d7a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 16:06:12 +0100 Subject: [PATCH 07/14] Remove pytables tests --- pandas/tests/io/pytables/__init__.py | 0 pandas/tests/io/pytables/common.py | 51 - pandas/tests/io/pytables/conftest.py | 9 - pandas/tests/io/pytables/test_append.py | 935 --------------- pandas/tests/io/pytables/test_categorical.py | 219 ----
pandas/tests/io/pytables/test_compat.py | 75 -- pandas/tests/io/pytables/test_complex.py | 200 ---- pandas/tests/io/pytables/test_errors.py | 236 ---- .../tests/io/pytables/test_file_handling.py | 446 -------- pandas/tests/io/pytables/test_keys.py | 79 -- pandas/tests/io/pytables/test_put.py | 372 ------ .../io/pytables/test_pytables_missing.py | 14 - pandas/tests/io/pytables/test_read.py | 344 ------ .../io/pytables/test_retain_attributes.py | 105 -- pandas/tests/io/pytables/test_round_trip.py | 557 --------- pandas/tests/io/pytables/test_select.py | 973 ---------------- pandas/tests/io/pytables/test_store.py | 1018 ----------------- pandas/tests/io/pytables/test_subclass.py | 52 - pandas/tests/io/pytables/test_time_series.py | 66 -- pandas/tests/io/pytables/test_timezones.py | 369 ------ 20 files changed, 6120 deletions(-) delete mode 100644 pandas/tests/io/pytables/__init__.py delete mode 100644 pandas/tests/io/pytables/common.py delete mode 100644 pandas/tests/io/pytables/conftest.py delete mode 100644 pandas/tests/io/pytables/test_append.py delete mode 100644 pandas/tests/io/pytables/test_categorical.py delete mode 100644 pandas/tests/io/pytables/test_compat.py delete mode 100644 pandas/tests/io/pytables/test_complex.py delete mode 100644 pandas/tests/io/pytables/test_errors.py delete mode 100644 pandas/tests/io/pytables/test_file_handling.py delete mode 100644 pandas/tests/io/pytables/test_keys.py delete mode 100644 pandas/tests/io/pytables/test_put.py delete mode 100644 pandas/tests/io/pytables/test_pytables_missing.py delete mode 100644 pandas/tests/io/pytables/test_read.py delete mode 100644 pandas/tests/io/pytables/test_retain_attributes.py delete mode 100644 pandas/tests/io/pytables/test_round_trip.py delete mode 100644 pandas/tests/io/pytables/test_select.py delete mode 100644 pandas/tests/io/pytables/test_store.py delete mode 100644 pandas/tests/io/pytables/test_subclass.py delete mode 100644 pandas/tests/io/pytables/test_time_series.py delete mode 100644 pandas/tests/io/pytables/test_timezones.py diff --git a/pandas/tests/io/pytables/__init__.py b/pandas/tests/io/pytables/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/tests/io/pytables/common.py b/pandas/tests/io/pytables/common.py deleted file mode 100644 index 9446d9df3a038..0000000000000 --- a/pandas/tests/io/pytables/common.py +++ /dev/null @@ -1,51 +0,0 @@ -from contextlib import contextmanager -import pathlib -import tempfile -from typing import Generator - -import pytest - -from pandas.io.pytables import HDFStore - -tables = pytest.importorskip("tables") -# set these parameters so we don't have file sharing -tables.parameters.MAX_NUMEXPR_THREADS = 1 -tables.parameters.MAX_BLOSC_THREADS = 1 -tables.parameters.MAX_THREADS = 1 - - -def safe_close(store): - try: - if store is not None: - store.close() - except OSError: - pass - - -# contextmanager to ensure the file cleanup -@contextmanager -def ensure_clean_store( - path, mode="a", complevel=None, complib=None, fletcher32=False -) -> Generator[HDFStore, None, None]: - - with tempfile.TemporaryDirectory() as tmpdirname: - tmp_path = pathlib.Path(tmpdirname, path) - with HDFStore( - tmp_path, - mode=mode, - complevel=complevel, - complib=complib, - fletcher32=fletcher32, - ) as store: - yield store - - -def _maybe_remove(store, key): - """ - For tests using tables, try removing the table to be sure there is - no content from previous tests using the same table name. 
- """ - try: - store.remove(key) - except (ValueError, KeyError): - pass diff --git a/pandas/tests/io/pytables/conftest.py b/pandas/tests/io/pytables/conftest.py deleted file mode 100644 index 466e4ae8bb99c..0000000000000 --- a/pandas/tests/io/pytables/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -import uuid - -import pytest - - -@pytest.fixture -def setup_path(): - """Fixture for setup path""" - return f"tmp.__{uuid.uuid4()}__.h5" diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py deleted file mode 100644 index 80562e77cae02..0000000000000 --- a/pandas/tests/io/pytables/test_append.py +++ /dev/null @@ -1,935 +0,0 @@ -import datetime -from datetime import timedelta -import re -from warnings import catch_warnings - -import numpy as np -import pytest - -from pandas._libs.tslibs import Timestamp -import pandas.util._test_decorators as td - -import pandas as pd -from pandas import ( - DataFrame, - Series, - _testing as tm, - concat, - date_range, - read_hdf, -) -from pandas.tests.io.pytables.common import ( - _maybe_remove, - ensure_clean_store, -) - -pytestmark = pytest.mark.single_cpu - - -def test_append(setup_path): - - with ensure_clean_store(setup_path) as store: - - # this is allowed by almost always don't want to do it - # tables.NaturalNameWarning): - with catch_warnings(record=True): - - df = tm.makeTimeDataFrame() - _maybe_remove(store, "df1") - store.append("df1", df[:10]) - store.append("df1", df[10:]) - tm.assert_frame_equal(store["df1"], df) - - _maybe_remove(store, "df2") - store.put("df2", df[:10], format="table") - store.append("df2", df[10:]) - tm.assert_frame_equal(store["df2"], df) - - _maybe_remove(store, "df3") - store.append("/df3", df[:10]) - store.append("/df3", df[10:]) - tm.assert_frame_equal(store["df3"], df) - - # this is allowed by almost always don't want to do it - # tables.NaturalNameWarning - _maybe_remove(store, "/df3 foo") - store.append("/df3 foo", df[:10]) - store.append("/df3 foo", df[10:]) - tm.assert_frame_equal(store["df3 foo"], df) - - # dtype issues - mizxed type in a single object column - df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) - df["mixed_column"] = "testing" - df.loc[2, "mixed_column"] = np.nan - _maybe_remove(store, "df") - store.append("df", df) - tm.assert_frame_equal(store["df"], df) - - # uints - test storage of uints - uint_data = DataFrame( - { - "u08": Series( - np.random.randint(0, high=255, size=5), dtype=np.uint8 - ), - "u16": Series( - np.random.randint(0, high=65535, size=5), dtype=np.uint16 - ), - "u32": Series( - np.random.randint(0, high=2**30, size=5), dtype=np.uint32 - ), - "u64": Series( - [2**58, 2**59, 2**60, 2**61, 2**62], - dtype=np.uint64, - ), - }, - index=np.arange(5), - ) - _maybe_remove(store, "uints") - store.append("uints", uint_data) - tm.assert_frame_equal(store["uints"], uint_data) - - # uints - test storage of uints in indexable columns - _maybe_remove(store, "uints") - # 64-bit indices not yet supported - store.append("uints", uint_data, data_columns=["u08", "u16", "u32"]) - tm.assert_frame_equal(store["uints"], uint_data) - - -def test_append_series(setup_path): - - with ensure_clean_store(setup_path) as store: - - # basic - ss = tm.makeStringSeries() - ts = tm.makeTimeSeries() - ns = Series(np.arange(100)) - - store.append("ss", ss) - result = store["ss"] - tm.assert_series_equal(result, ss) - assert result.name is None - - store.append("ts", ts) - result = store["ts"] - tm.assert_series_equal(result, ts) - assert result.name is None - - ns.name = "foo" 
- store.append("ns", ns) - result = store["ns"] - tm.assert_series_equal(result, ns) - assert result.name == ns.name - - # select on the values - expected = ns[ns > 60] - result = store.select("ns", "foo>60") - tm.assert_series_equal(result, expected) - - # select on the index and values - expected = ns[(ns > 70) & (ns.index < 90)] - result = store.select("ns", "foo>70 and index<90") - tm.assert_series_equal(result, expected) - - # multi-index - mi = DataFrame(np.random.randn(5, 1), columns=["A"]) - mi["B"] = np.arange(len(mi)) - mi["C"] = "foo" - mi.loc[3:5, "C"] = "bar" - mi.set_index(["C", "B"], inplace=True) - s = mi.stack() - s.index = s.index.droplevel(2) - store.append("mi", s) - tm.assert_series_equal(store["mi"], s) - - -def test_append_some_nans(setup_path): - - with ensure_clean_store(setup_path) as store: - df = DataFrame( - { - "A": Series(np.random.randn(20)).astype("int32"), - "A1": np.random.randn(20), - "A2": np.random.randn(20), - "B": "foo", - "C": "bar", - "D": Timestamp("20010101"), - "E": datetime.datetime(2001, 1, 2, 0, 0), - }, - index=np.arange(20), - ) - # some nans - _maybe_remove(store, "df1") - df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan - store.append("df1", df[:10]) - store.append("df1", df[10:]) - tm.assert_frame_equal(store["df1"], df) - - # first column - df1 = df.copy() - df1["A1"] = np.nan - _maybe_remove(store, "df1") - store.append("df1", df1[:10]) - store.append("df1", df1[10:]) - tm.assert_frame_equal(store["df1"], df1) - - # 2nd column - df2 = df.copy() - df2["A2"] = np.nan - _maybe_remove(store, "df2") - store.append("df2", df2[:10]) - store.append("df2", df2[10:]) - tm.assert_frame_equal(store["df2"], df2) - - # datetimes - df3 = df.copy() - df3["E"] = np.nan - _maybe_remove(store, "df3") - store.append("df3", df3[:10]) - store.append("df3", df3[10:]) - tm.assert_frame_equal(store["df3"], df3) - - -def test_append_all_nans(setup_path): - - with ensure_clean_store(setup_path) as store: - - df = DataFrame( - {"A1": np.random.randn(20), "A2": np.random.randn(20)}, - index=np.arange(20), - ) - df.loc[0:15, :] = np.nan - - # nan some entire rows (dropna=True) - _maybe_remove(store, "df") - store.append("df", df[:10], dropna=True) - store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df[-4:]) - - # nan some entire rows (dropna=False) - _maybe_remove(store, "df2") - store.append("df2", df[:10], dropna=False) - store.append("df2", df[10:], dropna=False) - tm.assert_frame_equal(store["df2"], df) - - # tests the option io.hdf.dropna_table - with pd.option_context("io.hdf.dropna_table", False): - _maybe_remove(store, "df3") - store.append("df3", df[:10]) - store.append("df3", df[10:]) - tm.assert_frame_equal(store["df3"], df) - - with pd.option_context("io.hdf.dropna_table", True): - _maybe_remove(store, "df4") - store.append("df4", df[:10]) - store.append("df4", df[10:]) - tm.assert_frame_equal(store["df4"], df[-4:]) - - # nan some entire rows (string are still written!) 
- df = DataFrame( - { - "A1": np.random.randn(20), - "A2": np.random.randn(20), - "B": "foo", - "C": "bar", - }, - index=np.arange(20), - ) - - df.loc[0:15, :] = np.nan - - _maybe_remove(store, "df") - store.append("df", df[:10], dropna=True) - store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df) - - _maybe_remove(store, "df2") - store.append("df2", df[:10], dropna=False) - store.append("df2", df[10:], dropna=False) - tm.assert_frame_equal(store["df2"], df) - - # nan some entire rows (but since we have dates they are still - # written!) - df = DataFrame( - { - "A1": np.random.randn(20), - "A2": np.random.randn(20), - "B": "foo", - "C": "bar", - "D": Timestamp("20010101"), - "E": datetime.datetime(2001, 1, 2, 0, 0), - }, - index=np.arange(20), - ) - - df.loc[0:15, :] = np.nan - - _maybe_remove(store, "df") - store.append("df", df[:10], dropna=True) - store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df) - - _maybe_remove(store, "df2") - store.append("df2", df[:10], dropna=False) - store.append("df2", df[10:], dropna=False) - tm.assert_frame_equal(store["df2"], df) - - -def test_append_frame_column_oriented(setup_path): - with ensure_clean_store(setup_path) as store: - - # column oriented - df = tm.makeTimeDataFrame() - df.index = df.index._with_freq(None) # freq doesn't round-trip - - _maybe_remove(store, "df1") - store.append("df1", df.iloc[:, :2], axes=["columns"]) - store.append("df1", df.iloc[:, 2:]) - tm.assert_frame_equal(store["df1"], df) - - result = store.select("df1", "columns=A") - expected = df.reindex(columns=["A"]) - tm.assert_frame_equal(expected, result) - - # selection on the non-indexable - result = store.select("df1", ("columns=A", "index=df.index[0:4]")) - expected = df.reindex(columns=["A"], index=df.index[0:4]) - tm.assert_frame_equal(expected, result) - - # this isn't supported - msg = re.escape( - "passing a filterable condition to a non-table indexer " - "[Filter: Not Initialized]" - ) - with pytest.raises(TypeError, match=msg): - store.select("df1", "columns=A and index>df.index[4]") - - -def test_append_with_different_block_ordering(setup_path): - - # GH 4096; using same frames, but different block orderings - with ensure_clean_store(setup_path) as store: - - for i in range(10): - - df = DataFrame(np.random.randn(10, 2), columns=list("AB")) - df["index"] = range(10) - df["index"] += i * 10 - df["int64"] = Series([1] * len(df), dtype="int64") - df["int16"] = Series([1] * len(df), dtype="int16") - - if i % 2 == 0: - del df["int64"] - df["int64"] = Series([1] * len(df), dtype="int64") - if i % 3 == 0: - a = df.pop("A") - df["A"] = a - - df.set_index("index", inplace=True) - - store.append("df", df) - - # test a different ordering but with more fields (like invalid - # combinations) - with ensure_clean_store(setup_path) as store: - - df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64") - df["int64"] = Series([1] * len(df), dtype="int64") - df["int16"] = Series([1] * len(df), dtype="int16") - store.append("df", df) - - # store additional fields in different blocks - df["int16_2"] = Series([1] * len(df), dtype="int16") - msg = re.escape( - "cannot match existing table structure for [int16] on appending data" - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df) - - # store multiple additional fields in different blocks - df["float_3"] = Series([1.0] * len(df), dtype="float64") - msg = re.escape( - "cannot match existing table structure for [A,B] on appending data" - ) - 
with pytest.raises(ValueError, match=msg): - store.append("df", df) - - -def test_append_with_strings(setup_path): - - with ensure_clean_store(setup_path) as store: - with catch_warnings(record=True): - - def check_col(key, name, size): - assert ( - getattr(store.get_storer(key).table.description, name).itemsize - == size - ) - - # avoid truncation on elements - df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) - store.append("df_big", df) - tm.assert_frame_equal(store.select("df_big"), df) - check_col("df_big", "values_block_1", 15) - - # appending smaller string ok - df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]]) - store.append("df_big", df2) - expected = concat([df, df2]) - tm.assert_frame_equal(store.select("df_big"), expected) - check_col("df_big", "values_block_1", 15) - - # avoid truncation on elements - df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) - store.append("df_big2", df, min_itemsize={"values": 50}) - tm.assert_frame_equal(store.select("df_big2"), df) - check_col("df_big2", "values_block_1", 50) - - # bigger string on next append - store.append("df_new", df) - df_new = DataFrame( - [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]] - ) - msg = ( - r"Trying to store a string with len \[26\] in " - r"\[values_block_1\] column but\n" - r"this column has a limit of \[15\]!\n" - "Consider using min_itemsize to preset the sizes on these " - "columns" - ) - with pytest.raises(ValueError, match=msg): - store.append("df_new", df_new) - - # min_itemsize on Series index (GH 11412) - df = tm.makeMixedDataFrame().set_index("C") - store.append("ss", df["B"], min_itemsize={"index": 4}) - tm.assert_series_equal(store.select("ss"), df["B"]) - - # same as above, with data_columns=True - store.append("ss2", df["B"], data_columns=True, min_itemsize={"index": 4}) - tm.assert_series_equal(store.select("ss2"), df["B"]) - - # min_itemsize in index without appending (GH 10381) - store.put("ss3", df, format="table", min_itemsize={"index": 6}) - # just make sure there is a longer string: - df2 = df.copy().reset_index().assign(C="longer").set_index("C") - store.append("ss3", df2) - tm.assert_frame_equal(store.select("ss3"), concat([df, df2])) - - # same as above, with a Series - store.put("ss4", df["B"], format="table", min_itemsize={"index": 6}) - store.append("ss4", df2["B"]) - tm.assert_series_equal(store.select("ss4"), concat([df["B"], df2["B"]])) - - # with nans - _maybe_remove(store, "df") - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df.loc[df.index[1:4], "string"] = np.nan - df["string2"] = "bar" - df.loc[df.index[4:8], "string2"] = np.nan - df["string3"] = "bah" - df.loc[df.index[1:], "string3"] = np.nan - store.append("df", df) - result = store.select("df") - tm.assert_frame_equal(result, df) - - with ensure_clean_store(setup_path) as store: - - df = DataFrame({"A": "foo", "B": "bar"}, index=range(10)) - - # a min_itemsize that creates a data_column - _maybe_remove(store, "df") - store.append("df", df, min_itemsize={"A": 200}) - check_col("df", "A", 200) - assert store.get_storer("df").data_columns == ["A"] - - # a min_itemsize that creates a data_column2 - _maybe_remove(store, "df") - store.append("df", df, data_columns=["B"], min_itemsize={"A": 200}) - check_col("df", "A", 200) - assert store.get_storer("df").data_columns == ["B", "A"] - - # a min_itemsize that creates a data_column2 - _maybe_remove(store, "df") - store.append("df", df, data_columns=["B"], min_itemsize={"values": 200}) - check_col("df", "B", 200) - check_col("df", 
"values_block_0", 200) - assert store.get_storer("df").data_columns == ["B"] - - # infer the .typ on subsequent appends - _maybe_remove(store, "df") - store.append("df", df[:5], min_itemsize=200) - store.append("df", df[5:], min_itemsize=200) - tm.assert_frame_equal(store["df"], df) - - # invalid min_itemsize keys - df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"]) - _maybe_remove(store, "df") - msg = re.escape( - "min_itemsize has the key [foo] which is not an axis or data_column" - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df, min_itemsize={"foo": 20, "foobar": 20}) - - -def test_append_with_empty_string(setup_path): - - with ensure_clean_store(setup_path) as store: - - # with all empty strings (GH 12242) - df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]}) - store.append("df", df[:-1], min_itemsize={"x": 1}) - store.append("df", df[-1:], min_itemsize={"x": 1}) - tm.assert_frame_equal(store.select("df"), df) - - -def test_append_with_data_columns(setup_path): - - with ensure_clean_store(setup_path) as store: - df = tm.makeTimeDataFrame() - df.iloc[0, df.columns.get_loc("B")] = 1.0 - _maybe_remove(store, "df") - store.append("df", df[:2], data_columns=["B"]) - store.append("df", df[2:]) - tm.assert_frame_equal(store["df"], df) - - # check that we have indices created - assert store._handle.root.df.table.cols.index.is_indexed is True - assert store._handle.root.df.table.cols.B.is_indexed is True - - # data column searching - result = store.select("df", "B>0") - expected = df[df.B > 0] - tm.assert_frame_equal(result, expected) - - # data column searching (with an indexable and a data_columns) - result = store.select("df", "B>0 and index>df.index[3]") - df_new = df.reindex(index=df.index[4:]) - expected = df_new[df_new.B > 0] - tm.assert_frame_equal(result, expected) - - # data column selection with a string data_column - df_new = df.copy() - df_new["string"] = "foo" - df_new.loc[df_new.index[1:4], "string"] = np.nan - df_new.loc[df_new.index[5:6], "string"] = "bar" - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["string"]) - result = store.select("df", "string='foo'") - expected = df_new[df_new.string == "foo"] - tm.assert_frame_equal(result, expected) - - # using min_itemsize and a data column - def check_col(key, name, size): - assert ( - getattr(store.get_storer(key).table.description, name).itemsize == size - ) - - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["string"], min_itemsize={"string": 30}) - check_col("df", "string", 30) - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["string"], min_itemsize=30) - check_col("df", "string", 30) - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["string"], min_itemsize={"values": 30}) - check_col("df", "string", 30) - - with ensure_clean_store(setup_path) as store: - df_new["string2"] = "foobarbah" - df_new["string_block1"] = "foobarbah1" - df_new["string_block2"] = "foobarbah2" - _maybe_remove(store, "df") - store.append( - "df", - df_new, - data_columns=["string", "string2"], - min_itemsize={"string": 30, "string2": 40, "values": 50}, - ) - check_col("df", "string", 30) - check_col("df", "string2", 40) - check_col("df", "values_block_1", 50) - - with ensure_clean_store(setup_path) as store: - # multiple data columns - df_new = df.copy() - df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 - df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 - 
df_new["string"] = "foo" - - sl = df_new.columns.get_loc("string") - df_new.iloc[1:4, sl] = np.nan - df_new.iloc[5:6, sl] = "bar" - - df_new["string2"] = "foo" - sl = df_new.columns.get_loc("string2") - df_new.iloc[2:5, sl] = np.nan - df_new.iloc[7:8, sl] = "bar" - _maybe_remove(store, "df") - store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) - result = store.select("df", "string='foo' and string2='foo' and A>0 and B<0") - expected = df_new[ - (df_new.string == "foo") - & (df_new.string2 == "foo") - & (df_new.A > 0) - & (df_new.B < 0) - ] - tm.assert_frame_equal(result, expected, check_freq=False) - # FIXME: 2020-05-07 freq check randomly fails in the CI - - # yield an empty frame - result = store.select("df", "string='foo' and string2='cool'") - expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] - tm.assert_frame_equal(result, expected) - - with ensure_clean_store(setup_path) as store: - # doc example - df_dc = df.copy() - df_dc["string"] = "foo" - df_dc.loc[df_dc.index[4:6], "string"] = np.nan - df_dc.loc[df_dc.index[7:9], "string"] = "bar" - df_dc["string2"] = "cool" - df_dc["datetime"] = Timestamp("20010102") - df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan - - _maybe_remove(store, "df_dc") - store.append( - "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] - ) - result = store.select("df_dc", "B>0") - - expected = df_dc[df_dc.B > 0] - tm.assert_frame_equal(result, expected) - - result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] - tm.assert_frame_equal(result, expected, check_freq=False) - # FIXME: 2020-12-07 intermittent build failures here with freq of - # None instead of BDay(4) - - with ensure_clean_store(setup_path) as store: - # doc example part 2 - np.random.seed(1234) - index = date_range("1/1/2000", periods=8) - df_dc = DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"]) - df_dc["string"] = "foo" - df_dc.loc[df_dc.index[4:6], "string"] = np.nan - df_dc.loc[df_dc.index[7:9], "string"] = "bar" - df_dc[["B", "C"]] = df_dc[["B", "C"]].abs() - df_dc["string2"] = "cool" - - # on-disk operations - store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) - - result = store.select("df_dc", "B>0") - expected = df_dc[df_dc.B > 0] - tm.assert_frame_equal(result, expected) - - result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] - tm.assert_frame_equal(result, expected) - - -def test_append_hierarchical(tmp_path, setup_path, multiindex_dataframe_random_data): - df = multiindex_dataframe_random_data - df.columns.name = None - - with ensure_clean_store(setup_path) as store: - store.append("mi", df) - result = store.select("mi") - tm.assert_frame_equal(result, df) - - # GH 3748 - result = store.select("mi", columns=["A", "B"]) - expected = df.reindex(columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - path = tmp_path / "test.hdf" - df.to_hdf(path, "df", format="table") - result = read_hdf(path, "df", columns=["A", "B"]) - expected = df.reindex(columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - -def test_append_misc(setup_path): - - with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() - store.append("df", df, chunksize=1) - result = store.select("df") - tm.assert_frame_equal(result, df) - - store.append("df1", df, expectedrows=10) - result = 
store.select("df1") - tm.assert_frame_equal(result, df) - - -@pytest.mark.parametrize("chunksize", [10, 200, 1000]) -def test_append_misc_chunksize(setup_path, chunksize): - # more chunksize in append tests - df = tm.makeDataFrame() - df["string"] = "foo" - df["float322"] = 1.0 - df["float322"] = df["float322"].astype("float32") - df["bool"] = df["float322"] > 0 - df["time1"] = Timestamp("20130101") - df["time2"] = Timestamp("20130102") - with ensure_clean_store(setup_path, mode="w") as store: - store.append("obj", df, chunksize=chunksize) - result = store.select("obj") - tm.assert_frame_equal(result, df) - - -def test_append_misc_empty_frame(setup_path): - # empty frame, GH4273 - with ensure_clean_store(setup_path) as store: - - # 0 len - df_empty = DataFrame(columns=list("ABC")) - store.append("df", df_empty) - with pytest.raises(KeyError, match="'No object named df in the file'"): - store.select("df") - - # repeated append of 0/non-zero frames - df = DataFrame(np.random.rand(10, 3), columns=list("ABC")) - store.append("df", df) - tm.assert_frame_equal(store.select("df"), df) - store.append("df", df_empty) - tm.assert_frame_equal(store.select("df"), df) - - # store - df = DataFrame(columns=list("ABC")) - store.put("df2", df) - tm.assert_frame_equal(store.select("df2"), df) - - -# TODO(ArrayManager) currently we rely on falling back to BlockManager, but -# the conversion from AM->BM converts the invalid object dtype column into -# a datetime64 column no longer raising an error -@td.skip_array_manager_not_yet_implemented -def test_append_raise(setup_path): - - with ensure_clean_store(setup_path) as store: - - # test append with invalid input to get good error messages - - # list in column - df = tm.makeDataFrame() - df["invalid"] = [["a"]] * len(df) - assert df.dtypes["invalid"] == np.object_ - msg = re.escape( - """Cannot serialize the column [invalid] -because its data contents are not [string] but [mixed] object dtype""" - ) - with pytest.raises(TypeError, match=msg): - store.append("df", df) - - # multiple invalid columns - df["invalid2"] = [["a"]] * len(df) - df["invalid3"] = [["a"]] * len(df) - with pytest.raises(TypeError, match=msg): - store.append("df", df) - - # datetime with embedded nans as object - df = tm.makeDataFrame() - s = Series(datetime.datetime(2001, 1, 2), index=df.index) - s = s.astype(object) - s[0:5] = np.nan - df["invalid"] = s - assert df.dtypes["invalid"] == np.object_ - msg = "too many timezones in this block, create separate data columns" - with pytest.raises(TypeError, match=msg): - store.append("df", df) - - # directly ndarray - msg = "value must be None, Series, or DataFrame" - with pytest.raises(TypeError, match=msg): - store.append("df", np.arange(10)) - - # series directly - msg = re.escape( - "cannot properly create the storer for: " - "[group->df,value->]" - ) - with pytest.raises(TypeError, match=msg): - store.append("df", Series(np.arange(10))) - - # appending an incompatible table - df = tm.makeDataFrame() - store.append("df", df) - - df["foo"] = "foo" - msg = re.escape( - "invalid combination of [non_index_axes] on appending data " - "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table " - "[(1, ['A', 'B', 'C', 'D'])]" - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df) - - # incompatible type (GH 41897) - _maybe_remove(store, "df") - df["foo"] = Timestamp("20130101") - store.append("df", df) - df["foo"] = "bar" - msg = re.escape( - "invalid combination of [values_axes] on appending data " - 
"[name->values_block_1,cname->values_block_1," - "dtype->bytes24,kind->string,shape->(1, 30)] " - "vs current table " - "[name->values_block_1,cname->values_block_1," - "dtype->datetime64,kind->datetime64,shape->None]" - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df) - - -def test_append_with_timedelta(setup_path): - # GH 3577 - # append timedelta - - df = DataFrame( - { - "A": Timestamp("20130101"), - "B": [ - Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10) - ], - } - ) - df["C"] = df["A"] - df["B"] - df.loc[3:5, "C"] = np.nan - - with ensure_clean_store(setup_path) as store: - - # table - _maybe_remove(store, "df") - store.append("df", df, data_columns=True) - result = store.select("df") - tm.assert_frame_equal(result, df) - - result = store.select("df", where="C<100000") - tm.assert_frame_equal(result, df) - - result = store.select("df", where="C0", "B>0"], selector="df1" - ) - expected = df[(df.A > 0) & (df.B > 0)] - tm.assert_frame_equal(result, expected) - - -def test_append_to_multiple_dropna(setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan - df = concat([df1, df2], axis=1) - - with ensure_clean_store(setup_path) as store: - - # dropna=True should guarantee rows are synchronized - store.append_to_multiple( - {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True - ) - result = store.select_as_multiple(["df1", "df2"]) - expected = df.dropna() - tm.assert_frame_equal(result, expected) - tm.assert_index_equal(store.select("df1").index, store.select("df2").index) - - -def test_append_to_multiple_dropna_false(setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan - df = concat([df1, df2], axis=1) - - with ensure_clean_store(setup_path) as store, pd.option_context( - "io.hdf.dropna_table", True - ): - # dropna=False shouldn't synchronize row indexes - store.append_to_multiple( - {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False - ) - - msg = "all tables must have exactly the same nrows!" 
- with pytest.raises(ValueError, match=msg): - store.select_as_multiple(["df1a", "df2a"]) - - assert not store.select("df1a").index.equals(store.select("df2a").index) - - -def test_append_to_multiple_min_itemsize(setup_path): - # GH 11238 - df = DataFrame( - { - "IX": np.arange(1, 21), - "Num": np.arange(1, 21), - "BigNum": np.arange(1, 21) * 88, - "Str": ["a" for _ in range(20)], - "LongStr": ["abcde" for _ in range(20)], - } - ) - expected = df.iloc[[0]] - - with ensure_clean_store(setup_path) as store: - store.append_to_multiple( - { - "index": ["IX"], - "nums": ["Num", "BigNum"], - "strs": ["Str", "LongStr"], - }, - df.iloc[[0]], - "index", - min_itemsize={"Str": 10, "LongStr": 100, "Num": 2}, - ) - result = store.select_as_multiple(["index", "nums", "strs"]) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py deleted file mode 100644 index 7c2ab9b4f6ec0..0000000000000 --- a/pandas/tests/io/pytables/test_categorical.py +++ /dev/null @@ -1,219 +0,0 @@ -import numpy as np -import pytest - -from pandas import ( - Categorical, - DataFrame, - Series, - _testing as tm, - concat, - read_hdf, -) -from pandas.tests.io.pytables.common import ( - _maybe_remove, - ensure_clean_store, -) - -pytestmark = [ - pytest.mark.single_cpu, -] - - -def test_categorical(setup_path): - - with ensure_clean_store(setup_path) as store: - - # Basic - _maybe_remove(store, "s") - s = Series( - Categorical( - ["a", "b", "b", "a", "a", "c"], - categories=["a", "b", "c", "d"], - ordered=False, - ) - ) - store.append("s", s, format="table") - result = store.select("s") - tm.assert_series_equal(s, result) - - _maybe_remove(store, "s_ordered") - s = Series( - Categorical( - ["a", "b", "b", "a", "a", "c"], - categories=["a", "b", "c", "d"], - ordered=True, - ) - ) - store.append("s_ordered", s, format="table") - result = store.select("s_ordered") - tm.assert_series_equal(s, result) - - _maybe_remove(store, "df") - df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) - store.append("df", df, format="table") - result = store.select("df") - tm.assert_frame_equal(result, df) - - # Dtypes - _maybe_remove(store, "si") - s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category") - store.append("si", s) - result = store.select("si") - tm.assert_series_equal(result, s) - - _maybe_remove(store, "si2") - s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category") - store.append("si2", s) - result = store.select("si2") - tm.assert_series_equal(result, s) - - # Multiple - _maybe_remove(store, "df2") - df2 = df.copy() - df2["s2"] = Series(list("abcdefg")).astype("category") - store.append("df2", df2) - result = store.select("df2") - tm.assert_frame_equal(result, df2) - - # Make sure the metadata is OK - info = store.info() - assert "/df2 " in info - # df2._mgr.blocks[0] and df2._mgr.blocks[2] are Categorical - assert "/df2/meta/values_block_0/meta" in info - assert "/df2/meta/values_block_2/meta" in info - - # unordered - _maybe_remove(store, "s2") - s = Series( - Categorical( - ["a", "b", "b", "a", "a", "c"], - categories=["a", "b", "c", "d"], - ordered=False, - ) - ) - store.append("s2", s, format="table") - result = store.select("s2") - tm.assert_series_equal(result, s) - - # Query - _maybe_remove(store, "df3") - store.append("df3", df, data_columns=["s"]) - expected = df[df.s.isin(["b", "c"])] - result = store.select("df3", where=['s in ["b","c"]']) - tm.assert_frame_equal(result, expected) - - expected = df[df.s.isin(["b", "c"])] - result = 
store.select("df3", where=['s = ["b","c"]']) - tm.assert_frame_equal(result, expected) - - expected = df[df.s.isin(["d"])] - result = store.select("df3", where=['s in ["d"]']) - tm.assert_frame_equal(result, expected) - - expected = df[df.s.isin(["f"])] - result = store.select("df3", where=['s in ["f"]']) - tm.assert_frame_equal(result, expected) - - # Appending with same categories is ok - store.append("df3", df) - - df = concat([df, df]) - expected = df[df.s.isin(["b", "c"])] - result = store.select("df3", where=['s in ["b","c"]']) - tm.assert_frame_equal(result, expected) - - # Appending must have the same categories - df3 = df.copy() - df3["s"] = df3["s"].cat.remove_unused_categories() - - msg = "cannot append a categorical with different categories to the existing" - with pytest.raises(ValueError, match=msg): - store.append("df3", df3) - - # Remove, and make sure meta data is removed (its a recursive - # removal so should be). - result = store.select("df3/meta/s/meta") - assert result is not None - store.remove("df3") - - with pytest.raises( - KeyError, match="'No object named df3/meta/s/meta in the file'" - ): - store.select("df3/meta/s/meta") - - -def test_categorical_conversion(tmp_path, setup_path): - - # GH13322 - # Check that read_hdf with categorical columns doesn't return rows if - # where criteria isn't met. - obsids = ["ESP_012345_6789", "ESP_987654_3210"] - imgids = ["APF00006np", "APF0001imm"] - data = [4.3, 9.8] - - # Test without categories - df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data}) - - # We are expecting an empty DataFrame matching types of df - expected = df.iloc[[], :] - path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df", where="obsids=B") - tm.assert_frame_equal(result, expected) - - # Test with categories - df.obsids = df.obsids.astype("category") - df.imgids = df.imgids.astype("category") - - # We are expecting an empty DataFrame matching types of df - expected = df.iloc[[], :] - path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df", where="obsids=B") - tm.assert_frame_equal(result, expected) - - -def test_categorical_nan_only_columns(tmp_path, setup_path): - # GH18413 - # Check that read_hdf with categorical columns with NaN-only values can - # be read back. - df = DataFrame( - { - "a": ["a", "b", "c", np.nan], - "b": [np.nan, np.nan, np.nan, np.nan], - "c": [1, 2, 3, 4], - "d": Series([None] * 4, dtype=object), - } - ) - df["a"] = df.a.astype("category") - df["b"] = df.b.astype("category") - df["d"] = df.b.astype("category") - expected = df - path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df") - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "where, df, expected", - [ - ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})), - ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})), - ], -) -def test_convert_value( - tmp_path, setup_path, where: str, df: DataFrame, expected: DataFrame -): - # GH39420 - # Check that read_hdf with categorical columns can filter by where condition. 
- df.col = df.col.astype("category") - max_widths = {"col": 1} - categorical_values = sorted(df.col.unique()) - expected.col = expected.col.astype("category") - expected.col = expected.col.cat.set_categories(categorical_values) - - path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", min_itemsize=max_widths) - result = read_hdf(path, where=where) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py deleted file mode 100644 index 7f71d2666c9ae..0000000000000 --- a/pandas/tests/io/pytables/test_compat.py +++ /dev/null @@ -1,75 +0,0 @@ -import pytest - -import pandas as pd -import pandas._testing as tm - -tables = pytest.importorskip("tables") - - -@pytest.fixture -def pytables_hdf5_file(tmp_path): - """ - Use PyTables to create a simple HDF5 file. - """ - table_schema = { - "c0": tables.Time64Col(pos=0), - "c1": tables.StringCol(5, pos=1), - "c2": tables.Int64Col(pos=2), - } - - t0 = 1_561_105_000.0 - - testsamples = [ - {"c0": t0, "c1": "aaaaa", "c2": 1}, - {"c0": t0 + 1, "c1": "bbbbb", "c2": 2}, - {"c0": t0 + 2, "c1": "ccccc", "c2": 10**5}, - {"c0": t0 + 3, "c1": "ddddd", "c2": 4_294_967_295}, - ] - - objname = "pandas_test_timeseries" - - path = tmp_path / "written_with_pytables.h5" - with tables.open_file(path, mode="w") as f: - t = f.create_table("/", name=objname, description=table_schema) - for sample in testsamples: - for key, value in sample.items(): - t.row[key] = value - t.row.append() - - yield path, objname, pd.DataFrame(testsamples) - - -class TestReadPyTablesHDF5: - """ - A group of tests which covers reading HDF5 files written by plain PyTables - (not written by pandas). - - Was introduced for regression-testing issue 11188. - """ - - def test_read_complete(self, pytables_hdf5_file): - path, objname, df = pytables_hdf5_file - result = pd.read_hdf(path, key=objname) - expected = df - tm.assert_frame_equal(result, expected) - - def test_read_with_start(self, pytables_hdf5_file): - path, objname, df = pytables_hdf5_file - # This is a regression test for pandas-dev/pandas/issues/11188 - result = pd.read_hdf(path, key=objname, start=1) - expected = df[1:].reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - def test_read_with_stop(self, pytables_hdf5_file): - path, objname, df = pytables_hdf5_file - # This is a regression test for pandas-dev/pandas/issues/11188 - result = pd.read_hdf(path, key=objname, stop=1) - expected = df[:1].reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - def test_read_with_startstop(self, pytables_hdf5_file): - path, objname, df = pytables_hdf5_file - # This is a regression test for pandas-dev/pandas/issues/11188 - result = pd.read_hdf(path, key=objname, start=1, stop=2) - expected = df[1:2].reset_index(drop=True) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py deleted file mode 100644 index 870458e93689f..0000000000000 --- a/pandas/tests/io/pytables/test_complex.py +++ /dev/null @@ -1,200 +0,0 @@ -from warnings import catch_warnings - -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - Series, -) -import pandas._testing as tm -from pandas.tests.io.pytables.common import ensure_clean_store - -from pandas.io.pytables import read_hdf - - -def test_complex_fixed(tmp_path, setup_path): - df = DataFrame( - np.random.rand(4, 5).astype(np.complex64), - index=list("abcd"), - columns=list("ABCDE"), - 
) - - path = tmp_path / setup_path - df.to_hdf(path, "df") - reread = read_hdf(path, "df") - tm.assert_frame_equal(df, reread) - - df = DataFrame( - np.random.rand(4, 5).astype(np.complex128), - index=list("abcd"), - columns=list("ABCDE"), - ) - path = tmp_path / setup_path - df.to_hdf(path, "df") - reread = read_hdf(path, "df") - tm.assert_frame_equal(df, reread) - - -def test_complex_table(tmp_path, setup_path): - df = DataFrame( - np.random.rand(4, 5).astype(np.complex64), - index=list("abcd"), - columns=list("ABCDE"), - ) - - path = tmp_path / setup_path - df.to_hdf(path, "df", format="table") - reread = read_hdf(path, "df") - tm.assert_frame_equal(df, reread) - - df = DataFrame( - np.random.rand(4, 5).astype(np.complex128), - index=list("abcd"), - columns=list("ABCDE"), - ) - - path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", mode="w") - reread = read_hdf(path, "df") - tm.assert_frame_equal(df, reread) - - -def test_complex_mixed_fixed(tmp_path, setup_path): - complex64 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 - ) - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 - ) - df = DataFrame( - { - "A": [1, 2, 3, 4], - "B": ["a", "b", "c", "d"], - "C": complex64, - "D": complex128, - "E": [1.0, 2.0, 3.0, 4.0], - }, - index=list("abcd"), - ) - path = tmp_path / setup_path - df.to_hdf(path, "df") - reread = read_hdf(path, "df") - tm.assert_frame_equal(df, reread) - - -def test_complex_mixed_table(tmp_path, setup_path): - complex64 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 - ) - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 - ) - df = DataFrame( - { - "A": [1, 2, 3, 4], - "B": ["a", "b", "c", "d"], - "C": complex64, - "D": complex128, - "E": [1.0, 2.0, 3.0, 4.0], - }, - index=list("abcd"), - ) - - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=["A", "B"]) - result = store.select("df", where="A>2") - tm.assert_frame_equal(df.loc[df.A > 2], result) - - path = tmp_path / setup_path - df.to_hdf(path, "df", format="table") - reread = read_hdf(path, "df") - tm.assert_frame_equal(df, reread) - - -def test_complex_across_dimensions_fixed(tmp_path, setup_path): - with catch_warnings(record=True): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list("abcd")) - df = DataFrame({"A": s, "B": s}) - - objs = [s, df] - comps = [tm.assert_series_equal, tm.assert_frame_equal] - for obj, comp in zip(objs, comps): - path = tmp_path / setup_path - obj.to_hdf(path, "obj", format="fixed") - reread = read_hdf(path, "obj") - comp(obj, reread) - - -def test_complex_across_dimensions(tmp_path, setup_path): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list("abcd")) - df = DataFrame({"A": s, "B": s}) - - with catch_warnings(record=True): - - objs = [df] - comps = [tm.assert_frame_equal] - for obj, comp in zip(objs, comps): - path = tmp_path / setup_path - obj.to_hdf(path, "obj", format="table") - reread = read_hdf(path, "obj") - comp(obj, reread) - - -def test_complex_indexing_error(setup_path): - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 - ) - df = DataFrame( - {"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128}, - index=list("abcd"), - ) - - msg = ( - "Columns containing complex values can be stored " - "but 
cannot be indexed when using table format. " - "Either use fixed format, set index=False, " - "or do not include the columns containing complex " - "values to data_columns when initializing the table." - ) - - with ensure_clean_store(setup_path) as store: - with pytest.raises(TypeError, match=msg): - store.append("df", df, data_columns=["C"]) - - -def test_complex_series_error(tmp_path, setup_path): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list("abcd")) - - msg = ( - "Columns containing complex values can be stored " - "but cannot be indexed when using table format. " - "Either use fixed format, set index=False, " - "or do not include the columns containing complex " - "values to data_columns when initializing the table." - ) - - path = tmp_path / setup_path - with pytest.raises(TypeError, match=msg): - s.to_hdf(path, "obj", format="t") - - path = tmp_path / setup_path - s.to_hdf(path, "obj", format="t", index=False) - reread = read_hdf(path, "obj") - tm.assert_series_equal(s, reread) - - -def test_complex_append(setup_path): - df = DataFrame( - {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)} - ) - - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=["b"]) - store.append("df", df) - result = store.select("df") - tm.assert_frame_equal(pd.concat([df, df], axis=0), result) diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py deleted file mode 100644 index 7629e8ca7dfc2..0000000000000 --- a/pandas/tests/io/pytables/test_errors.py +++ /dev/null @@ -1,236 +0,0 @@ -import datetime -from io import BytesIO -import re -from warnings import catch_warnings - -import numpy as np -import pytest - -from pandas import ( - CategoricalIndex, - DataFrame, - HDFStore, - MultiIndex, - _testing as tm, - date_range, - read_hdf, -) -from pandas.tests.io.pytables.common import ensure_clean_store - -from pandas.io.pytables import ( - Term, - _maybe_adjust_name, -) - -pytestmark = pytest.mark.single_cpu - - -def test_pass_spec_to_storer(setup_path): - - df = tm.makeDataFrame() - - with ensure_clean_store(setup_path) as store: - store.put("df", df) - msg = ( - "cannot pass a column specification when reading a Fixed format " - "store. this store must be selected in its entirety" - ) - with pytest.raises(TypeError, match=msg): - store.select("df", columns=["A"]) - msg = ( - "cannot pass a where specification when reading from a Fixed " - "format store. 
this store must be selected in its entirety" - ) - with pytest.raises(TypeError, match=msg): - store.select("df", where=[("columns=A")]) - - -def test_table_index_incompatible_dtypes(setup_path): - df1 = DataFrame({"a": [1, 2, 3]}) - df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3)) - - with ensure_clean_store(setup_path) as store: - store.put("frame", df1, format="table") - msg = re.escape("incompatible kind in col [integer - datetime64]") - with pytest.raises(TypeError, match=msg): - store.put("frame", df2, format="table", append=True) - - -def test_unimplemented_dtypes_table_columns(setup_path): - - with ensure_clean_store(setup_path) as store: - - dtypes = [("date", datetime.date(2001, 1, 2))] - - # currently not supported dtypes #### - for n, f in dtypes: - df = tm.makeDataFrame() - df[n] = f - msg = re.escape(f"[{n}] is not implemented as a table column") - with pytest.raises(TypeError, match=msg): - store.append(f"df1_{n}", df) - - # frame - df = tm.makeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["datetime1"] = datetime.date(2001, 1, 2) - df = df._consolidate() - - with ensure_clean_store(setup_path) as store: - # this fails because we have a date in the object block...... - msg = re.escape( - """Cannot serialize the column [datetime1] -because its data contents are not [string] but [date] object dtype""" - ) - with pytest.raises(TypeError, match=msg): - store.append("df_unimplemented", df) - - -def test_invalid_terms(tmp_path, setup_path): - - with ensure_clean_store(setup_path) as store: - - with catch_warnings(record=True): - - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df.loc[df.index[0:4], "string"] = "bar" - - store.put("df", df, format="table") - - # some invalid terms - msg = re.escape( - "__init__() missing 1 required positional argument: 'where'" - ) - with pytest.raises(TypeError, match=msg): - Term() - - # more invalid - msg = re.escape( - "cannot process expression [df.index[3]], " - "[2000-01-06 00:00:00] is not a valid condition" - ) - with pytest.raises(ValueError, match=msg): - store.select("df", "df.index[3]") - - msg = "invalid syntax" - with pytest.raises(SyntaxError, match=msg): - store.select("df", "index>") - - # from the docs - path = tmp_path / setup_path - dfq = DataFrame( - np.random.randn(10, 4), - columns=list("ABCD"), - index=date_range("20130101", periods=10), - ) - dfq.to_hdf(path, "dfq", format="table", data_columns=True) - - # check ok - read_hdf(path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']") - read_hdf(path, "dfq", where="A>0 or C>0") - - # catch the invalid reference - path = tmp_path / setup_path - dfq = DataFrame( - np.random.randn(10, 4), - columns=list("ABCD"), - index=date_range("20130101", periods=10), - ) - dfq.to_hdf(path, "dfq", format="table") - - msg = ( - r"The passed where expression: A>0 or C>0\n\s*" - r"contains an invalid variable reference\n\s*" - r"all of the variable references must be a reference to\n\s*" - r"an axis \(e.g. 
'index' or 'columns'\), or a data_column\n\s*" - r"The currently defined references are: index,columns\n" - ) - with pytest.raises(ValueError, match=msg): - read_hdf(path, "dfq", where="A>0 or C>0") - - -def test_append_with_diff_col_name_types_raises_value_error(setup_path): - df = DataFrame(np.random.randn(10, 1)) - df2 = DataFrame({"a": np.random.randn(10)}) - df3 = DataFrame({(1, 2): np.random.randn(10)}) - df4 = DataFrame({("1", 2): np.random.randn(10)}) - df5 = DataFrame({("1", 2, object): np.random.randn(10)}) - - with ensure_clean_store(setup_path) as store: - name = f"df_{tm.rands(10)}" - store.append(name, df) - - for d in (df2, df3, df4, df5): - msg = re.escape( - "cannot match existing table structure for [0] on appending data" - ) - with pytest.raises(ValueError, match=msg): - store.append(name, d) - - -def test_invalid_complib(setup_path): - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - with tm.ensure_clean(setup_path) as path: - msg = r"complib only supports \[.*\] compression." - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", complib="foolib") - - -@pytest.mark.parametrize( - "idx", - [ - date_range("2019", freq="D", periods=3, tz="UTC"), - CategoricalIndex(list("abc")), - ], -) -def test_to_hdf_multiindex_extension_dtype(idx, tmp_path, setup_path): - # GH 7775 - mi = MultiIndex.from_arrays([idx, idx]) - df = DataFrame(0, index=mi, columns=["a"]) - path = tmp_path / setup_path - with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): - df.to_hdf(path, "df") - - -def test_unsuppored_hdf_file_error(datapath): - # GH 9539 - data_path = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5") - message = ( - r"Dataset\(s\) incompatible with Pandas data types, " - "not table, or no datasets found in HDF5 file." - ) - - with pytest.raises(ValueError, match=message): - read_hdf(data_path) - - -def test_read_hdf_errors(setup_path, tmp_path): - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - - path = tmp_path / setup_path - msg = r"File [\S]* does not exist" - with pytest.raises(OSError, match=msg): - read_hdf(path, "key") - - df.to_hdf(path, "df") - store = HDFStore(path, mode="r") - store.close() - - msg = "The HDFStore must be open for reading." - with pytest.raises(OSError, match=msg): - read_hdf(store, "df") - - -def test_read_hdf_generic_buffer_errors(): - msg = "Support for generic buffers has not been implemented." 
- with pytest.raises(NotImplementedError, match=msg): - read_hdf(BytesIO(b""), "df") - - -@pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"]) -def test_maybe_adjust_name_bad_version_raises(bad_version): - msg = "Version is incorrect, expected sequence of 3 integers" - with pytest.raises(ValueError, match=msg): - _maybe_adjust_name("values_block_0", version=bad_version) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py deleted file mode 100644 index 19a92163c6dd2..0000000000000 --- a/pandas/tests/io/pytables/test_file_handling.py +++ /dev/null @@ -1,446 +0,0 @@ -import os - -import numpy as np -import pytest - -from pandas.compat import is_platform_little_endian -from pandas.errors import ( - ClosedFileError, - PossibleDataLossError, -) - -from pandas import ( - DataFrame, - HDFStore, - Series, - _testing as tm, - read_hdf, -) -from pandas.tests.io.pytables.common import ( - _maybe_remove, - ensure_clean_store, - tables, -) - -from pandas.io import pytables -from pandas.io.pytables import Term - -pytestmark = pytest.mark.single_cpu - - -@pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) -def test_mode(setup_path, tmp_path, mode): - - df = tm.makeTimeDataFrame() - msg = r"[\S]* does not exist" - path = tmp_path / setup_path - - # constructor - if mode in ["r", "r+"]: - with pytest.raises(OSError, match=msg): - HDFStore(path, mode=mode) - - else: - with HDFStore(path, mode=mode) as store: - assert store._handle.mode == mode - - path = tmp_path / setup_path - - # context - if mode in ["r", "r+"]: - with pytest.raises(OSError, match=msg): - with HDFStore(path, mode=mode) as store: - pass - else: - with HDFStore(path, mode=mode) as store: - assert store._handle.mode == mode - - path = tmp_path / setup_path - - # conv write - if mode in ["r", "r+"]: - with pytest.raises(OSError, match=msg): - df.to_hdf(path, "df", mode=mode) - df.to_hdf(path, "df", mode="w") - else: - df.to_hdf(path, "df", mode=mode) - - # conv read - if mode in ["w"]: - msg = ( - "mode w is not allowed while performing a read. " - r"Allowed modes are r, r\+ and a." - ) - with pytest.raises(ValueError, match=msg): - read_hdf(path, "df", mode=mode) - else: - result = read_hdf(path, "df", mode=mode) - tm.assert_frame_equal(result, df) - - -def test_default_mode(tmp_path, setup_path): - # read_hdf uses default mode - df = tm.makeTimeDataFrame() - path = tmp_path / setup_path - df.to_hdf(path, "df", mode="w") - result = read_hdf(path, "df") - tm.assert_frame_equal(result, df) - - -def test_reopen_handle(tmp_path, setup_path): - - path = tmp_path / setup_path - - store = HDFStore(path, mode="a") - store["a"] = tm.makeTimeSeries() - - msg = ( - r"Re-opening the file \[[\S]*\] with mode \[a\] will delete the " - "current file!" 
- ) - # invalid mode change - with pytest.raises(PossibleDataLossError, match=msg): - store.open("w") - - store.close() - assert not store.is_open - - # truncation ok here - store.open("w") - assert store.is_open - assert len(store) == 0 - store.close() - assert not store.is_open - - store = HDFStore(path, mode="a") - store["a"] = tm.makeTimeSeries() - - # reopen as read - store.open("r") - assert store.is_open - assert len(store) == 1 - assert store._mode == "r" - store.close() - assert not store.is_open - - # reopen as append - store.open("a") - assert store.is_open - assert len(store) == 1 - assert store._mode == "a" - store.close() - assert not store.is_open - - # reopen as append (again) - store.open("a") - assert store.is_open - assert len(store) == 1 - assert store._mode == "a" - store.close() - assert not store.is_open - - -def test_open_args(setup_path): - - with tm.ensure_clean(setup_path) as path: - - df = tm.makeDataFrame() - - # create an in memory store - store = HDFStore( - path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 - ) - store["df"] = df - store.append("df2", df) - - tm.assert_frame_equal(store["df"], df) - tm.assert_frame_equal(store["df2"], df) - - store.close() - - # the file should not have actually been written - assert not os.path.exists(path) - - -def test_flush(setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store.flush() - store.flush(fsync=True) - - -def test_complibs_default_settings(tmp_path, setup_path): - # GH15943 - df = tm.makeDataFrame() - - # Set complevel and check if complib is automatically set to - # default value - tmpfile = tmp_path / setup_path - df.to_hdf(tmpfile, "df", complevel=9) - result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 9 - assert node.filters.complib == "zlib" - - # Set complib and check to see if compression is disabled - tmpfile = tmp_path / setup_path - df.to_hdf(tmpfile, "df", complib="zlib") - result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 0 - assert node.filters.complib is None - - # Check if not setting complib or complevel results in no compression - tmpfile = tmp_path / setup_path - df.to_hdf(tmpfile, "df") - result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 0 - assert node.filters.complib is None - - -def test_complibs_default_settings_override(tmp_path, setup_path): - # Check if file-defaults can be overridden on a per table basis - df = tm.makeDataFrame() - tmpfile = tmp_path / setup_path - store = HDFStore(tmpfile) - store.append("dfc", df, complevel=9, complib="blosc") - store.append("df", df) - store.close() - - with tables.open_file(tmpfile, mode="r") as h5file: - for node in h5file.walk_nodes(where="/df", classname="Leaf"): - assert node.filters.complevel == 0 - assert node.filters.complib is None - for node in h5file.walk_nodes(where="/dfc", classname="Leaf"): - assert node.filters.complevel == 9 - assert node.filters.complib == "blosc" - - -def test_complibs(tmp_path, setup_path): - # GH14478 - df = tm.makeDataFrame() - - # 
Building list of all complibs and complevels tuples - all_complibs = tables.filters.all_complibs - # Remove lzo if its not available on this platform - if not tables.which_lib_version("lzo"): - all_complibs.remove("lzo") - # Remove bzip2 if its not available on this platform - if not tables.which_lib_version("bzip2"): - all_complibs.remove("bzip2") - - all_levels = range(0, 10) - all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels] - - for (lib, lvl) in all_tests: - tmpfile = tmp_path / setup_path - gname = "foo" - - # Write and read file to see if data is consistent - df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) - result = read_hdf(tmpfile, gname) - tm.assert_frame_equal(result, df) - - # Open file and check metadata for correct amount of compression - with tables.open_file(tmpfile, mode="r") as h5table: - for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"): - assert node.filters.complevel == lvl - if lvl == 0: - assert node.filters.complib is None - else: - assert node.filters.complib == lib - - -@pytest.mark.skipif( - not is_platform_little_endian(), reason="reason platform is not little endian" -) -def test_encoding(setup_path): - - with ensure_clean_store(setup_path) as store: - df = DataFrame({"A": "foo", "B": "bar"}, index=range(5)) - df.loc[2, "A"] = np.nan - df.loc[3, "B"] = np.nan - _maybe_remove(store, "df") - store.append("df", df, encoding="ascii") - tm.assert_frame_equal(store["df"], df) - - expected = df.reindex(columns=["A"]) - result = store.select("df", Term("columns=A", encoding="ascii")) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "val", - [ - [b"E\xc9, 17", b"", b"a", b"b", b"c"], - [b"E\xc9, 17", b"a", b"b", b"c"], - [b"EE, 17", b"", b"a", b"b", b"c"], - [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], - [b"", b"a", b"b", b"c"], - [b"\xf8\xfc", b"a", b"b", b"c"], - [b"A\xf8\xfc", b"", b"a", b"b", b"c"], - [np.nan, b"", b"b", b"c"], - [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], - ], -) -@pytest.mark.parametrize("dtype", ["category", object]) -def test_latin_encoding(tmp_path, setup_path, dtype, val): - enc = "latin-1" - nan_rep = "" - key = "data" - - val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] - ser = Series(val, dtype=dtype) - - store = tmp_path / setup_path - ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) - retr = read_hdf(store, key) - - s_nan = ser.replace(nan_rep, np.nan) - - tm.assert_series_equal(s_nan, retr) - - -def test_multiple_open_close(tmp_path, setup_path): - # gh-4409: open & close multiple times - - path = tmp_path / setup_path - - df = tm.makeDataFrame() - df.to_hdf(path, "df", mode="w", format="table") - - # single - store = HDFStore(path) - assert "CLOSED" not in store.info() - assert store.is_open - - store.close() - assert "CLOSED" in store.info() - assert not store.is_open - - path = tmp_path / setup_path - - if pytables._table_file_open_policy_is_strict: - # multiples - store1 = HDFStore(path) - msg = ( - r"The file [\S]* is already opened\. Please close it before " - r"reopening in write mode\." 
- ) - with pytest.raises(ValueError, match=msg): - HDFStore(path) - - store1.close() - else: - - # multiples - store1 = HDFStore(path) - store2 = HDFStore(path) - - assert "CLOSED" not in store1.info() - assert "CLOSED" not in store2.info() - assert store1.is_open - assert store2.is_open - - store1.close() - assert "CLOSED" in store1.info() - assert not store1.is_open - assert "CLOSED" not in store2.info() - assert store2.is_open - - store2.close() - assert "CLOSED" in store1.info() - assert "CLOSED" in store2.info() - assert not store1.is_open - assert not store2.is_open - - # nested close - store = HDFStore(path, mode="w") - store.append("df", df) - - store2 = HDFStore(path) - store2.append("df2", df) - store2.close() - assert "CLOSED" in store2.info() - assert not store2.is_open - - store.close() - assert "CLOSED" in store.info() - assert not store.is_open - - # double closing - store = HDFStore(path, mode="w") - store.append("df", df) - - store2 = HDFStore(path) - store.close() - assert "CLOSED" in store.info() - assert not store.is_open - - store2.close() - assert "CLOSED" in store2.info() - assert not store2.is_open - - # ops on a closed store - path = tmp_path / setup_path - - df = tm.makeDataFrame() - df.to_hdf(path, "df", mode="w", format="table") - - store = HDFStore(path) - store.close() - - msg = r"[\S]* file is not open!" - with pytest.raises(ClosedFileError, match=msg): - store.keys() - - with pytest.raises(ClosedFileError, match=msg): - "df" in store - - with pytest.raises(ClosedFileError, match=msg): - len(store) - - with pytest.raises(ClosedFileError, match=msg): - store["df"] - - with pytest.raises(ClosedFileError, match=msg): - store.select("df") - - with pytest.raises(ClosedFileError, match=msg): - store.get("df") - - with pytest.raises(ClosedFileError, match=msg): - store.append("df2", df) - - with pytest.raises(ClosedFileError, match=msg): - store.put("df3", df) - - with pytest.raises(ClosedFileError, match=msg): - store.get_storer("df2") - - with pytest.raises(ClosedFileError, match=msg): - store.remove("df2") - - with pytest.raises(ClosedFileError, match=msg): - store.select("df") - - msg = "'HDFStore' object has no attribute 'df'" - with pytest.raises(AttributeError, match=msg): - store.df - - -def test_fspath(): - with tm.ensure_clean("foo.h5") as path: - with HDFStore(path) as store: - assert os.fspath(store) == str(path) diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py deleted file mode 100644 index dff7e2144d3c2..0000000000000 --- a/pandas/tests/io/pytables/test_keys.py +++ /dev/null @@ -1,79 +0,0 @@ -import pytest - -from pandas import ( - DataFrame, - HDFStore, - _testing as tm, -) -from pandas.tests.io.pytables.common import ( - ensure_clean_store, - tables, -) - -pytestmark = pytest.mark.single_cpu - - -def test_keys(setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeStringSeries() - store["c"] = tm.makeDataFrame() - - assert len(store) == 3 - expected = {"/a", "/b", "/c"} - assert set(store.keys()) == expected - assert set(store) == expected - - -def test_non_pandas_keys(tmp_path, setup_path): - class Table1(tables.IsDescription): - value1 = tables.Float32Col() - - class Table2(tables.IsDescription): - value2 = tables.Float32Col() - - class Table3(tables.IsDescription): - value3 = tables.Float32Col() - - path = tmp_path / setup_path - with tables.open_file(path, mode="w") as h5file: - group = h5file.create_group("/", "group") - 
h5file.create_table(group, "table1", Table1, "Table 1") - h5file.create_table(group, "table2", Table2, "Table 2") - h5file.create_table(group, "table3", Table3, "Table 3") - with HDFStore(path) as store: - assert len(store.keys(include="native")) == 3 - expected = {"/group/table1", "/group/table2", "/group/table3"} - assert set(store.keys(include="native")) == expected - assert set(store.keys(include="pandas")) == set() - for name in expected: - df = store.get(name) - assert len(df.columns) == 1 - - -def test_keys_illegal_include_keyword_value(setup_path): - with ensure_clean_store(setup_path) as store: - with pytest.raises( - ValueError, - match="`include` should be either 'pandas' or 'native' but is 'illegal'", - ): - store.keys(include="illegal") - - -def test_keys_ignore_hdf_softlink(setup_path): - - # GH 20523 - # Puts a softlink into HDF file and rereads - - with ensure_clean_store(setup_path) as store: - - df = DataFrame({"A": range(5), "B": range(5)}) - store.put("df", df) - - assert store.keys() == ["/df"] - - store._handle.create_soft_link(store._handle.root, "symlink", "df") - - # Should ignore the softlink - assert store.keys() == ["/df"] diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py deleted file mode 100644 index 8cff9e65ce23b..0000000000000 --- a/pandas/tests/io/pytables/test_put.py +++ /dev/null @@ -1,372 +0,0 @@ -import datetime -import re -from warnings import ( - catch_warnings, - simplefilter, -) - -import numpy as np -import pytest - -from pandas._libs.tslibs import Timestamp - -import pandas as pd -from pandas import ( - DataFrame, - HDFStore, - Index, - MultiIndex, - Series, - _testing as tm, - concat, -) -from pandas.tests.io.pytables.common import ( - _maybe_remove, - ensure_clean_store, -) -from pandas.util import _test_decorators as td - -pytestmark = pytest.mark.single_cpu - - -def test_format_type(tmp_path, setup_path): - df = DataFrame({"A": [1, 2]}) - with HDFStore(tmp_path / setup_path) as store: - store.put("a", df, format="fixed") - store.put("b", df, format="table") - - assert store.get_storer("a").format_type == "fixed" - assert store.get_storer("b").format_type == "table" - - -def test_format_kwarg_in_constructor(tmp_path, setup_path): - # GH 13291 - - msg = "format is not a defined argument for HDFStore" - - with pytest.raises(ValueError, match=msg): - HDFStore(tmp_path / setup_path, format="table") - - -def test_api_default_format(tmp_path, setup_path): - - # default_format option - with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() - - with pd.option_context("io.hdf.default_format", "fixed"): - _maybe_remove(store, "df") - store.put("df", df) - assert not store.get_storer("df").is_table - - msg = "Can only append to Tables" - with pytest.raises(ValueError, match=msg): - store.append("df2", df) - - with pd.option_context("io.hdf.default_format", "table"): - _maybe_remove(store, "df") - store.put("df", df) - assert store.get_storer("df").is_table - - _maybe_remove(store, "df2") - store.append("df2", df) - assert store.get_storer("df").is_table - - path = tmp_path / setup_path - df = tm.makeDataFrame() - - with pd.option_context("io.hdf.default_format", "fixed"): - df.to_hdf(path, "df") - with HDFStore(path) as store: - assert not store.get_storer("df").is_table - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df2", append=True) - - with pd.option_context("io.hdf.default_format", "table"): - df.to_hdf(path, "df3") - with HDFStore(path) as store: - assert 
store.get_storer("df3").is_table - df.to_hdf(path, "df4", append=True) - with HDFStore(path) as store: - assert store.get_storer("df4").is_table - - -def test_put(setup_path): - - with ensure_clean_store(setup_path) as store: - - ts = tm.makeTimeSeries() - df = tm.makeTimeDataFrame() - store["a"] = ts - store["b"] = df[:10] - store["foo/bar/bah"] = df[:10] - store["foo"] = df[:10] - store["/foo"] = df[:10] - store.put("c", df[:10], format="table") - - # not OK, not a table - msg = "Can only append to Tables" - with pytest.raises(ValueError, match=msg): - store.put("b", df[10:], append=True) - - # node does not currently exist, test _is_table_type returns False - # in this case - _maybe_remove(store, "f") - with pytest.raises(ValueError, match=msg): - store.put("f", df[10:], append=True) - - # can't put to a table (use append instead) - with pytest.raises(ValueError, match=msg): - store.put("c", df[10:], append=True) - - # overwrite table - store.put("c", df[:10], format="table", append=False) - tm.assert_frame_equal(df[:10], store["c"]) - - -def test_put_string_index(setup_path): - - with ensure_clean_store(setup_path) as store: - - index = Index([f"I am a very long string index: {i}" for i in range(20)]) - s = Series(np.arange(20), index=index) - df = DataFrame({"A": s, "B": s}) - - store["a"] = s - tm.assert_series_equal(store["a"], s) - - store["b"] = df - tm.assert_frame_equal(store["b"], df) - - # mixed length - index = Index( - ["abcdefghijklmnopqrstuvwxyz1234567890"] - + [f"I am a very long string index: {i}" for i in range(20)] - ) - s = Series(np.arange(21), index=index) - df = DataFrame({"A": s, "B": s}) - store["a"] = s - tm.assert_series_equal(store["a"], s) - - store["b"] = df - tm.assert_frame_equal(store["b"], df) - - -def test_put_compression(setup_path): - - with ensure_clean_store(setup_path) as store: - df = tm.makeTimeDataFrame() - - store.put("c", df, format="table", complib="zlib") - tm.assert_frame_equal(store["c"], df) - - # can't compress if format='fixed' - msg = "Compression not supported on Fixed format stores" - with pytest.raises(ValueError, match=msg): - store.put("b", df, format="fixed", complib="zlib") - - -@td.skip_if_windows -def test_put_compression_blosc(setup_path): - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - - # can't compress if format='fixed' - msg = "Compression not supported on Fixed format stores" - with pytest.raises(ValueError, match=msg): - store.put("b", df, format="fixed", complib="blosc") - - store.put("c", df, format="table", complib="blosc") - tm.assert_frame_equal(store["c"], df) - - -def test_put_mixed_type(setup_path): - df = tm.makeTimeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["bool1"] = df["A"] > 0 - df["bool2"] = df["B"] > 0 - df["bool3"] = True - df["int1"] = 1 - df["int2"] = 2 - df["timestamp1"] = Timestamp("20010102") - df["timestamp2"] = Timestamp("20010103") - df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) - df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[df.index[3:6], ["obj1"]] = np.nan - df = df._consolidate() - - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df") - - # PerformanceWarning - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - store.put("df", df) - - expected = store.get("df") - tm.assert_frame_equal(expected, df) - - -@pytest.mark.parametrize( - "format, index", - [ - ["table", tm.makeFloatIndex], - ["table", tm.makeStringIndex], - ["table", tm.makeIntIndex], - ["table", 
tm.makeDateIndex], - ["fixed", tm.makeFloatIndex], - ["fixed", tm.makeStringIndex], - ["fixed", tm.makeIntIndex], - ["fixed", tm.makeDateIndex], - ["table", tm.makePeriodIndex], # GH#7796 - ["fixed", tm.makePeriodIndex], - ], -) -def test_store_index_types(setup_path, format, index): - # GH5386 - # test storing various index types - - with ensure_clean_store(setup_path) as store: - - df = DataFrame(np.random.randn(10, 2), columns=list("AB")) - df.index = index(len(df)) - - _maybe_remove(store, "df") - store.put("df", df, format=format) - tm.assert_frame_equal(df, store["df"]) - - -def test_column_multiindex(setup_path): - # GH 4710 - # recreate multi-indexes properly - - index = MultiIndex.from_tuples( - [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"] - ) - df = DataFrame(np.arange(12).reshape(3, 4), columns=index) - expected = df.set_axis(df.index.to_numpy()) - - with ensure_clean_store(setup_path) as store: - - store.put("df", df) - tm.assert_frame_equal( - store["df"], expected, check_index_type=True, check_column_type=True - ) - - store.put("df1", df, format="table") - tm.assert_frame_equal( - store["df1"], expected, check_index_type=True, check_column_type=True - ) - - msg = re.escape("cannot use a multi-index on axis [1] with data_columns ['A']") - with pytest.raises(ValueError, match=msg): - store.put("df2", df, format="table", data_columns=["A"]) - msg = re.escape("cannot use a multi-index on axis [1] with data_columns True") - with pytest.raises(ValueError, match=msg): - store.put("df3", df, format="table", data_columns=True) - - # appending multi-column on existing table (see GH 6167) - with ensure_clean_store(setup_path) as store: - store.append("df2", df) - store.append("df2", df) - - tm.assert_frame_equal(store["df2"], concat((df, df))) - - # non_index_axes name - df = DataFrame(np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo")) - expected = df.set_axis(df.index.to_numpy()) - - with ensure_clean_store(setup_path) as store: - - store.put("df1", df, format="table") - tm.assert_frame_equal( - store["df1"], expected, check_index_type=True, check_column_type=True - ) - - -def test_store_multiindex(setup_path): - - # validate multi-index names - # GH 5527 - with ensure_clean_store(setup_path) as store: - - def make_index(names=None): - return MultiIndex.from_tuples( - [ - (datetime.datetime(2013, 12, d), s, t) - for d in range(1, 3) - for s in range(2) - for t in range(3) - ], - names=names, - ) - - # no names - _maybe_remove(store, "df") - df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index()) - store.append("df", df) - tm.assert_frame_equal(store.select("df"), df) - - # partial names - _maybe_remove(store, "df") - df = DataFrame( - np.zeros((12, 2)), - columns=["a", "b"], - index=make_index(["date", None, None]), - ) - store.append("df", df) - tm.assert_frame_equal(store.select("df"), df) - - # series - _maybe_remove(store, "s") - s = Series(np.zeros(12), index=make_index(["date", None, None])) - store.append("s", s) - xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) - tm.assert_series_equal(store.select("s"), xp) - - # dup with column - _maybe_remove(store, "df") - df = DataFrame( - np.zeros((12, 2)), - columns=["a", "b"], - index=make_index(["date", "a", "t"]), - ) - msg = "duplicate names/columns in the multi-index when storing as a table" - with pytest.raises(ValueError, match=msg): - store.append("df", df) - - # dup within level - _maybe_remove(store, "df") - df = DataFrame( - 
np.zeros((12, 2)), - columns=["a", "b"], - index=make_index(["date", "date", "date"]), - ) - with pytest.raises(ValueError, match=msg): - store.append("df", df) - - # fully names - _maybe_remove(store, "df") - df = DataFrame( - np.zeros((12, 2)), - columns=["a", "b"], - index=make_index(["date", "s", "t"]), - ) - store.append("df", df) - tm.assert_frame_equal(store.select("df"), df) - - -@pytest.mark.parametrize("format", ["fixed", "table"]) -def test_store_periodindex(tmp_path, setup_path, format): - # GH 7796 - # test of PeriodIndex in HDFStore - df = DataFrame( - np.random.randn(5, 1), index=pd.period_range("20220101", freq="M", periods=5) - ) - - path = tmp_path / setup_path - df.to_hdf(path, "df", mode="w", format=format) - expected = pd.read_hdf(path, "df") - tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/pytables/test_pytables_missing.py b/pandas/tests/io/pytables/test_pytables_missing.py deleted file mode 100644 index 9adb0a6d227da..0000000000000 --- a/pandas/tests/io/pytables/test_pytables_missing.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -import pandas.util._test_decorators as td - -import pandas as pd -import pandas._testing as tm - - -@td.skip_if_installed("tables") -def test_pytables_raises(): - df = pd.DataFrame({"A": [1, 2]}) - with pytest.raises(ImportError, match="tables"): - with tm.ensure_clean("foo.h5") as path: - df.to_hdf(path, "df") diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py deleted file mode 100644 index 6d92c15f1ea10..0000000000000 --- a/pandas/tests/io/pytables/test_read.py +++ /dev/null @@ -1,344 +0,0 @@ -from contextlib import closing -from pathlib import Path -import re - -import numpy as np -import pytest - -from pandas._libs.tslibs import Timestamp -from pandas.compat import is_platform_windows - -import pandas as pd -from pandas import ( - DataFrame, - HDFStore, - Index, - Series, - _testing as tm, - read_hdf, -) -from pandas.tests.io.pytables.common import ( - _maybe_remove, - ensure_clean_store, -) -from pandas.util import _test_decorators as td - -from pandas.io.pytables import TableIterator - -pytestmark = pytest.mark.single_cpu - - -def test_read_missing_key_close_store(tmp_path, setup_path): - # GH 25766 - path = tmp_path / setup_path - df = DataFrame({"a": range(2), "b": range(2)}) - df.to_hdf(path, "k1") - - with pytest.raises(KeyError, match="'No object named k2 in the file'"): - read_hdf(path, "k2") - - # smoke test to test that file is properly closed after - # read with KeyError before another write - df.to_hdf(path, "k2") - - -def test_read_missing_key_opened_store(tmp_path, setup_path): - # GH 28699 - path = tmp_path / setup_path - df = DataFrame({"a": range(2), "b": range(2)}) - df.to_hdf(path, "k1") - - with HDFStore(path, "r") as store: - - with pytest.raises(KeyError, match="'No object named k2 in the file'"): - read_hdf(store, "k2") - - # Test that the file is still open after a KeyError and that we can - # still read from it. 
- read_hdf(store, "k1") - - -def test_read_column(setup_path): - - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "df") - - # GH 17912 - # HDFStore.select_column should raise a KeyError - # exception if the key is not a valid store - with pytest.raises(KeyError, match="No object named df in the file"): - store.select_column("df", "index") - - store.append("df", df) - # error - with pytest.raises( - KeyError, match=re.escape("'column [foo] not found in the table'") - ): - store.select_column("df", "foo") - - msg = re.escape("select_column() got an unexpected keyword argument 'where'") - with pytest.raises(TypeError, match=msg): - store.select_column("df", "index", where=["index>5"]) - - # valid - result = store.select_column("df", "index") - tm.assert_almost_equal(result.values, Series(df.index).values) - assert isinstance(result, Series) - - # not a data indexable column - msg = re.escape( - "column [values_block_0] can not be extracted individually; " - "it is not data indexable" - ) - with pytest.raises(ValueError, match=msg): - store.select_column("df", "values_block_0") - - # a data column - df2 = df.copy() - df2["string"] = "foo" - store.append("df2", df2, data_columns=["string"]) - result = store.select_column("df2", "string") - tm.assert_almost_equal(result.values, df2["string"].values) - - # a data column with NaNs, result excludes the NaNs - df3 = df.copy() - df3["string"] = "foo" - df3.loc[df3.index[4:6], "string"] = np.nan - store.append("df3", df3, data_columns=["string"]) - result = store.select_column("df3", "string") - tm.assert_almost_equal(result.values, df3["string"].values) - - # start/stop - result = store.select_column("df3", "string", start=2) - tm.assert_almost_equal(result.values, df3["string"].values[2:]) - - result = store.select_column("df3", "string", start=-2) - tm.assert_almost_equal(result.values, df3["string"].values[-2:]) - - result = store.select_column("df3", "string", stop=2) - tm.assert_almost_equal(result.values, df3["string"].values[:2]) - - result = store.select_column("df3", "string", stop=-2) - tm.assert_almost_equal(result.values, df3["string"].values[:-2]) - - result = store.select_column("df3", "string", start=2, stop=-2) - tm.assert_almost_equal(result.values, df3["string"].values[2:-2]) - - result = store.select_column("df3", "string", start=-2, stop=2) - tm.assert_almost_equal(result.values, df3["string"].values[-2:2]) - - # GH 10392 - make sure column name is preserved - df4 = DataFrame({"A": np.random.randn(10), "B": "foo"}) - store.append("df4", df4, data_columns=True) - expected = df4["B"] - result = store.select_column("df4", "B") - tm.assert_series_equal(result, expected) - - -def test_pytables_native_read(datapath): - with ensure_clean_store( - datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" - ) as store: - d2 = store["detector/readout"] - assert isinstance(d2, DataFrame) - - -@pytest.mark.skipif(is_platform_windows(), reason="native2 read fails oddly on windows") -def test_pytables_native2_read(datapath): - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r" - ) as store: - str(store) - d1 = store["detector"] - assert isinstance(d1, DataFrame) - - -def test_legacy_table_fixed_format_read_py2(datapath): - # GH 24510 - # legacy table with fixed format written in Python 2 - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" - ) as store: - result = store.select("df") - 
expected = DataFrame( - [[1, 2, 3, "D"]], - columns=["A", "B", "C", "D"], - index=Index(["ABC"], name="INDEX_NAME"), - ) - tm.assert_frame_equal(expected, result) - - -def test_legacy_table_fixed_format_read_datetime_py2(datapath): - # GH 31750 - # legacy table with fixed format and datetime64 column written in Python 2 - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"), - mode="r", - ) as store: - result = store.select("df") - expected = DataFrame( - [[Timestamp("2020-02-06T18:00")]], - columns=["A"], - index=Index(["date"]), - ) - tm.assert_frame_equal(expected, result) - - -def test_legacy_table_read_py2(datapath): - # issue: 24925 - # legacy table written in Python 2 - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r" - ) as store: - result = store.select("table") - - expected = DataFrame({"a": ["a", "b"], "b": [2, 3]}) - tm.assert_frame_equal(expected, result) - - -def test_read_hdf_open_store(tmp_path, setup_path): - # GH10330 - # No check for non-string path_or-buf, and no test of open store - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - df.index.name = "letters" - df = df.set_index(keys="E", append=True) - - path = tmp_path / setup_path - df.to_hdf(path, "df", mode="w") - direct = read_hdf(path, "df") - with HDFStore(path, mode="r") as store: - indirect = read_hdf(store, "df") - tm.assert_frame_equal(direct, indirect) - assert store.is_open - - -def test_read_hdf_iterator(tmp_path, setup_path): - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - df.index.name = "letters" - df = df.set_index(keys="E", append=True) - - path = tmp_path / setup_path - df.to_hdf(path, "df", mode="w", format="t") - direct = read_hdf(path, "df") - iterator = read_hdf(path, "df", iterator=True) - with closing(iterator.store): - assert isinstance(iterator, TableIterator) - indirect = next(iterator.__iter__()) - tm.assert_frame_equal(direct, indirect) - - -def test_read_nokey(tmp_path, setup_path): - # GH10443 - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - - # Categorical dtype not supported for "fixed" format. So no need - # to test with that dtype in the dataframe here. - path = tmp_path / setup_path - df.to_hdf(path, "df", mode="a") - reread = read_hdf(path) - tm.assert_frame_equal(df, reread) - df.to_hdf(path, "df2", mode="a") - - msg = "key must be provided when HDF5 file contains multiple datasets." - with pytest.raises(ValueError, match=msg): - read_hdf(path) - - -def test_read_nokey_table(tmp_path, setup_path): - # GH13231 - df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")}) - - path = tmp_path / setup_path - df.to_hdf(path, "df", mode="a", format="table") - reread = read_hdf(path) - tm.assert_frame_equal(df, reread) - df.to_hdf(path, "df2", mode="a", format="table") - - msg = "key must be provided when HDF5 file contains multiple datasets." - with pytest.raises(ValueError, match=msg): - read_hdf(path) - - -def test_read_nokey_empty(tmp_path, setup_path): - path = tmp_path / setup_path - store = HDFStore(path) - store.close() - msg = re.escape( - "Dataset(s) incompatible with Pandas data types, not table, or no " - "datasets found in HDF5 file." 
- ) - with pytest.raises(ValueError, match=msg): - read_hdf(path) - - -def test_read_from_pathlib_path(tmp_path, setup_path): - - # GH11773 - expected = DataFrame( - np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") - ) - filename = tmp_path / setup_path - path_obj = Path(filename) - - expected.to_hdf(path_obj, "df", mode="a") - actual = read_hdf(path_obj, "df") - - tm.assert_frame_equal(expected, actual) - - -@td.skip_if_no("py.path") -def test_read_from_py_localpath(tmp_path, setup_path): - - # GH11773 - from py.path import local as LocalPath - - expected = DataFrame( - np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") - ) - filename = tmp_path / setup_path - path_obj = LocalPath(filename) - - expected.to_hdf(path_obj, "df", mode="a") - actual = read_hdf(path_obj, "df") - - tm.assert_frame_equal(expected, actual) - - -@pytest.mark.parametrize("format", ["fixed", "table"]) -def test_read_hdf_series_mode_r(tmp_path, format, setup_path): - # GH 16583 - # Tests that reading a Series saved to an HDF file - # still works if a mode='r' argument is supplied - series = tm.makeFloatSeries() - path = tmp_path / setup_path - series.to_hdf(path, key="data", format=format) - result = read_hdf(path, key="data", mode="r") - tm.assert_series_equal(result, series) - - -def test_read_py2_hdf_file_in_py3(datapath): - # GH 16781 - - # tests reading a PeriodIndex DataFrame written in Python2 in Python3 - - # the file was generated in Python 2.7 like so: - # - # df = DataFrame([1.,2,3], index=pd.PeriodIndex( - # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) - # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') - - expected = DataFrame( - [1.0, 2, 3], - index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), - ) - - with ensure_clean_store( - datapath( - "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" - ), - mode="r", - ) as store: - result = store["p"] - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_retain_attributes.py b/pandas/tests/io/pytables/test_retain_attributes.py deleted file mode 100644 index 3043cd3604e58..0000000000000 --- a/pandas/tests/io/pytables/test_retain_attributes.py +++ /dev/null @@ -1,105 +0,0 @@ -from warnings import catch_warnings - -import pytest - -from pandas._libs.tslibs import Timestamp - -from pandas import ( - DataFrame, - Series, - _testing as tm, - date_range, - read_hdf, -) -from pandas.tests.io.pytables.common import ( - _maybe_remove, - ensure_clean_store, -) - -pytestmark = pytest.mark.single_cpu - - -def test_retain_index_attributes(setup_path): - - # GH 3499, losing frequency info on index recreation - df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} - ) - - with ensure_clean_store(setup_path) as store: - _maybe_remove(store, "data") - store.put("data", df, format="table") - - result = store.get("data") - tm.assert_frame_equal(df, result) - - for attr in ["freq", "tz", "name"]: - for idx in ["index", "columns"]: - assert getattr(getattr(df, idx), attr, None) == getattr( - getattr(result, idx), attr, None - ) - - # try to append a table with a different frequency - with catch_warnings(record=True): - df2 = DataFrame( - { - "A": Series( - range(3), index=date_range("2002-1-1", periods=3, freq="D") - ) - } - ) - store.append("data", df2) - - assert store.get_storer("data").info["index"]["freq"] is None - - # this is ok - _maybe_remove(store, "df2") - df2 = DataFrame( - { - "A": Series( - range(3), - 
index=[ - Timestamp("20010101"), - Timestamp("20010102"), - Timestamp("20020101"), - ], - ) - } - ) - store.append("df2", df2) - df3 = DataFrame( - {"A": Series(range(3), index=date_range("2002-1-1", periods=3, freq="D"))} - ) - store.append("df2", df3) - - -def test_retain_index_attributes2(tmp_path, setup_path): - path = tmp_path / setup_path - - with catch_warnings(record=True): - - df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} - ) - df.to_hdf(path, "data", mode="w", append=True) - df2 = DataFrame( - {"A": Series(range(3), index=date_range("2002-1-1", periods=3, freq="D"))} - ) - - df2.to_hdf(path, "data", append=True) - - idx = date_range("2000-1-1", periods=3, freq="H") - idx.name = "foo" - df = DataFrame({"A": Series(range(3), index=idx)}) - df.to_hdf(path, "data", mode="w", append=True) - - assert read_hdf(path, "data").index.name == "foo" - - with catch_warnings(record=True): - - idx2 = date_range("2001-1-1", periods=3, freq="H") - idx2.name = "bar" - df2 = DataFrame({"A": Series(range(3), index=idx2)}) - df2.to_hdf(path, "data", append=True) - - assert read_hdf(path, "data").index.name is None diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py deleted file mode 100644 index 5c7c4f9ce0b75..0000000000000 --- a/pandas/tests/io/pytables/test_round_trip.py +++ /dev/null @@ -1,557 +0,0 @@ -import datetime -import re -from warnings import ( - catch_warnings, - simplefilter, -) - -import numpy as np -import pytest - -from pandas._libs.tslibs import Timestamp -from pandas.compat import is_platform_windows - -import pandas as pd -from pandas import ( - DataFrame, - Index, - Series, - _testing as tm, - bdate_range, - read_hdf, -) -from pandas.tests.io.pytables.common import ( - _maybe_remove, - ensure_clean_store, -) -from pandas.util import _test_decorators as td - -_default_compressor = "blosc" - - -pytestmark = pytest.mark.single_cpu - - -def test_conv_read_write(): - with tm.ensure_clean() as path: - - def roundtrip(key, obj, **kwargs): - obj.to_hdf(path, key, **kwargs) - return read_hdf(path, key) - - o = tm.makeTimeSeries() - tm.assert_series_equal(o, roundtrip("series", o)) - - o = tm.makeStringSeries() - tm.assert_series_equal(o, roundtrip("string_series", o)) - - o = tm.makeDataFrame() - tm.assert_frame_equal(o, roundtrip("frame", o)) - - # table - df = DataFrame({"A": range(5), "B": range(5)}) - df.to_hdf(path, "table", append=True) - result = read_hdf(path, "table", where=["index>2"]) - tm.assert_frame_equal(df[df.index > 2], result) - - -def test_long_strings(setup_path): - - # GH6166 - df = DataFrame( - {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) - ) - - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=["a"]) - - result = store.select("df") - tm.assert_frame_equal(df, result) - - -def test_api(tmp_path, setup_path): - - # GH4584 - # API issue when to_hdf doesn't accept append AND format args - path = tmp_path / setup_path - - df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, "df", append=True, format="table") - df.iloc[10:].to_hdf(path, "df", append=True, format="table") - tm.assert_frame_equal(read_hdf(path, "df"), df) - - # append to False - df.iloc[:10].to_hdf(path, "df", append=False, format="table") - df.iloc[10:].to_hdf(path, "df", append=True, format="table") - tm.assert_frame_equal(read_hdf(path, "df"), df) - - -def test_api_append(tmp_path, setup_path): - path = tmp_path / setup_path - - df = 
tm.makeDataFrame() - df.iloc[:10].to_hdf(path, "df", append=True) - df.iloc[10:].to_hdf(path, "df", append=True, format="table") - tm.assert_frame_equal(read_hdf(path, "df"), df) - - # append to False - df.iloc[:10].to_hdf(path, "df", append=False, format="table") - df.iloc[10:].to_hdf(path, "df", append=True) - tm.assert_frame_equal(read_hdf(path, "df"), df) - - -def test_api_2(tmp_path, setup_path): - path = tmp_path / setup_path - - df = tm.makeDataFrame() - df.to_hdf(path, "df", append=False, format="fixed") - tm.assert_frame_equal(read_hdf(path, "df"), df) - - df.to_hdf(path, "df", append=False, format="f") - tm.assert_frame_equal(read_hdf(path, "df"), df) - - df.to_hdf(path, "df", append=False) - tm.assert_frame_equal(read_hdf(path, "df"), df) - - df.to_hdf(path, "df") - tm.assert_frame_equal(read_hdf(path, "df"), df) - - with ensure_clean_store(setup_path) as store: - - df = tm.makeDataFrame() - - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=True, format="table") - store.append("df", df.iloc[10:], append=True, format="table") - tm.assert_frame_equal(store.select("df"), df) - - # append to False - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=False, format="table") - store.append("df", df.iloc[10:], append=True, format="table") - tm.assert_frame_equal(store.select("df"), df) - - # formats - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=False, format="table") - store.append("df", df.iloc[10:], append=True, format="table") - tm.assert_frame_equal(store.select("df"), df) - - _maybe_remove(store, "df") - store.append("df", df.iloc[:10], append=False, format="table") - store.append("df", df.iloc[10:], append=True, format=None) - tm.assert_frame_equal(store.select("df"), df) - - -def test_api_invalid(tmp_path, setup_path): - path = tmp_path / setup_path - # Invalid. 
- df = tm.makeDataFrame() - - msg = "Can only append to Tables" - - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", append=True, format="f") - - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", append=True, format="fixed") - - msg = r"invalid HDFStore format specified \[foo\]" - - with pytest.raises(TypeError, match=msg): - df.to_hdf(path, "df", append=True, format="foo") - - with pytest.raises(TypeError, match=msg): - df.to_hdf(path, "df", append=False, format="foo") - - # File path doesn't exist - path = "" - msg = f"File {path} does not exist" - - with pytest.raises(FileNotFoundError, match=msg): - read_hdf(path, "df") - - -def test_get(setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - left = store.get("a") - right = store["a"] - tm.assert_series_equal(left, right) - - left = store.get("/a") - right = store["/a"] - tm.assert_series_equal(left, right) - - with pytest.raises(KeyError, match="'No object named b in the file'"): - store.get("b") - - -def test_put_integer(setup_path): - # non-date, non-string index - df = DataFrame(np.random.randn(50, 100)) - _check_roundtrip(df, tm.assert_frame_equal, setup_path) - - -def test_table_values_dtypes_roundtrip(setup_path): - - with ensure_clean_store(setup_path) as store: - df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") - store.append("df_f8", df1) - tm.assert_series_equal(df1.dtypes, store["df_f8"].dtypes) - - df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8") - store.append("df_i8", df2) - tm.assert_series_equal(df2.dtypes, store["df_i8"].dtypes) - - # incompatible dtype - msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_0,cname->values_block_0," - "dtype->float64,kind->float,shape->(1, 3)] vs " - "current table [name->values_block_0," - "cname->values_block_0,dtype->int64,kind->integer," - "shape->None]" - ) - with pytest.raises(ValueError, match=msg): - store.append("df_i8", df1) - - # check creation/storage/retrieval of float32 (a bit hacky to - # actually create them thought) - df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"]) - store.append("df_f4", df1) - tm.assert_series_equal(df1.dtypes, store["df_f4"].dtypes) - assert df1.dtypes[0] == "float32" - - # check with mixed dtypes - df1 = DataFrame( - { - c: Series(np.random.randint(5), dtype=c) - for c in ["float32", "float64", "int32", "int64", "int16", "int8"] - } - ) - df1["string"] = "foo" - df1["float322"] = 1.0 - df1["float322"] = df1["float322"].astype("float32") - df1["bool"] = df1["float32"] > 0 - df1["time1"] = Timestamp("20130101") - df1["time2"] = Timestamp("20130102") - - store.append("df_mixed_dtypes1", df1) - result = store.select("df_mixed_dtypes1").dtypes.value_counts() - result.index = [str(i) for i in result.index] - expected = Series( - { - "float32": 2, - "float64": 1, - "int32": 1, - "bool": 1, - "int16": 1, - "int8": 1, - "int64": 1, - "object": 1, - "datetime64[ns]": 2, - } - ) - result = result.sort_index() - expected = expected.sort_index() - tm.assert_series_equal(result, expected) - - -@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") -def test_series(setup_path): - - s = tm.makeStringSeries() - _check_roundtrip(s, tm.assert_series_equal, path=setup_path) - - ts = tm.makeTimeSeries() - _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) - - ts2 = Series(ts.index, Index(ts.index, dtype=object)) - _check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) - - ts3 = 
Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) - _check_roundtrip( - ts3, tm.assert_series_equal, path=setup_path, check_index_type=False - ) - - -def test_float_index(setup_path): - - # GH #454 - index = np.random.randn(10) - s = Series(np.random.randn(10), index=index) - _check_roundtrip(s, tm.assert_series_equal, path=setup_path) - - -def test_tuple_index(setup_path): - - # GH #492 - col = np.arange(10) - idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)] - data = np.random.randn(30).reshape((3, 10)) - DF = DataFrame(data, index=idx, columns=col) - - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - _check_roundtrip(DF, tm.assert_frame_equal, path=setup_path) - - -@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") -def test_index_types(setup_path): - with catch_warnings(record=True): - values = np.random.randn(2) - - func = lambda lhs, rhs: tm.assert_series_equal(lhs, rhs, check_index_type=True) - - with catch_warnings(record=True): - ser = Series(values, [0, "y"]) - _check_roundtrip(ser, func, path=setup_path) - - with catch_warnings(record=True): - ser = Series(values, [datetime.datetime.today(), 0]) - _check_roundtrip(ser, func, path=setup_path) - - with catch_warnings(record=True): - ser = Series(values, ["y", 0]) - _check_roundtrip(ser, func, path=setup_path) - - with catch_warnings(record=True): - ser = Series(values, [datetime.date.today(), "a"]) - _check_roundtrip(ser, func, path=setup_path) - - with catch_warnings(record=True): - ser = Series(values, [0, "y"]) - _check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, [datetime.datetime.today(), 0]) - _check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, ["y", 0]) - _check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, [datetime.date.today(), "a"]) - _check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, [1.23, "b"]) - _check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, [1, 1.53]) - _check_roundtrip(ser, func, path=setup_path) - - ser = Series(values, [1, 5]) - _check_roundtrip(ser, func, path=setup_path) - - ser = Series( - values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)] - ) - _check_roundtrip(ser, func, path=setup_path) - - -def test_timeseries_preepoch(setup_path, request): - - dr = bdate_range("1/1/1940", "1/1/1960") - ts = Series(np.random.randn(len(dr)), index=dr) - try: - _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) - except OverflowError: - if is_platform_windows(): - request.node.add_marker( - pytest.mark.xfail("known failure on some windows platforms") - ) - raise - - -@pytest.mark.parametrize( - "compression", [False, pytest.param(True, marks=td.skip_if_windows)] -) -def test_frame(compression, setup_path): - - df = tm.makeDataFrame() - - # put in some random NAs - df.values[0, 0] = np.nan - df.values[5, 3] = np.nan - - _check_roundtrip_table( - df, tm.assert_frame_equal, path=setup_path, compression=compression - ) - _check_roundtrip( - df, tm.assert_frame_equal, path=setup_path, compression=compression - ) - - tdf = tm.makeTimeDataFrame() - _check_roundtrip( - tdf, tm.assert_frame_equal, path=setup_path, compression=compression - ) - - with ensure_clean_store(setup_path) as store: - # not consolidated - df["foo"] = np.random.randn(len(df)) - store["df"] = df - recons = store["df"] - assert recons._mgr.is_consolidated() - - # empty - _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) - - -def 
test_empty_series_frame(setup_path): - s0 = Series(dtype=object) - s1 = Series(name="myseries", dtype=object) - df0 = DataFrame() - df1 = DataFrame(index=["a", "b", "c"]) - df2 = DataFrame(columns=["d", "e", "f"]) - - _check_roundtrip(s0, tm.assert_series_equal, path=setup_path) - _check_roundtrip(s1, tm.assert_series_equal, path=setup_path) - _check_roundtrip(df0, tm.assert_frame_equal, path=setup_path) - _check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) - _check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) - - -@pytest.mark.parametrize("dtype", [np.int64, np.float64, object, "m8[ns]", "M8[ns]"]) -def test_empty_series(dtype, setup_path): - s = Series(dtype=dtype) - _check_roundtrip(s, tm.assert_series_equal, path=setup_path) - - -def test_can_serialize_dates(setup_path): - - rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")] - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - - _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) - - -def test_store_hierarchical(setup_path, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - - _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) - _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) - _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) - - # check that the names are stored - with ensure_clean_store(setup_path) as store: - store["frame"] = frame - recons = store["frame"] - tm.assert_frame_equal(recons, frame) - - -@pytest.mark.parametrize( - "compression", [False, pytest.param(True, marks=td.skip_if_windows)] -) -def test_store_mixed(compression, setup_path): - def _make_one(): - df = tm.makeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["bool1"] = df["A"] > 0 - df["bool2"] = df["B"] > 0 - df["int1"] = 1 - df["int2"] = 2 - return df._consolidate() - - df1 = _make_one() - df2 = _make_one() - - _check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) - _check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) - - with ensure_clean_store(setup_path) as store: - store["obj"] = df1 - tm.assert_frame_equal(store["obj"], df1) - store["obj"] = df2 - tm.assert_frame_equal(store["obj"], df2) - - # check that can store Series of all of these types - _check_roundtrip( - df1["obj1"], - tm.assert_series_equal, - path=setup_path, - compression=compression, - ) - _check_roundtrip( - df1["bool1"], - tm.assert_series_equal, - path=setup_path, - compression=compression, - ) - _check_roundtrip( - df1["int1"], - tm.assert_series_equal, - path=setup_path, - compression=compression, - ) - - -def _check_roundtrip(obj, comparator, path, compression=False, **kwargs): - - options = {} - if compression: - options["complib"] = _default_compressor - - with ensure_clean_store(path, "w", **options) as store: - store["obj"] = obj - retrieved = store["obj"] - comparator(retrieved, obj, **kwargs) - - -def _check_roundtrip_table(obj, comparator, path, compression=False): - options = {} - if compression: - options["complib"] = _default_compressor - - with ensure_clean_store(path, "w", **options) as store: - store.put("obj", obj, format="table") - retrieved = store["obj"] - - comparator(retrieved, obj) - - -def test_unicode_index(setup_path): - - unicode_values = ["\u03c3", "\u03c3\u03c3"] - - # PerformanceWarning - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - s = Series(np.random.randn(len(unicode_values)), unicode_values) - _check_roundtrip(s, tm.assert_series_equal, path=setup_path) 
- - -def test_unicode_longer_encoded(setup_path): - # GH 11234 - char = "\u0394" - df = DataFrame({"A": [char]}) - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table", encoding="utf-8") - result = store.get("df") - tm.assert_frame_equal(result, df) - - df = DataFrame({"A": ["a", char], "B": ["b", "b"]}) - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table", encoding="utf-8") - result = store.get("df") - tm.assert_frame_equal(result, df) - - -def test_store_datetime_mixed(setup_path): - - df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) - ts = tm.makeTimeSeries() - df["d"] = ts.index[:3] - _check_roundtrip(df, tm.assert_frame_equal, path=setup_path) - - -def test_round_trip_equals(tmp_path, setup_path): - # GH 9330 - df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) - - path = tmp_path / setup_path - df.to_hdf(path, "df", format="table") - other = read_hdf(path, "df") - tm.assert_frame_equal(df, other) - assert df.equals(other) - assert other.equals(df) diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py deleted file mode 100644 index b0c9b85e7ad05..0000000000000 --- a/pandas/tests/io/pytables/test_select.py +++ /dev/null @@ -1,973 +0,0 @@ -from warnings import catch_warnings - -import numpy as np -import pytest - -from pandas._libs.tslibs import Timestamp - -import pandas as pd -from pandas import ( - DataFrame, - HDFStore, - Index, - MultiIndex, - Series, - _testing as tm, - bdate_range, - concat, - date_range, - isna, - read_hdf, -) -from pandas.tests.io.pytables.common import ( - _maybe_remove, - ensure_clean_store, -) - -from pandas.io.pytables import Term - -pytestmark = pytest.mark.single_cpu - - -def test_select_columns_in_where(setup_path): - - # GH 6169 - # recreate multi-indexes when columns is passed - # in the `where` argument - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["foo_name", "bar_name"], - ) - - # With a DataFrame - df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) - - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table") - expected = df[["A"]] - - tm.assert_frame_equal(store.select("df", columns=["A"]), expected) - - tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected) - - # With a Series - s = Series(np.random.randn(10), index=index, name="A") - with ensure_clean_store(setup_path) as store: - store.put("s", s, format="table") - tm.assert_series_equal(store.select("s", where="columns=['A']"), s) - - -def test_select_with_dups(setup_path): - - # single dtypes - df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) - df.index = date_range("20130101 9:30", periods=10, freq="T") - - with ensure_clean_store(setup_path) as store: - store.append("df", df) - - result = store.select("df") - expected = df - tm.assert_frame_equal(result, expected, by_blocks=True) - - result = store.select("df", columns=df.columns) - expected = df - tm.assert_frame_equal(result, expected, by_blocks=True) - - result = store.select("df", columns=["A"]) - expected = df.loc[:, ["A"]] - tm.assert_frame_equal(result, expected) - - # dups across dtypes - df = concat( - [ - DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), - DataFrame( - np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] - ), - ], - axis=1, - ) - df.index = 
date_range("20130101 9:30", periods=10, freq="T") - - with ensure_clean_store(setup_path) as store: - store.append("df", df) - - result = store.select("df") - expected = df - tm.assert_frame_equal(result, expected, by_blocks=True) - - result = store.select("df", columns=df.columns) - expected = df - tm.assert_frame_equal(result, expected, by_blocks=True) - - expected = df.loc[:, ["A"]] - result = store.select("df", columns=["A"]) - tm.assert_frame_equal(result, expected, by_blocks=True) - - expected = df.loc[:, ["B", "A"]] - result = store.select("df", columns=["B", "A"]) - tm.assert_frame_equal(result, expected, by_blocks=True) - - # duplicates on both index and columns - with ensure_clean_store(setup_path) as store: - store.append("df", df) - store.append("df", df) - - expected = df.loc[:, ["B", "A"]] - expected = concat([expected, expected]) - result = store.select("df", columns=["B", "A"]) - tm.assert_frame_equal(result, expected, by_blocks=True) - - -def test_select(setup_path): - - with ensure_clean_store(setup_path) as store: - - with catch_warnings(record=True): - - # select with columns= - df = tm.makeTimeDataFrame() - _maybe_remove(store, "df") - store.append("df", df) - result = store.select("df", columns=["A", "B"]) - expected = df.reindex(columns=["A", "B"]) - tm.assert_frame_equal(expected, result) - - # equivalently - result = store.select("df", [("columns=['A', 'B']")]) - expected = df.reindex(columns=["A", "B"]) - tm.assert_frame_equal(expected, result) - - # with a data column - _maybe_remove(store, "df") - store.append("df", df, data_columns=["A"]) - result = store.select("df", ["A > 0"], columns=["A", "B"]) - expected = df[df.A > 0].reindex(columns=["A", "B"]) - tm.assert_frame_equal(expected, result) - - # all a data columns - _maybe_remove(store, "df") - store.append("df", df, data_columns=True) - result = store.select("df", ["A > 0"], columns=["A", "B"]) - expected = df[df.A > 0].reindex(columns=["A", "B"]) - tm.assert_frame_equal(expected, result) - - # with a data column, but different columns - _maybe_remove(store, "df") - store.append("df", df, data_columns=["A"]) - result = store.select("df", ["A > 0"], columns=["C", "D"]) - expected = df[df.A > 0].reindex(columns=["C", "D"]) - tm.assert_frame_equal(expected, result) - - -def test_select_dtypes(setup_path): - - with ensure_clean_store(setup_path) as store: - # with a Timestamp data column (GH #2637) - df = DataFrame( - { - "ts": bdate_range("2012-01-01", periods=300), - "A": np.random.randn(300), - } - ) - _maybe_remove(store, "df") - store.append("df", df, data_columns=["ts", "A"]) - - result = store.select("df", "ts>=Timestamp('2012-02-01')") - expected = df[df.ts >= Timestamp("2012-02-01")] - tm.assert_frame_equal(expected, result) - - # bool columns (GH #2849) - df = DataFrame(np.random.randn(5, 2), columns=["A", "B"]) - df["object"] = "foo" - df.loc[4:5, "object"] = "bar" - df["boolv"] = df["A"] > 0 - _maybe_remove(store, "df") - store.append("df", df, data_columns=True) - - expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa:E712 - for v in [True, "true", 1]: - result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) - tm.assert_frame_equal(expected, result) - - expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa:E712 - for v in [False, "false", 0]: - result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) - tm.assert_frame_equal(expected, result) - - # integer index - df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) - 
_maybe_remove(store, "df_int") - store.append("df_int", df) - result = store.select("df_int", "index<10 and columns=['A']") - expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) - tm.assert_frame_equal(expected, result) - - # float index - df = DataFrame( - { - "A": np.random.rand(20), - "B": np.random.rand(20), - "index": np.arange(20, dtype="f8"), - } - ) - _maybe_remove(store, "df_float") - store.append("df_float", df) - result = store.select("df_float", "index<10.0 and columns=['A']") - expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) - tm.assert_frame_equal(expected, result) - - with ensure_clean_store(setup_path) as store: - - # floats w/o NaN - df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") - df["cols"] = (df["cols"] + 10).apply(str) - - store.append("df1", df, data_columns=True) - result = store.select("df1", where="values>2.0") - expected = df[df["values"] > 2.0] - tm.assert_frame_equal(expected, result) - - # floats with NaN - df.iloc[0] = np.nan - expected = df[df["values"] > 2.0] - - store.append("df2", df, data_columns=True, index=False) - result = store.select("df2", where="values>2.0") - tm.assert_frame_equal(expected, result) - - # https://github.com/PyTables/PyTables/issues/282 - # bug in selection when 0th row has a np.nan and an index - # store.append('df3',df,data_columns=True) - # result = store.select( - # 'df3', where='values>2.0') - # tm.assert_frame_equal(expected, result) - - # not in first position float with NaN ok too - df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") - df["cols"] = (df["cols"] + 10).apply(str) - - df.iloc[1] = np.nan - expected = df[df["values"] > 2.0] - - store.append("df4", df, data_columns=True) - result = store.select("df4", where="values>2.0") - tm.assert_frame_equal(expected, result) - - # test selection with comparison against numpy scalar - # GH 11283 - with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() - - expected = df[df["A"] > 0] - - store.append("df", df, data_columns=True) - np_zero = np.float64(0) # noqa:F841 - result = store.select("df", where=["A>np_zero"]) - tm.assert_frame_equal(expected, result) - - -def test_select_with_many_inputs(setup_path): - - with ensure_clean_store(setup_path) as store: - - df = DataFrame( - { - "ts": bdate_range("2012-01-01", periods=300), - "A": np.random.randn(300), - "B": range(300), - "users": ["a"] * 50 - + ["b"] * 50 - + ["c"] * 100 - + [f"a{i:03d}" for i in range(100)], - } - ) - _maybe_remove(store, "df") - store.append("df", df, data_columns=["ts", "A", "B", "users"]) - - # regular select - result = store.select("df", "ts>=Timestamp('2012-02-01')") - expected = df[df.ts >= Timestamp("2012-02-01")] - tm.assert_frame_equal(expected, result) - - # small selector - result = store.select("df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']") - expected = df[ - (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"]) - ] - tm.assert_frame_equal(expected, result) - - # big selector along the columns - selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)] - result = store.select("df", "ts>=Timestamp('2012-02-01') and users=selector") - expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] - tm.assert_frame_equal(expected, result) - - selector = range(100, 200) - result = store.select("df", "B=selector") - expected = df[df.B.isin(selector)] - tm.assert_frame_equal(expected, result) - assert len(result) == 100 - - # big selector along the index 
- selector = Index(df.ts[0:100].values) - result = store.select("df", "ts=selector") - expected = df[df.ts.isin(selector.values)] - tm.assert_frame_equal(expected, result) - assert len(result) == 100 - - -def test_select_iterator(tmp_path, setup_path): - - # single table - with ensure_clean_store(setup_path) as store: - - df = tm.makeTimeDataFrame(500) - _maybe_remove(store, "df") - store.append("df", df) - - expected = store.select("df") - - results = list(store.select("df", iterator=True)) - result = concat(results) - tm.assert_frame_equal(expected, result) - - results = list(store.select("df", chunksize=100)) - assert len(results) == 5 - result = concat(results) - tm.assert_frame_equal(expected, result) - - results = list(store.select("df", chunksize=150)) - result = concat(results) - tm.assert_frame_equal(result, expected) - - path = tmp_path / setup_path - - df = tm.makeTimeDataFrame(500) - df.to_hdf(path, "df_non_table") - - msg = "can only use an iterator or chunksize on a table" - with pytest.raises(TypeError, match=msg): - read_hdf(path, "df_non_table", chunksize=100) - - with pytest.raises(TypeError, match=msg): - read_hdf(path, "df_non_table", iterator=True) - - path = tmp_path / setup_path - - df = tm.makeTimeDataFrame(500) - df.to_hdf(path, "df", format="table") - - results = list(read_hdf(path, "df", chunksize=100)) - result = concat(results) - - assert len(results) == 5 - tm.assert_frame_equal(result, df) - tm.assert_frame_equal(result, read_hdf(path, "df")) - - # multiple - - with ensure_clean_store(setup_path) as store: - - df1 = tm.makeTimeDataFrame(500) - store.append("df1", df1, data_columns=True) - df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format) - df2["foo"] = "bar" - store.append("df2", df2) - - df = concat([df1, df2], axis=1) - - # full selection - expected = store.select_as_multiple(["df1", "df2"], selector="df1") - results = list( - store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150) - ) - result = concat(results) - tm.assert_frame_equal(expected, result) - - -def test_select_iterator_complete_8014(setup_path): - - # GH 8014 - # using iterator and where clause - chunksize = 1e4 - - # no iterator - with ensure_clean_store(setup_path) as store: - - expected = tm.makeTimeDataFrame(100064, "S") - _maybe_remove(store, "df") - store.append("df", expected) - - beg_dt = expected.index[0] - end_dt = expected.index[-1] - - # select w/o iteration and no where clause works - result = store.select("df") - tm.assert_frame_equal(expected, result) - - # select w/o iterator and where clause, single term, begin - # of range, works - where = f"index >= '{beg_dt}'" - result = store.select("df", where=where) - tm.assert_frame_equal(expected, result) - - # select w/o iterator and where clause, single term, end - # of range, works - where = f"index <= '{end_dt}'" - result = store.select("df", where=where) - tm.assert_frame_equal(expected, result) - - # select w/o iterator and where clause, inclusive range, - # works - where = f"index >= '{beg_dt}' & index <= '{end_dt}'" - result = store.select("df", where=where) - tm.assert_frame_equal(expected, result) - - # with iterator, full range - with ensure_clean_store(setup_path) as store: - - expected = tm.makeTimeDataFrame(100064, "S") - _maybe_remove(store, "df") - store.append("df", expected) - - beg_dt = expected.index[0] - end_dt = expected.index[-1] - - # select w/iterator and no where clause works - results = list(store.select("df", chunksize=chunksize)) - result = concat(results) - 
tm.assert_frame_equal(expected, result) - - # select w/iterator and where clause, single term, begin of range - where = f"index >= '{beg_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - tm.assert_frame_equal(expected, result) - - # select w/iterator and where clause, single term, end of range - where = f"index <= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - tm.assert_frame_equal(expected, result) - - # select w/iterator and where clause, inclusive range - where = f"index >= '{beg_dt}' & index <= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - tm.assert_frame_equal(expected, result) - - -def test_select_iterator_non_complete_8014(setup_path): - - # GH 8014 - # using iterator and where clause - chunksize = 1e4 - - # with iterator, non complete range - with ensure_clean_store(setup_path) as store: - - expected = tm.makeTimeDataFrame(100064, "S") - _maybe_remove(store, "df") - store.append("df", expected) - - beg_dt = expected.index[1] - end_dt = expected.index[-2] - - # select w/iterator and where clause, single term, begin of range - where = f"index >= '{beg_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - rexpected = expected[expected.index >= beg_dt] - tm.assert_frame_equal(rexpected, result) - - # select w/iterator and where clause, single term, end of range - where = f"index <= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - rexpected = expected[expected.index <= end_dt] - tm.assert_frame_equal(rexpected, result) - - # select w/iterator and where clause, inclusive range - where = f"index >= '{beg_dt}' & index <= '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - result = concat(results) - rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] - tm.assert_frame_equal(rexpected, result) - - # with iterator, empty where - with ensure_clean_store(setup_path) as store: - - expected = tm.makeTimeDataFrame(100064, "S") - _maybe_remove(store, "df") - store.append("df", expected) - - end_dt = expected.index[-1] - - # select w/iterator and where clause, single term, begin of range - where = f"index > '{end_dt}'" - results = list(store.select("df", where=where, chunksize=chunksize)) - assert 0 == len(results) - - -def test_select_iterator_many_empty_frames(setup_path): - - # GH 8014 - # using iterator and where clause can return many empty - # frames. 
-    chunksize = 10_000
-
-    # with iterator, range limited to the first chunk
-    with ensure_clean_store(setup_path) as store:
-
-        expected = tm.makeTimeDataFrame(100000, "S")
-        _maybe_remove(store, "df")
-        store.append("df", expected)
-
-        beg_dt = expected.index[0]
-        end_dt = expected.index[chunksize - 1]
-
-        # select w/iterator and where clause, single term, begin of range
-        where = f"index >= '{beg_dt}'"
-        results = list(store.select("df", where=where, chunksize=chunksize))
-        result = concat(results)
-        rexpected = expected[expected.index >= beg_dt]
-        tm.assert_frame_equal(rexpected, result)
-
-        # select w/iterator and where clause, single term, end of range
-        where = f"index <= '{end_dt}'"
-        results = list(store.select("df", where=where, chunksize=chunksize))
-
-        assert len(results) == 1
-        result = concat(results)
-        rexpected = expected[expected.index <= end_dt]
-        tm.assert_frame_equal(rexpected, result)
-
-        # select w/iterator and where clause, inclusive range
-        where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
-        results = list(store.select("df", where=where, chunksize=chunksize))
-
-        # should be exactly one chunk (the GH 8014 bug returned 10)
-        assert len(results) == 1
-        result = concat(results)
-        rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
-        tm.assert_frame_equal(rexpected, result)
-
-        # select w/iterator and where clause which selects
-        # *nothing*.
-        #
-        # To be consistent with Python idiom I suggest this should
-        # return [] e.g. `for e in []: print True` never prints
-        # True.
-
-        where = f"index <= '{beg_dt}' & index >= '{end_dt}'"
-        results = list(store.select("df", where=where, chunksize=chunksize))
-
-        # should be []
-        assert len(results) == 0
-
-
-def test_frame_select(setup_path):
-
-    df = tm.makeTimeDataFrame()
-
-    with ensure_clean_store(setup_path) as store:
-        store.put("frame", df, format="table")
-        date = df.index[len(df) // 2]
-
-        crit1 = Term("index>=date")
-        assert crit1.env.scope["date"] == date
-
-        crit2 = "columns=['A', 'D']"
-        crit3 = "columns=A"
-
-        result = store.select("frame", [crit1, crit2])
-        expected = df.loc[date:, ["A", "D"]]
-        tm.assert_frame_equal(result, expected)
-
-        result = store.select("frame", [crit3])
-        expected = df.loc[:, ["A"]]
-        tm.assert_frame_equal(result, expected)
-
-        # invalid terms
-        df = tm.makeTimeDataFrame()
-        store.append("df_time", df)
-        msg = "could not convert string to Timestamp"
-        with pytest.raises(ValueError, match=msg):
-            store.select("df_time", "index>0")
-
-        # can't select if not written as table
-        # store['frame'] = df
-        # with pytest.raises(ValueError):
-        #     store.select('frame', [crit1, crit2])
-
-
-def test_frame_select_complex(setup_path):
-    # select via complex criteria
-
-    df = tm.makeTimeDataFrame()
-    df["string"] = "foo"
-    df.loc[df.index[0:4], "string"] = "bar"
-
-    with ensure_clean_store(setup_path) as store:
-        store.put("df", df, format="table", data_columns=["string"])
-
-        # empty
-        result = store.select("df", 'index>df.index[3] & string="bar"')
-        expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")]
-        tm.assert_frame_equal(result, expected)
-
-        result = store.select("df", 'index>df.index[3] & string="foo"')
-        expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")]
-        tm.assert_frame_equal(result, expected)
-
-        # or
-        result = store.select("df", 'index>df.index[3] | string="bar"')
-        expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")]
-        tm.assert_frame_equal(result, expected)
-
-        result = store.select(
-            "df", '(index>df.index[3] & index<=df.index[6]) | string="bar"'
-        )
expected = df.loc[ - ((df.index > df.index[3]) & (df.index <= df.index[6])) - | (df.string == "bar") - ] - tm.assert_frame_equal(result, expected) - - # invert - result = store.select("df", 'string!="bar"') - expected = df.loc[df.string != "bar"] - tm.assert_frame_equal(result, expected) - - # invert not implemented in numexpr :( - msg = "cannot use an invert condition when passing to numexpr" - with pytest.raises(NotImplementedError, match=msg): - store.select("df", '~(string="bar")') - - # invert ok for filters - result = store.select("df", "~(columns=['A','B'])") - expected = df.loc[:, df.columns.difference(["A", "B"])] - tm.assert_frame_equal(result, expected) - - # in - result = store.select("df", "index>df.index[3] & columns in ['A','B']") - expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - -def test_frame_select_complex2(tmp_path): - - pp = tmp_path / "params.hdf" - hh = tmp_path / "hist.hdf" - - # use non-trivial selection criteria - params = DataFrame({"A": [1, 1, 2, 2, 3]}) - params.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"]) - - selection = read_hdf(pp, "df", where="A=[2,3]") - hist = DataFrame( - np.random.randn(25, 1), - columns=["data"], - index=MultiIndex.from_tuples( - [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"] - ), - ) - - hist.to_hdf(hh, "df", mode="w", format="table") - - expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") - - # scope with list like - l0 = selection.index.tolist() # noqa:F841 - with HDFStore(hh) as store: - result = store.select("df", where="l1=l0") - tm.assert_frame_equal(result, expected) - - result = read_hdf(hh, "df", where="l1=l0") - tm.assert_frame_equal(result, expected) - - # index - index = selection.index # noqa:F841 - result = read_hdf(hh, "df", where="l1=index") - tm.assert_frame_equal(result, expected) - - result = read_hdf(hh, "df", where="l1=selection.index") - tm.assert_frame_equal(result, expected) - - result = read_hdf(hh, "df", where="l1=selection.index.tolist()") - tm.assert_frame_equal(result, expected) - - result = read_hdf(hh, "df", where="l1=list(selection.index)") - tm.assert_frame_equal(result, expected) - - # scope with index - with HDFStore(hh) as store: - result = store.select("df", where="l1=index") - tm.assert_frame_equal(result, expected) - - result = store.select("df", where="l1=selection.index") - tm.assert_frame_equal(result, expected) - - result = store.select("df", where="l1=selection.index.tolist()") - tm.assert_frame_equal(result, expected) - - result = store.select("df", where="l1=list(selection.index)") - tm.assert_frame_equal(result, expected) - - -def test_invalid_filtering(setup_path): - - # can't use more than one filter (atm) - - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - store.put("df", df, format="table") - - msg = "unable to collapse Joint Filters" - # not implemented - with pytest.raises(NotImplementedError, match=msg): - store.select("df", "columns=['A'] | columns=['B']") - - # in theory we could deal with this - with pytest.raises(NotImplementedError, match=msg): - store.select("df", "columns=['A','B'] & columns=['C']") - - -def test_string_select(setup_path): - # GH 2973 - with ensure_clean_store(setup_path) as store: - - df = tm.makeTimeDataFrame() - - # test string ==/!= - df["x"] = "none" - df.loc[df.index[2:7], "x"] = "" - - store.append("df", df, data_columns=["x"]) - - result = store.select("df", "x=none") - expected = df[df.x == "none"] - 
tm.assert_frame_equal(result, expected) - - result = store.select("df", "x!=none") - expected = df[df.x != "none"] - tm.assert_frame_equal(result, expected) - - df2 = df.copy() - df2.loc[df2.x == "", "x"] = np.nan - - store.append("df2", df2, data_columns=["x"]) - result = store.select("df2", "x!=none") - expected = df2[isna(df2.x)] - tm.assert_frame_equal(result, expected) - - # int ==/!= - df["int"] = 1 - df.loc[df.index[2:7], "int"] = 2 - - store.append("df3", df, data_columns=["int"]) - - result = store.select("df3", "int=2") - expected = df[df.int == 2] - tm.assert_frame_equal(result, expected) - - result = store.select("df3", "int!=2") - expected = df[df.int != 2] - tm.assert_frame_equal(result, expected) - - -def test_select_as_multiple(setup_path): - - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - df2["foo"] = "bar" - - with ensure_clean_store(setup_path) as store: - - msg = "keys must be a list/tuple" - # no tables stored - with pytest.raises(TypeError, match=msg): - store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") - - store.append("df1", df1, data_columns=["A", "B"]) - store.append("df2", df2) - - # exceptions - with pytest.raises(TypeError, match=msg): - store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") - - with pytest.raises(TypeError, match=msg): - store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") - - msg = "'No object named df3 in the file'" - with pytest.raises(KeyError, match=msg): - store.select_as_multiple( - ["df1", "df3"], where=["A>0", "B>0"], selector="df1" - ) - - with pytest.raises(KeyError, match=msg): - store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") - - with pytest.raises(KeyError, match="'No object named df4 in the file'"): - store.select_as_multiple( - ["df1", "df2"], where=["A>0", "B>0"], selector="df4" - ) - - # default select - result = store.select("df1", ["A>0", "B>0"]) - expected = store.select_as_multiple( - ["df1"], where=["A>0", "B>0"], selector="df1" - ) - tm.assert_frame_equal(result, expected) - expected = store.select_as_multiple("df1", where=["A>0", "B>0"], selector="df1") - tm.assert_frame_equal(result, expected) - - # multiple - result = store.select_as_multiple( - ["df1", "df2"], where=["A>0", "B>0"], selector="df1" - ) - expected = concat([df1, df2], axis=1) - expected = expected[(expected.A > 0) & (expected.B > 0)] - tm.assert_frame_equal(result, expected, check_freq=False) - # FIXME: 2021-01-20 this is failing with freq None vs 4B on some builds - - # multiple (diff selector) - result = store.select_as_multiple( - ["df1", "df2"], where="index>df2.index[4]", selector="df2" - ) - expected = concat([df1, df2], axis=1) - expected = expected[5:] - tm.assert_frame_equal(result, expected) - - # test exception for diff rows - store.append("df3", tm.makeTimeDataFrame(nper=50)) - msg = "all tables must have exactly the same nrows!" 
- with pytest.raises(ValueError, match=msg): - store.select_as_multiple( - ["df1", "df3"], where=["A>0", "B>0"], selector="df1" - ) - - -def test_nan_selection_bug_4858(setup_path): - - with ensure_clean_store(setup_path) as store: - - df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64") - df["cols"] = (df["cols"] + 10).apply(str) - df.iloc[0] = np.nan - - expected = DataFrame( - {"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]}, - index=[3, 4, 5], - ) - - # write w/o the index on that particular column - store.append("df", df, data_columns=True, index=["cols"]) - result = store.select("df", where="values>2.0") - tm.assert_frame_equal(result, expected) - - -def test_query_with_nested_special_character(setup_path): - df = DataFrame( - { - "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"], - "b": [1, 2, 3, 4, 5, 6, 7, 8], - } - ) - expected = df[df.a == "test & test"] - with ensure_clean_store(setup_path) as store: - store.append("test", df, format="table", data_columns=True) - result = store.select("test", 'a = "test & test"') - tm.assert_frame_equal(expected, result) - - -def test_query_long_float_literal(setup_path): - # GH 14241 - df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) - - with ensure_clean_store(setup_path) as store: - store.append("test", df, format="table", data_columns=True) - - cutoff = 1000000000.0006 - result = store.select("test", f"A < {cutoff:.4f}") - assert result.empty - - cutoff = 1000000000.0010 - result = store.select("test", f"A > {cutoff:.4f}") - expected = df.loc[[1, 2], :] - tm.assert_frame_equal(expected, result) - - exact = 1000000000.0011 - result = store.select("test", f"A == {exact:.4f}") - expected = df.loc[[1], :] - tm.assert_frame_equal(expected, result) - - -def test_query_compare_column_type(setup_path): - # GH 15492 - df = DataFrame( - { - "date": ["2014-01-01", "2014-01-02"], - "real_date": date_range("2014-01-01", periods=2), - "float": [1.1, 1.2], - "int": [1, 2], - }, - columns=["date", "real_date", "float", "int"], - ) - - with ensure_clean_store(setup_path) as store: - store.append("test", df, format="table", data_columns=True) - - ts = Timestamp("2014-01-01") # noqa:F841 - result = store.select("test", where="real_date > ts") - expected = df.loc[[1], :] - tm.assert_frame_equal(expected, result) - - for op in ["<", ">", "=="]: - # non strings to string column always fail - for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]: - query = f"date {op} v" - msg = f"Cannot compare {v} of type {type(v)} to string column" - with pytest.raises(TypeError, match=msg): - store.select("test", where=query) - - # strings to other columns must be convertible to type - v = "a" - for col in ["int", "float", "real_date"]: - query = f"{col} {op} v" - if col == "real_date": - msg = 'Given date string "a" not likely a datetime' - else: - msg = "could not convert string to " - with pytest.raises(ValueError, match=msg): - store.select("test", where=query) - - for v, col in zip( - ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"] - ): - query = f"{col} {op} v" - result = store.select("test", where=query) - - if op == "==": - expected = df.loc[[0], :] - elif op == ">": - expected = df.loc[[1], :] - else: - expected = df.loc[[], :] - tm.assert_frame_equal(expected, result) - - -@pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) -def test_select_empty_where(tmp_path, where): - # GH26610 - - df = DataFrame([1, 2, 3]) - path = tmp_path / "empty_where.h5" - with 
HDFStore(path) as store: - store.put("df", df, "t") - result = read_hdf(store, "df", where=where) - tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py deleted file mode 100644 index 06684f076aefe..0000000000000 --- a/pandas/tests/io/pytables/test_store.py +++ /dev/null @@ -1,1018 +0,0 @@ -import datetime as dt -import hashlib -import os -import tempfile -import time -from warnings import ( - catch_warnings, - simplefilter, -) - -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - DatetimeIndex, - Index, - MultiIndex, - Series, - Timestamp, - concat, - date_range, - timedelta_range, -) -import pandas._testing as tm -from pandas.tests.io.pytables.common import ( - _maybe_remove, - ensure_clean_store, - safe_close, -) - -_default_compressor = "blosc" - -from pandas.io.pytables import ( - HDFStore, - read_hdf, -) - -pytestmark = pytest.mark.single_cpu - - -def test_context(setup_path): - with tm.ensure_clean(setup_path) as path: - try: - with HDFStore(path) as tbl: - raise ValueError("blah") - except ValueError: - pass - with tm.ensure_clean(setup_path) as path: - with HDFStore(path) as tbl: - tbl["a"] = tm.makeDataFrame() - assert len(tbl) == 1 - assert type(tbl["a"]) == DataFrame - - -def test_no_track_times(tmp_path, setup_path): - - # GH 32682 - # enables to set track_times (see `pytables` `create_table` documentation) - - def checksum(filename, hash_factory=hashlib.md5, chunk_num_blocks=128): - h = hash_factory() - with open(filename, "rb") as f: - for chunk in iter(lambda: f.read(chunk_num_blocks * h.block_size), b""): - h.update(chunk) - return h.digest() - - def create_h5_and_return_checksum(tmp_path, track_times): - path = tmp_path / setup_path - df = DataFrame({"a": [1]}) - - with HDFStore(path, mode="w") as hdf: - hdf.put( - "table", - df, - format="table", - data_columns=True, - index=None, - track_times=track_times, - ) - - return checksum(path) - - checksum_0_tt_false = create_h5_and_return_checksum(tmp_path, track_times=False) - checksum_0_tt_true = create_h5_and_return_checksum(tmp_path, track_times=True) - - # sleep is necessary to create h5 with different creation time - time.sleep(1) - - checksum_1_tt_false = create_h5_and_return_checksum(tmp_path, track_times=False) - checksum_1_tt_true = create_h5_and_return_checksum(tmp_path, track_times=True) - - # checksums are the same if track_time = False - assert checksum_0_tt_false == checksum_1_tt_false - - # checksums are NOT same if track_time = True - assert checksum_0_tt_true != checksum_1_tt_true - - -def test_iter_empty(setup_path): - - with ensure_clean_store(setup_path) as store: - # GH 12221 - assert list(store) == [] - - -def test_repr(setup_path): - - with ensure_clean_store(setup_path) as store: - repr(store) - store.info() - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeStringSeries() - store["c"] = tm.makeDataFrame() - - df = tm.makeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["bool1"] = df["A"] > 0 - df["bool2"] = df["B"] > 0 - df["bool3"] = True - df["int1"] = 1 - df["int2"] = 2 - df["timestamp1"] = Timestamp("20010102") - df["timestamp2"] = Timestamp("20010103") - df["datetime1"] = dt.datetime(2001, 1, 2, 0, 0) - df["datetime2"] = dt.datetime(2001, 1, 3, 0, 0) - df.loc[df.index[3:6], ["obj1"]] = np.nan - df = df._consolidate() - - with catch_warnings(record=True): - simplefilter("ignore", pd.errors.PerformanceWarning) - store["df"] = df - - # make a random group in hdf space 
- store._handle.create_group(store._handle.root, "bah") - - assert store.filename in repr(store) - assert store.filename in str(store) - store.info() - - # storers - with ensure_clean_store(setup_path) as store: - - df = tm.makeDataFrame() - store.append("df", df) - - s = store.get_storer("df") - repr(s) - str(s) - - -def test_contains(setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeDataFrame() - store["foo/bar"] = tm.makeDataFrame() - assert "a" in store - assert "b" in store - assert "c" not in store - assert "foo/bar" in store - assert "/foo/bar" in store - assert "/foo/b" not in store - assert "bar" not in store - - # gh-2694: tables.NaturalNameWarning - with catch_warnings(record=True): - store["node())"] = tm.makeDataFrame() - assert "node())" in store - - -def test_versioning(setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeDataFrame() - df = tm.makeTimeDataFrame() - _maybe_remove(store, "df1") - store.append("df1", df[:10]) - store.append("df1", df[10:]) - assert store.root.a._v_attrs.pandas_version == "0.15.2" - assert store.root.b._v_attrs.pandas_version == "0.15.2" - assert store.root.df1._v_attrs.pandas_version == "0.15.2" - - # write a file and wipe its versioning - _maybe_remove(store, "df2") - store.append("df2", df) - - # this is an error because its table_type is appendable, but no - # version info - store.get_node("df2")._v_attrs.pandas_version = None - - msg = "'NoneType' object has no attribute 'startswith'" - - with pytest.raises(Exception, match=msg): - store.select("df2") - - -@pytest.mark.parametrize( - "where, expected", - [ - ( - "/", - { - "": ({"first_group", "second_group"}, set()), - "/first_group": (set(), {"df1", "df2"}), - "/second_group": ({"third_group"}, {"df3", "s1"}), - "/second_group/third_group": (set(), {"df4"}), - }, - ), - ( - "/second_group", - { - "/second_group": ({"third_group"}, {"df3", "s1"}), - "/second_group/third_group": (set(), {"df4"}), - }, - ), - ], -) -def test_walk(where, expected): - # GH10143 - objs = { - "df1": DataFrame([1, 2, 3]), - "df2": DataFrame([4, 5, 6]), - "df3": DataFrame([6, 7, 8]), - "df4": DataFrame([9, 10, 11]), - "s1": Series([10, 9, 8]), - # Next 3 items aren't pandas objects and should be ignored - "a1": np.array([[1, 2, 3], [4, 5, 6]]), - "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), - "tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"), - } - - with ensure_clean_store("walk_groups.hdf", mode="w") as store: - store.put("/first_group/df1", objs["df1"]) - store.put("/first_group/df2", objs["df2"]) - store.put("/second_group/df3", objs["df3"]) - store.put("/second_group/s1", objs["s1"]) - store.put("/second_group/third_group/df4", objs["df4"]) - # Create non-pandas objects - store._handle.create_array("/first_group", "a1", objs["a1"]) - store._handle.create_table("/first_group", "tb1", obj=objs["tb1"]) - store._handle.create_table("/second_group", "tb2", obj=objs["tb2"]) - - assert len(list(store.walk(where=where))) == len(expected) - for path, groups, leaves in store.walk(where=where): - assert path in expected - expected_groups, expected_frames = expected[path] - assert expected_groups == set(groups) - assert expected_frames == set(leaves) - for leaf in leaves: - frame_path = "/".join([path, leaf]) - obj = store.get(frame_path) - if "df" in leaf: - tm.assert_frame_equal(obj, objs[leaf]) - else: - tm.assert_series_equal(obj, objs[leaf]) - - -def 
test_getattr(setup_path): - - with ensure_clean_store(setup_path) as store: - - s = tm.makeTimeSeries() - store["a"] = s - - # test attribute access - result = store.a - tm.assert_series_equal(result, s) - result = getattr(store, "a") - tm.assert_series_equal(result, s) - - df = tm.makeTimeDataFrame() - store["df"] = df - result = store.df - tm.assert_frame_equal(result, df) - - # errors - for x in ["d", "mode", "path", "handle", "complib"]: - msg = f"'HDFStore' object has no attribute '{x}'" - with pytest.raises(AttributeError, match=msg): - getattr(store, x) - - # not stores - for x in ["mode", "path", "handle", "complib"]: - getattr(store, f"_{x}") - - -def test_store_dropna(tmp_path, setup_path): - df_with_missing = DataFrame( - {"col1": [0.0, np.nan, 2.0], "col2": [1.0, np.nan, np.nan]}, - index=list("abc"), - ) - df_without_missing = DataFrame( - {"col1": [0.0, 2.0], "col2": [1.0, np.nan]}, index=list("ac") - ) - - # # Test to make sure defaults are to not drop. - # # Corresponding to Issue 9382 - path = tmp_path / setup_path - df_with_missing.to_hdf(path, "df", format="table") - reloaded = read_hdf(path, "df") - tm.assert_frame_equal(df_with_missing, reloaded) - - path = tmp_path / setup_path - df_with_missing.to_hdf(path, "df", format="table", dropna=False) - reloaded = read_hdf(path, "df") - tm.assert_frame_equal(df_with_missing, reloaded) - - path = tmp_path / setup_path - df_with_missing.to_hdf(path, "df", format="table", dropna=True) - reloaded = read_hdf(path, "df") - tm.assert_frame_equal(df_without_missing, reloaded) - - -def test_to_hdf_with_min_itemsize(tmp_path, setup_path): - - path = tmp_path / setup_path - - # min_itemsize in index with to_hdf (GH 10381) - df = tm.makeMixedDataFrame().set_index("C") - df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6}) - # just make sure there is a longer string: - df2 = df.copy().reset_index().assign(C="longer").set_index("C") - df2.to_hdf(path, "ss3", append=True, format="table") - tm.assert_frame_equal(read_hdf(path, "ss3"), concat([df, df2])) - - # same as above, with a Series - df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6}) - df2["B"].to_hdf(path, "ss4", append=True, format="table") - tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) - - -@pytest.mark.parametrize("format", ["fixed", "table"]) -def test_to_hdf_errors(tmp_path, format, setup_path): - - data = ["\ud800foo"] - ser = Series(data, index=Index(data)) - path = tmp_path / setup_path - # GH 20835 - ser.to_hdf(path, "table", format=format, errors="surrogatepass") - - result = read_hdf(path, "table", errors="surrogatepass") - tm.assert_series_equal(result, ser) - - -def test_create_table_index(setup_path): - - with ensure_clean_store(setup_path) as store: - - with catch_warnings(record=True): - - def col(t, column): - return getattr(store.get_storer(t).table.cols, column) - - # data columns - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df["string2"] = "bar" - store.append("f", df, data_columns=["string", "string2"]) - assert col("f", "index").is_indexed is True - assert col("f", "string").is_indexed is True - assert col("f", "string2").is_indexed is True - - # specify index=columns - store.append("f2", df, index=["string"], data_columns=["string", "string2"]) - assert col("f2", "index").is_indexed is False - assert col("f2", "string").is_indexed is True - assert col("f2", "string2").is_indexed is False - - # try to index a non-table - _maybe_remove(store, "f2") - store.put("f2", df) - msg = "cannot create 
table index on a Fixed format store" - with pytest.raises(TypeError, match=msg): - store.create_table_index("f2") - - -def test_create_table_index_data_columns_argument(setup_path): - # GH 28156 - - with ensure_clean_store(setup_path) as store: - - with catch_warnings(record=True): - - def col(t, column): - return getattr(store.get_storer(t).table.cols, column) - - # data columns - df = tm.makeTimeDataFrame() - df["string"] = "foo" - df["string2"] = "bar" - store.append("f", df, data_columns=["string"]) - assert col("f", "index").is_indexed is True - assert col("f", "string").is_indexed is True - - msg = "'Cols' object has no attribute 'string2'" - with pytest.raises(AttributeError, match=msg): - col("f", "string2").is_indexed - - # try to index a col which isn't a data_column - msg = ( - "column string2 is not a data_column.\n" - "In order to read column string2 you must reload the dataframe \n" - "into HDFStore and include string2 with the data_columns argument." - ) - with pytest.raises(AttributeError, match=msg): - store.create_table_index("f", columns=["string2"]) - - -def test_mi_data_columns(setup_path): - # GH 14435 - idx = MultiIndex.from_arrays( - [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] - ) - df = DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) - - with ensure_clean_store(setup_path) as store: - store.append("df", df, data_columns=True) - - actual = store.select("df", where="id == 1") - expected = df.iloc[[1], :] - tm.assert_frame_equal(actual, expected) - - -def test_table_mixed_dtypes(setup_path): - - # frame - df = tm.makeDataFrame() - df["obj1"] = "foo" - df["obj2"] = "bar" - df["bool1"] = df["A"] > 0 - df["bool2"] = df["B"] > 0 - df["bool3"] = True - df["int1"] = 1 - df["int2"] = 2 - df["timestamp1"] = Timestamp("20010102") - df["timestamp2"] = Timestamp("20010103") - df["datetime1"] = dt.datetime(2001, 1, 2, 0, 0) - df["datetime2"] = dt.datetime(2001, 1, 3, 0, 0) - df.loc[df.index[3:6], ["obj1"]] = np.nan - df = df._consolidate() - - with ensure_clean_store(setup_path) as store: - store.append("df1_mixed", df) - tm.assert_frame_equal(store.select("df1_mixed"), df) - - -def test_calendar_roundtrip_issue(setup_path): - - # 8591 - # doc example from tseries holiday section - weekmask_egypt = "Sun Mon Tue Wed Thu" - holidays = [ - "2012-05-01", - dt.datetime(2013, 5, 1), - np.datetime64("2014-05-01"), - ] - bday_egypt = pd.offsets.CustomBusinessDay( - holidays=holidays, weekmask=weekmask_egypt - ) - mydt = dt.datetime(2013, 4, 30) - dts = date_range(mydt, periods=5, freq=bday_egypt) - - s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split())) - - with ensure_clean_store(setup_path) as store: - - store.put("fixed", s) - result = store.select("fixed") - tm.assert_series_equal(result, s) - - store.append("table", s) - result = store.select("table") - tm.assert_series_equal(result, s) - - -def test_remove(setup_path): - - with ensure_clean_store(setup_path) as store: - - ts = tm.makeTimeSeries() - df = tm.makeDataFrame() - store["a"] = ts - store["b"] = df - _maybe_remove(store, "a") - assert len(store) == 1 - tm.assert_frame_equal(df, store["b"]) - - _maybe_remove(store, "b") - assert len(store) == 0 - - # nonexistence - with pytest.raises( - KeyError, match="'No object named a_nonexistent_store in the file'" - ): - store.remove("a_nonexistent_store") - - # pathing - store["a"] = ts - store["b/foo"] = df - _maybe_remove(store, "foo") - _maybe_remove(store, "b/foo") - assert len(store) == 1 - - store["a"] = ts - 
store["b/foo"] = df - _maybe_remove(store, "b") - assert len(store) == 1 - - # __delitem__ - store["a"] = ts - store["b"] = df - del store["a"] - del store["b"] - assert len(store) == 0 - - -def test_same_name_scoping(setup_path): - - with ensure_clean_store(setup_path) as store: - - df = DataFrame(np.random.randn(20, 2), index=date_range("20130101", periods=20)) - store.put("df", df, format="table") - expected = df[df.index > Timestamp("20130105")] - - result = store.select("df", "index>datetime.datetime(2013,1,5)") - tm.assert_frame_equal(result, expected) - - # changes what 'datetime' points to in the namespace where - # 'select' does the lookup - - # technically an error, but allow it - result = store.select("df", "index>datetime.datetime(2013,1,5)") - tm.assert_frame_equal(result, expected) - - result = store.select("df", "index>datetime(2013,1,5)") - tm.assert_frame_equal(result, expected) - - -def test_store_index_name(setup_path): - df = tm.makeDataFrame() - df.index.name = "foo" - - with ensure_clean_store(setup_path) as store: - store["frame"] = df - recons = store["frame"] - tm.assert_frame_equal(recons, df) - - -@pytest.mark.parametrize("table_format", ["table", "fixed"]) -def test_store_index_name_numpy_str(tmp_path, table_format, setup_path): - # GH #13492 - idx = Index( - pd.to_datetime([dt.date(2000, 1, 1), dt.date(2000, 1, 2)]), - name="cols\u05d2", - ) - idx1 = Index( - pd.to_datetime([dt.date(2010, 1, 1), dt.date(2010, 1, 2)]), - name="rows\u05d0", - ) - df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) - - # This used to fail, returning numpy strings instead of python strings. - path = tmp_path / setup_path - df.to_hdf(path, "df", format=table_format) - df2 = read_hdf(path, "df") - - tm.assert_frame_equal(df, df2, check_names=True) - - assert type(df2.index.name) == str - assert type(df2.columns.name) == str - - -def test_store_series_name(setup_path): - df = tm.makeDataFrame() - series = df["A"] - - with ensure_clean_store(setup_path) as store: - store["series"] = series - recons = store["series"] - tm.assert_series_equal(recons, series) - - -def test_overwrite_node(setup_path): - - with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeDataFrame() - ts = tm.makeTimeSeries() - store["a"] = ts - - tm.assert_series_equal(store["a"], ts) - - -def test_coordinates(setup_path): - df = tm.makeTimeDataFrame() - - with ensure_clean_store(setup_path) as store: - - _maybe_remove(store, "df") - store.append("df", df) - - # all - c = store.select_as_coordinates("df") - assert (c.values == np.arange(len(df.index))).all() - - # get coordinates back & test vs frame - _maybe_remove(store, "df") - - df = DataFrame({"A": range(5), "B": range(5)}) - store.append("df", df) - c = store.select_as_coordinates("df", ["index<3"]) - assert (c.values == np.arange(3)).all() - result = store.select("df", where=c) - expected = df.loc[0:2, :] - tm.assert_frame_equal(result, expected) - - c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) - assert (c.values == np.arange(2) + 3).all() - result = store.select("df", where=c) - expected = df.loc[3:4, :] - tm.assert_frame_equal(result, expected) - assert isinstance(c, Index) - - # multiple tables - _maybe_remove(store, "df1") - _maybe_remove(store, "df2") - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) - store.append("df1", df1, data_columns=["A", "B"]) - store.append("df2", df2) - - c = store.select_as_coordinates("df1", ["A>0", "B>0"]) - df1_result = 
store.select("df1", c) - df2_result = store.select("df2", c) - result = concat([df1_result, df2_result], axis=1) - - expected = concat([df1, df2], axis=1) - expected = expected[(expected.A > 0) & (expected.B > 0)] - tm.assert_frame_equal(result, expected, check_freq=False) - # FIXME: 2021-01-18 on some (mostly windows) builds we get freq=None - # but expect freq="18B" - - # pass array/mask as the coordinates - with ensure_clean_store(setup_path) as store: - - df = DataFrame( - np.random.randn(1000, 2), index=date_range("20000101", periods=1000) - ) - store.append("df", df) - c = store.select_column("df", "index") - where = c[DatetimeIndex(c).month == 5].index - expected = df.iloc[where] - - # locations - result = store.select("df", where=where) - tm.assert_frame_equal(result, expected) - - # boolean - result = store.select("df", where=where) - tm.assert_frame_equal(result, expected) - - # invalid - msg = ( - "where must be passed as a string, PyTablesExpr, " - "or list-like of PyTablesExpr" - ) - with pytest.raises(TypeError, match=msg): - store.select("df", where=np.arange(len(df), dtype="float64")) - - with pytest.raises(TypeError, match=msg): - store.select("df", where=np.arange(len(df) + 1)) - - with pytest.raises(TypeError, match=msg): - store.select("df", where=np.arange(len(df)), start=5) - - with pytest.raises(TypeError, match=msg): - store.select("df", where=np.arange(len(df)), start=5, stop=10) - - # selection with filter - selection = date_range("20000101", periods=500) - result = store.select("df", where="index in selection") - expected = df[df.index.isin(selection)] - tm.assert_frame_equal(result, expected) - - # list - df = DataFrame(np.random.randn(10, 2)) - store.append("df2", df) - result = store.select("df2", where=[0, 3, 5]) - expected = df.iloc[[0, 3, 5]] - tm.assert_frame_equal(result, expected) - - # boolean - where = [True] * 10 - where[-2] = False - result = store.select("df2", where=where) - expected = df.loc[where] - tm.assert_frame_equal(result, expected) - - # start/stop - result = store.select("df2", start=5, stop=10) - expected = df[5:10] - tm.assert_frame_equal(result, expected) - - -def test_start_stop_table(setup_path): - - with ensure_clean_store(setup_path) as store: - - # table - df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) - store.append("df", df) - - result = store.select("df", "columns=['A']", start=0, stop=5) - expected = df.loc[0:4, ["A"]] - tm.assert_frame_equal(result, expected) - - # out of range - result = store.select("df", "columns=['A']", start=30, stop=40) - assert len(result) == 0 - expected = df.loc[30:40, ["A"]] - tm.assert_frame_equal(result, expected) - - -def test_start_stop_multiple(setup_path): - - # GH 16209 - with ensure_clean_store(setup_path) as store: - - df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) - - store.append_to_multiple( - {"selector": ["foo"], "data": None}, df, selector="selector" - ) - result = store.select_as_multiple( - ["selector", "data"], selector="selector", start=0, stop=1 - ) - expected = df.loc[[0], ["foo", "bar"]] - tm.assert_frame_equal(result, expected) - - -def test_start_stop_fixed(setup_path): - - with ensure_clean_store(setup_path) as store: - - # fixed, GH 8287 - df = DataFrame( - {"A": np.random.rand(20), "B": np.random.rand(20)}, - index=date_range("20130101", periods=20), - ) - store.put("df", df) - - result = store.select("df", start=0, stop=5) - expected = df.iloc[0:5, :] - tm.assert_frame_equal(result, expected) - - result = store.select("df", start=5, stop=10) - expected 
= df.iloc[5:10, :]
-        tm.assert_frame_equal(result, expected)
-
-        # out of range
-        result = store.select("df", start=30, stop=40)
-        expected = df.iloc[30:40, :]
-        tm.assert_frame_equal(result, expected)
-
-        # series
-        s = df.A
-        store.put("s", s)
-        result = store.select("s", start=0, stop=5)
-        expected = s.iloc[0:5]
-        tm.assert_series_equal(result, expected)
-
-        result = store.select("s", start=5, stop=10)
-        expected = s.iloc[5:10]
-        tm.assert_series_equal(result, expected)
-
-        # sparse; not implemented
-        df = tm.makeDataFrame()
-        df.iloc[3:5, 1:3] = np.nan
-        df.iloc[8:10, -2] = np.nan
-
-
-def test_select_filter_corner(setup_path):
-
-    df = DataFrame(np.random.randn(50, 100))
-    df.index = [f"{c:3d}" for c in df.index]
-    df.columns = [f"{c:3d}" for c in df.columns]
-
-    with ensure_clean_store(setup_path) as store:
-        store.put("frame", df, format="table")
-
-        crit = "columns=df.columns[:75]"
-        result = store.select("frame", [crit])
-        tm.assert_frame_equal(result, df.loc[:, df.columns[:75]])
-
-        crit = "columns=df.columns[:75:2]"
-        result = store.select("frame", [crit])
-        tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]])
-
-
-def test_path_pathlib():
-    df = tm.makeDataFrame()
-
-    result = tm.round_trip_pathlib(
-        lambda p: df.to_hdf(p, "df"), lambda p: read_hdf(p, "df")
-    )
-    tm.assert_frame_equal(df, result)
-
-
-@pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)])
-def test_contiguous_mixed_data_table(start, stop, setup_path):
-    # GH 17021
-    df = DataFrame(
-        {
-            "a": Series([20111010, 20111011, 20111012]),
-            "b": Series(["ab", "cd", "ab"]),
-        }
-    )
-
-    with ensure_clean_store(setup_path) as store:
-        store.append("test_dataset", df)
-
-        result = store.select("test_dataset", start=start, stop=stop)
-        tm.assert_frame_equal(df[start:stop], result)
-
-
-def test_path_pathlib_hdfstore():
-    df = tm.makeDataFrame()
-
-    def writer(path):
-        with HDFStore(path) as store:
-            df.to_hdf(store, "df")
-
-    def reader(path):
-        with HDFStore(path) as store:
-            return read_hdf(store, "df")
-
-    result = tm.round_trip_pathlib(writer, reader)
-    tm.assert_frame_equal(df, result)
-
-
-def test_pickle_path_localpath():
-    df = tm.makeDataFrame()
-    result = tm.round_trip_pathlib(
-        lambda p: df.to_hdf(p, "df"), lambda p: read_hdf(p, "df")
-    )
-    tm.assert_frame_equal(df, result)
-
-
-def test_path_localpath_hdfstore():
-    df = tm.makeDataFrame()
-
-    def writer(path):
-        with HDFStore(path) as store:
-            df.to_hdf(store, "df")
-
-    def reader(path):
-        with HDFStore(path) as store:
-            return read_hdf(store, "df")
-
-    result = tm.round_trip_localpath(writer, reader)
-    tm.assert_frame_equal(df, result)
-
-
-def test_copy():
-
-    with catch_warnings(record=True):
-
-        def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs):
-            if new_f is None:
-                fd, new_f = tempfile.mkstemp()
-
-            try:
-                store = HDFStore(f, "r")
-                tstore = store.copy(new_f, keys=keys, propindexes=propindexes, **kwargs)
-
-                # check keys
-                if keys is None:
-                    keys = store.keys()
-                assert set(keys) == set(tstore.keys())
-
-                # check indices & nrows
-                for k in tstore.keys():
-                    if tstore.get_storer(k).is_table:
-                        new_t = tstore.get_storer(k)
-                        orig_t = store.get_storer(k)
-
-                        assert orig_t.nrows == new_t.nrows
-
-                        # check propindexes
-                        if propindexes:
-                            for a in orig_t.axes:
-                                if a.is_indexed:
-                                    assert new_t[a.name].is_indexed
-
-            finally:
-                safe_close(store)
-                safe_close(tstore)
-                try:
-                    os.close(fd)
-                except (OSError, ValueError):
-                    pass
-                os.remove(new_f)
-
-        # new table
-        df = tm.makeDataFrame()
-
-        with tm.ensure_clean() as path:
- with HDFStore(path) as st: - st.append("df", df, data_columns=["A"]) - do_copy(f=path) - do_copy(f=path, propindexes=False) - - -def test_duplicate_column_name(tmp_path, setup_path): - df = DataFrame(columns=["a", "a"], data=[[0, 0]]) - - path = tmp_path / setup_path - msg = "Columns index has to be unique for fixed format" - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", format="fixed") - - df.to_hdf(path, "df", format="table") - other = read_hdf(path, "df") - - tm.assert_frame_equal(df, other) - assert df.equals(other) - assert other.equals(df) - - -def test_preserve_timedeltaindex_type(setup_path): - # GH9635 - df = DataFrame(np.random.normal(size=(10, 5))) - df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example") - - with ensure_clean_store(setup_path) as store: - - store["df"] = df - tm.assert_frame_equal(store["df"], df) - - -def test_columns_multiindex_modified(tmp_path, setup_path): - # BUG: 7212 - - df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) - df.index.name = "letters" - df = df.set_index(keys="E", append=True) - - data_columns = df.index.names + df.columns.tolist() - path = tmp_path / setup_path - df.to_hdf( - path, - "df", - mode="a", - append=True, - data_columns=data_columns, - index=False, - ) - cols2load = list("BCD") - cols2load_original = list(cols2load) - # GH#10055 make sure read_hdf call does not alter cols2load inplace - read_hdf(path, "df", columns=cols2load) - assert cols2load_original == cols2load - - -def test_to_hdf_with_object_column_names(tmp_path, setup_path): - # GH9057 - - types_should_fail = [ - tm.makeIntIndex, - tm.makeFloatIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, - ] - types_should_run = [ - tm.makeStringIndex, - tm.makeCategoricalIndex, - ] - - for index in types_should_fail: - df = DataFrame(np.random.randn(10, 2), columns=index(2)) - path = tmp_path / setup_path - with catch_warnings(record=True): - msg = "cannot have non-object label DataIndexableCol" - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", format="table", data_columns=True) - - for index in types_should_run: - df = DataFrame(np.random.randn(10, 2), columns=index(2)) - path = tmp_path / setup_path - with catch_warnings(record=True): - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df", where=f"index = [{df.index[0]}]") - assert len(result) - - -def test_hdfstore_strides(setup_path): - # GH22073 - df = DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - with ensure_clean_store(setup_path) as store: - store.put("df", df) - assert df["a"].values.strides == store["df"]["a"].values.strides - - -def test_store_bool_index(tmp_path, setup_path): - # GH#48667 - df = DataFrame([[1]], columns=[True], index=Index([False], dtype="bool")) - expected = df.copy() - - # # Test to make sure defaults are to not drop. 
- # # Corresponding to Issue 9382 - path = tmp_path / setup_path - df.to_hdf(path, "a") - result = read_hdf(path, "a") - tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py deleted file mode 100644 index 823d2875c5417..0000000000000 --- a/pandas/tests/io/pytables/test_subclass.py +++ /dev/null @@ -1,52 +0,0 @@ -import numpy as np -import pytest - -from pandas import ( - DataFrame, - Series, -) -import pandas._testing as tm - -from pandas.io.pytables import ( - HDFStore, - read_hdf, -) - -pytest.importorskip("tables") - - -class TestHDFStoreSubclass: - # GH 33748 - def test_supported_for_subclass_dataframe(self, tmp_path): - data = {"a": [1, 2], "b": [3, 4]} - sdf = tm.SubclassedDataFrame(data, dtype=np.intp) - - expected = DataFrame(data, dtype=np.intp) - - path = tmp_path / "temp.h5" - sdf.to_hdf(path, "df") - result = read_hdf(path, "df") - tm.assert_frame_equal(result, expected) - - path = tmp_path / "temp.h5" - with HDFStore(path) as store: - store.put("df", sdf) - result = read_hdf(path, "df") - tm.assert_frame_equal(result, expected) - - def test_supported_for_subclass_series(self, tmp_path): - data = [1, 2, 3] - sser = tm.SubclassedSeries(data, dtype=np.intp) - - expected = Series(data, dtype=np.intp) - - path = tmp_path / "temp.h5" - sser.to_hdf(path, "ser") - result = read_hdf(path, "ser") - tm.assert_series_equal(result, expected) - - path = tmp_path / "temp.h5" - with HDFStore(path) as store: - store.put("ser", sser) - result = read_hdf(path, "ser") - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_time_series.py b/pandas/tests/io/pytables/test_time_series.py deleted file mode 100644 index 6625984961c11..0000000000000 --- a/pandas/tests/io/pytables/test_time_series.py +++ /dev/null @@ -1,66 +0,0 @@ -import datetime - -import numpy as np -import pytest - -from pandas import ( - DataFrame, - Series, - _testing as tm, -) -from pandas.tests.io.pytables.common import ensure_clean_store - -pytestmark = pytest.mark.single_cpu - - -def test_store_datetime_fractional_secs(setup_path): - - with ensure_clean_store(setup_path) as store: - dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) - series = Series([0], [dt]) - store["a"] = series - assert store["a"].index[0] == dt - - -def test_tseries_indices_series(setup_path): - - with ensure_clean_store(setup_path) as store: - idx = tm.makeDateIndex(10) - ser = Series(np.random.randn(len(idx)), idx) - store["a"] = ser - result = store["a"] - - tm.assert_series_equal(result, ser) - assert result.index.freq == ser.index.freq - tm.assert_class_equal(result.index, ser.index, obj="series index") - - idx = tm.makePeriodIndex(10) - ser = Series(np.random.randn(len(idx)), idx) - store["a"] = ser - result = store["a"] - - tm.assert_series_equal(result, ser) - assert result.index.freq == ser.index.freq - tm.assert_class_equal(result.index, ser.index, obj="series index") - - -def test_tseries_indices_frame(setup_path): - - with ensure_clean_store(setup_path) as store: - idx = tm.makeDateIndex(10) - df = DataFrame(np.random.randn(len(idx), 3), index=idx) - store["a"] = df - result = store["a"] - - tm.assert_frame_equal(result, df) - assert result.index.freq == df.index.freq - tm.assert_class_equal(result.index, df.index, obj="dataframe index") - - idx = tm.makePeriodIndex(10) - df = DataFrame(np.random.randn(len(idx), 3), idx) - store["a"] = df - result = store["a"] - - tm.assert_frame_equal(result, df) - assert result.index.freq == 
df.index.freq - tm.assert_class_equal(result.index, df.index, obj="dataframe index") diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py deleted file mode 100644 index ba125ffd28581..0000000000000 --- a/pandas/tests/io/pytables/test_timezones.py +++ /dev/null @@ -1,369 +0,0 @@ -from datetime import ( - date, - timedelta, -) - -import numpy as np -import pytest - -from pandas._libs.tslibs.timezones import maybe_get_tz -import pandas.util._test_decorators as td - -import pandas as pd -from pandas import ( - DataFrame, - DatetimeIndex, - Series, - Timestamp, - date_range, -) -import pandas._testing as tm -from pandas.tests.io.pytables.common import ( - _maybe_remove, - ensure_clean_store, -) - - -def _compare_with_tz(a, b): - tm.assert_frame_equal(a, b) - - # compare the zones on each element - for c in a.columns: - for i in a.index: - a_e = a.loc[i, c] - b_e = b.loc[i, c] - if not (a_e == b_e and a_e.tz == b_e.tz): - raise AssertionError(f"invalid tz comparison [{a_e}] [{b_e}]") - - -# use maybe_get_tz instead of dateutil.tz.gettz to handle the windows -# filename issues. -gettz_dateutil = lambda x: maybe_get_tz("dateutil/" + x) -gettz_pytz = lambda x: x - - -@pytest.mark.parametrize("gettz", [gettz_dateutil, gettz_pytz]) -def test_append_with_timezones(setup_path, gettz): - # as columns - - # Single-tzinfo, no DST transition - df_est = DataFrame( - { - "A": [ - Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) - + timedelta(hours=1) * i - for i in range(5) - ] - } - ) - - # frame with all columns having same tzinfo, but different sides - # of DST transition - df_crosses_dst = DataFrame( - { - "A": Timestamp("20130102", tz=gettz("US/Eastern")), - "B": Timestamp("20130603", tz=gettz("US/Eastern")), - }, - index=range(5), - ) - - df_mixed_tz = DataFrame( - { - "A": Timestamp("20130102", tz=gettz("US/Eastern")), - "B": Timestamp("20130102", tz=gettz("EET")), - }, - index=range(5), - ) - - df_different_tz = DataFrame( - { - "A": Timestamp("20130102", tz=gettz("US/Eastern")), - "B": Timestamp("20130102", tz=gettz("CET")), - }, - index=range(5), - ) - - with ensure_clean_store(setup_path) as store: - - _maybe_remove(store, "df_tz") - store.append("df_tz", df_est, data_columns=["A"]) - result = store["df_tz"] - _compare_with_tz(result, df_est) - tm.assert_frame_equal(result, df_est) - - # select with tz aware - expected = df_est[df_est.A >= df_est.A[3]] - result = store.select("df_tz", where="A>=df_est.A[3]") - _compare_with_tz(result, expected) - - # ensure we include dates in DST and STD time here. 
- _maybe_remove(store, "df_tz") - store.append("df_tz", df_crosses_dst) - result = store["df_tz"] - _compare_with_tz(result, df_crosses_dst) - tm.assert_frame_equal(result, df_crosses_dst) - - msg = ( - r"invalid info for \[values_block_1\] for \[tz\], " - r"existing_value \[(dateutil/.*)?US/Eastern\] " - r"conflicts with new value \[(dateutil/.*)?EET\]" - ) - with pytest.raises(ValueError, match=msg): - store.append("df_tz", df_mixed_tz) - - # this is ok - _maybe_remove(store, "df_tz") - store.append("df_tz", df_mixed_tz, data_columns=["A", "B"]) - result = store["df_tz"] - _compare_with_tz(result, df_mixed_tz) - tm.assert_frame_equal(result, df_mixed_tz) - - # can't append with diff timezone - msg = ( - r"invalid info for \[B\] for \[tz\], " - r"existing_value \[(dateutil/.*)?EET\] " - r"conflicts with new value \[(dateutil/.*)?CET\]" - ) - with pytest.raises(ValueError, match=msg): - store.append("df_tz", df_different_tz) - - -@pytest.mark.parametrize("gettz", [gettz_dateutil, gettz_pytz]) -def test_append_with_timezones_as_index(setup_path, gettz): - # GH#4098 example - - dti = date_range("2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern")) - dti = dti._with_freq(None) # freq doesn't round-trip - - df = DataFrame({"A": Series(range(3), index=dti)}) - - with ensure_clean_store(setup_path) as store: - - _maybe_remove(store, "df") - store.put("df", df) - result = store.select("df") - tm.assert_frame_equal(result, df) - - _maybe_remove(store, "df") - store.append("df", df) - result = store.select("df") - tm.assert_frame_equal(result, df) - - -def test_roundtrip_tz_aware_index(setup_path): - # GH 17618 - time = Timestamp("2000-01-01 01:00:00", tz="US/Eastern") - df = DataFrame(data=[0], index=[time]) - - with ensure_clean_store(setup_path) as store: - store.put("frame", df, format="fixed") - recons = store["frame"] - tm.assert_frame_equal(recons, df) - assert recons.index[0].value == 946706400000000000 - - -def test_store_index_name_with_tz(setup_path): - # GH 13884 - df = DataFrame({"A": [1, 2]}) - df.index = DatetimeIndex([1234567890123456787, 1234567890123456788]) - df.index = df.index.tz_localize("UTC") - df.index.name = "foo" - - with ensure_clean_store(setup_path) as store: - store.put("frame", df, format="table") - recons = store["frame"] - tm.assert_frame_equal(recons, df) - - -def test_tseries_select_index_column(setup_path): - # GH7777 - # selecting a UTC datetimeindex column did - # not preserve UTC tzinfo set before storing - - # check that no tz still works - rng = date_range("1/1/2000", "1/30/2000") - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - - with ensure_clean_store(setup_path) as store: - store.append("frame", frame) - result = store.select_column("frame", "index") - assert rng.tz == DatetimeIndex(result.values).tz - - # check utc - rng = date_range("1/1/2000", "1/30/2000", tz="UTC") - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - - with ensure_clean_store(setup_path) as store: - store.append("frame", frame) - result = store.select_column("frame", "index") - assert rng.tz == result.dt.tz - - # double check non-utc - rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - - with ensure_clean_store(setup_path) as store: - store.append("frame", frame) - result = store.select_column("frame", "index") - assert rng.tz == result.dt.tz - - -def test_timezones_fixed_format_frame_non_empty(setup_path): - with ensure_clean_store(setup_path) as store: - - # index - rng = 
date_range("1/1/2000", "1/30/2000", tz="US/Eastern") - rng = rng._with_freq(None) # freq doesn't round-trip - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - store["df"] = df - result = store["df"] - tm.assert_frame_equal(result, df) - - # as data - # GH11411 - _maybe_remove(store, "df") - df = DataFrame( - { - "A": rng, - "B": rng.tz_convert("UTC").tz_localize(None), - "C": rng.tz_convert("CET"), - "D": range(len(rng)), - }, - index=rng, - ) - store["df"] = df - result = store["df"] - tm.assert_frame_equal(result, df) - - -def test_timezones_fixed_format_empty(setup_path, tz_aware_fixture, frame_or_series): - # GH 20594 - - dtype = pd.DatetimeTZDtype(tz=tz_aware_fixture) - - obj = Series(dtype=dtype, name="A") - if frame_or_series is DataFrame: - obj = obj.to_frame() - - with ensure_clean_store(setup_path) as store: - store["obj"] = obj - result = store["obj"] - tm.assert_equal(result, obj) - - -def test_timezones_fixed_format_series_nonempty(setup_path, tz_aware_fixture): - # GH 20594 - - dtype = pd.DatetimeTZDtype(tz=tz_aware_fixture) - - with ensure_clean_store(setup_path) as store: - s = Series([0], dtype=dtype) - store["s"] = s - result = store["s"] - tm.assert_series_equal(result, s) - - -def test_fixed_offset_tz(setup_path): - rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00") - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - - with ensure_clean_store(setup_path) as store: - store["frame"] = frame - recons = store["frame"] - tm.assert_index_equal(recons.index, rng) - assert rng.tz == recons.index.tz - - -@td.skip_if_windows -def test_store_timezone(setup_path): - # GH2852 - # issue storing datetime.date with a timezone as it resets when read - # back in a new timezone - - # original method - with ensure_clean_store(setup_path) as store: - - today = date(2013, 9, 10) - df = DataFrame([1, 2, 3], index=[today, today, today]) - store["obj1"] = df - result = store["obj1"] - tm.assert_frame_equal(result, df) - - # with tz setting - with ensure_clean_store(setup_path) as store: - - with tm.set_timezone("EST5EDT"): - today = date(2013, 9, 10) - df = DataFrame([1, 2, 3], index=[today, today, today]) - store["obj1"] = df - - with tm.set_timezone("CST6CDT"): - result = store["obj1"] - - tm.assert_frame_equal(result, df) - - -def test_legacy_datetimetz_object(datapath): - # legacy from < 0.17.0 - # 8260 - expected = DataFrame( - { - "A": Timestamp("20130102", tz="US/Eastern"), - "B": Timestamp("20130603", tz="CET"), - }, - index=range(5), - ) - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r" - ) as store: - result = store["df"] - tm.assert_frame_equal(result, expected) - - -def test_dst_transitions(setup_path): - # make sure we are not failing on transitions - with ensure_clean_store(setup_path) as store: - times = date_range( - "2013-10-26 23:00", - "2013-10-27 01:00", - tz="Europe/London", - freq="H", - ambiguous="infer", - ) - times = times._with_freq(None) # freq doesn't round-trip - - for i in [times, times + pd.Timedelta("10min")]: - _maybe_remove(store, "df") - df = DataFrame({"A": range(len(i)), "B": i}, index=i) - store.append("df", df) - result = store.select("df") - tm.assert_frame_equal(result, df) - - -def test_read_with_where_tz_aware_index(tmp_path, setup_path): - # GH 11926 - periods = 10 - dts = date_range("20151201", periods=periods, freq="D", tz="UTC") - mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"]) - expected = DataFrame({"MYCOL": 0}, index=mi) - - key 
= "mykey" - path = tmp_path / setup_path - with pd.HDFStore(path) as store: - store.append(key, expected, format="table", append=True) - result = pd.read_hdf(path, key, where="DATE > 20151130") - tm.assert_frame_equal(result, expected) - - -def test_py2_created_with_datetimez(datapath): - # The test HDF5 file was created in Python 2, but could not be read in - # Python 3. - # - # GH26443 - index = [Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] - expected = DataFrame({"data": 123}, index=index) - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" - ) as store: - result = store["key"] - tm.assert_frame_equal(result, expected) From 5a80bf426edfec933d4fc4dd50faaddf3fc414a9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 17:02:59 +0100 Subject: [PATCH 08/14] Revert "Remov pytables tests" This reverts commit e84c458b5161143b0b7d3aed9254d801ac9e7d7a. --- pandas/tests/io/pytables/__init__.py | 0 pandas/tests/io/pytables/common.py | 51 + pandas/tests/io/pytables/conftest.py | 9 + pandas/tests/io/pytables/test_append.py | 935 +++++++++++++++ pandas/tests/io/pytables/test_categorical.py | 219 ++++ pandas/tests/io/pytables/test_compat.py | 75 ++ pandas/tests/io/pytables/test_complex.py | 200 ++++ pandas/tests/io/pytables/test_errors.py | 236 ++++ .../tests/io/pytables/test_file_handling.py | 446 ++++++++ pandas/tests/io/pytables/test_keys.py | 79 ++ pandas/tests/io/pytables/test_put.py | 372 ++++++ .../io/pytables/test_pytables_missing.py | 14 + pandas/tests/io/pytables/test_read.py | 344 ++++++ .../io/pytables/test_retain_attributes.py | 105 ++ pandas/tests/io/pytables/test_round_trip.py | 557 +++++++++ pandas/tests/io/pytables/test_select.py | 973 ++++++++++++++++ pandas/tests/io/pytables/test_store.py | 1018 +++++++++++++++++ pandas/tests/io/pytables/test_subclass.py | 52 + pandas/tests/io/pytables/test_time_series.py | 66 ++ pandas/tests/io/pytables/test_timezones.py | 369 ++++++ 20 files changed, 6120 insertions(+) create mode 100644 pandas/tests/io/pytables/__init__.py create mode 100644 pandas/tests/io/pytables/common.py create mode 100644 pandas/tests/io/pytables/conftest.py create mode 100644 pandas/tests/io/pytables/test_append.py create mode 100644 pandas/tests/io/pytables/test_categorical.py create mode 100644 pandas/tests/io/pytables/test_compat.py create mode 100644 pandas/tests/io/pytables/test_complex.py create mode 100644 pandas/tests/io/pytables/test_errors.py create mode 100644 pandas/tests/io/pytables/test_file_handling.py create mode 100644 pandas/tests/io/pytables/test_keys.py create mode 100644 pandas/tests/io/pytables/test_put.py create mode 100644 pandas/tests/io/pytables/test_pytables_missing.py create mode 100644 pandas/tests/io/pytables/test_read.py create mode 100644 pandas/tests/io/pytables/test_retain_attributes.py create mode 100644 pandas/tests/io/pytables/test_round_trip.py create mode 100644 pandas/tests/io/pytables/test_select.py create mode 100644 pandas/tests/io/pytables/test_store.py create mode 100644 pandas/tests/io/pytables/test_subclass.py create mode 100644 pandas/tests/io/pytables/test_time_series.py create mode 100644 pandas/tests/io/pytables/test_timezones.py diff --git a/pandas/tests/io/pytables/__init__.py b/pandas/tests/io/pytables/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/io/pytables/common.py b/pandas/tests/io/pytables/common.py new file mode 100644 index 0000000000000..9446d9df3a038 --- /dev/null +++ 
b/pandas/tests/io/pytables/common.py
@@ -0,0 +1,51 @@
+from contextlib import contextmanager
+import pathlib
+import tempfile
+from typing import Generator
+
+import pytest
+
+from pandas.io.pytables import HDFStore
+
+tables = pytest.importorskip("tables")
+# set these parameters so we don't have file sharing
+tables.parameters.MAX_NUMEXPR_THREADS = 1
+tables.parameters.MAX_BLOSC_THREADS = 1
+tables.parameters.MAX_THREADS = 1
+
+
+def safe_close(store):
+    try:
+        if store is not None:
+            store.close()
+    except OSError:
+        pass
+
+
+# contextmanager to ensure the file cleanup
+@contextmanager
+def ensure_clean_store(
+    path, mode="a", complevel=None, complib=None, fletcher32=False
+) -> Generator[HDFStore, None, None]:
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        tmp_path = pathlib.Path(tmpdirname, path)
+        with HDFStore(
+            tmp_path,
+            mode=mode,
+            complevel=complevel,
+            complib=complib,
+            fletcher32=fletcher32,
+        ) as store:
+            yield store
+
+
+def _maybe_remove(store, key):
+    """
+    For tests using tables, try removing the table to be sure there is
+    no content from previous tests using the same table name.
+    """
+    try:
+        store.remove(key)
+    except (ValueError, KeyError):
+        pass
diff --git a/pandas/tests/io/pytables/conftest.py b/pandas/tests/io/pytables/conftest.py
new file mode 100644
index 0000000000000..466e4ae8bb99c
--- /dev/null
+++ b/pandas/tests/io/pytables/conftest.py
@@ -0,0 +1,9 @@
+import uuid
+
+import pytest
+
+
+@pytest.fixture
+def setup_path():
+    """Fixture for setup path"""
+    return f"tmp.__{uuid.uuid4()}__.h5"
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
new file mode 100644
index 0000000000000..80562e77cae02
--- /dev/null
+++ b/pandas/tests/io/pytables/test_append.py
@@ -0,0 +1,935 @@
+import datetime
+from datetime import timedelta
+import re
+from warnings import catch_warnings
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import Timestamp
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Series,
+    _testing as tm,
+    concat,
+    date_range,
+    read_hdf,
+)
+from pandas.tests.io.pytables.common import (
+    _maybe_remove,
+    ensure_clean_store,
+)
+
+pytestmark = pytest.mark.single_cpu
+
+
+def test_append(setup_path):
+
+    with ensure_clean_store(setup_path) as store:
+
+        # this is allowed but almost always don't want to do it
+        # tables.NaturalNameWarning):
+        with catch_warnings(record=True):
+
+            df = tm.makeTimeDataFrame()
+            _maybe_remove(store, "df1")
+            store.append("df1", df[:10])
+            store.append("df1", df[10:])
+            tm.assert_frame_equal(store["df1"], df)
+
+            _maybe_remove(store, "df2")
+            store.put("df2", df[:10], format="table")
+            store.append("df2", df[10:])
+            tm.assert_frame_equal(store["df2"], df)
+
+            _maybe_remove(store, "df3")
+            store.append("/df3", df[:10])
+            store.append("/df3", df[10:])
+            tm.assert_frame_equal(store["df3"], df)
+
+            # this is allowed but almost always don't want to do it
+            # tables.NaturalNameWarning
+            _maybe_remove(store, "/df3 foo")
+            store.append("/df3 foo", df[:10])
+            store.append("/df3 foo", df[10:])
+            tm.assert_frame_equal(store["df3 foo"], df)
+
+            # dtype issues - mixed type in a single object column
+            df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
+            df["mixed_column"] = "testing"
+            df.loc[2, "mixed_column"] = np.nan
+            _maybe_remove(store, "df")
+            store.append("df", df)
+            tm.assert_frame_equal(store["df"], df)
+
+            # uints - test storage of uints
+            uint_data = DataFrame(
+                {
+                    "u08": Series(
np.random.randint(0, high=255, size=5), dtype=np.uint8 + ), + "u16": Series( + np.random.randint(0, high=65535, size=5), dtype=np.uint16 + ), + "u32": Series( + np.random.randint(0, high=2**30, size=5), dtype=np.uint32 + ), + "u64": Series( + [2**58, 2**59, 2**60, 2**61, 2**62], + dtype=np.uint64, + ), + }, + index=np.arange(5), + ) + _maybe_remove(store, "uints") + store.append("uints", uint_data) + tm.assert_frame_equal(store["uints"], uint_data) + + # uints - test storage of uints in indexable columns + _maybe_remove(store, "uints") + # 64-bit indices not yet supported + store.append("uints", uint_data, data_columns=["u08", "u16", "u32"]) + tm.assert_frame_equal(store["uints"], uint_data) + + +def test_append_series(setup_path): + + with ensure_clean_store(setup_path) as store: + + # basic + ss = tm.makeStringSeries() + ts = tm.makeTimeSeries() + ns = Series(np.arange(100)) + + store.append("ss", ss) + result = store["ss"] + tm.assert_series_equal(result, ss) + assert result.name is None + + store.append("ts", ts) + result = store["ts"] + tm.assert_series_equal(result, ts) + assert result.name is None + + ns.name = "foo" + store.append("ns", ns) + result = store["ns"] + tm.assert_series_equal(result, ns) + assert result.name == ns.name + + # select on the values + expected = ns[ns > 60] + result = store.select("ns", "foo>60") + tm.assert_series_equal(result, expected) + + # select on the index and values + expected = ns[(ns > 70) & (ns.index < 90)] + result = store.select("ns", "foo>70 and index<90") + tm.assert_series_equal(result, expected) + + # multi-index + mi = DataFrame(np.random.randn(5, 1), columns=["A"]) + mi["B"] = np.arange(len(mi)) + mi["C"] = "foo" + mi.loc[3:5, "C"] = "bar" + mi.set_index(["C", "B"], inplace=True) + s = mi.stack() + s.index = s.index.droplevel(2) + store.append("mi", s) + tm.assert_series_equal(store["mi"], s) + + +def test_append_some_nans(setup_path): + + with ensure_clean_store(setup_path) as store: + df = DataFrame( + { + "A": Series(np.random.randn(20)).astype("int32"), + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) + # some nans + _maybe_remove(store, "df1") + df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan + store.append("df1", df[:10]) + store.append("df1", df[10:]) + tm.assert_frame_equal(store["df1"], df) + + # first column + df1 = df.copy() + df1["A1"] = np.nan + _maybe_remove(store, "df1") + store.append("df1", df1[:10]) + store.append("df1", df1[10:]) + tm.assert_frame_equal(store["df1"], df1) + + # 2nd column + df2 = df.copy() + df2["A2"] = np.nan + _maybe_remove(store, "df2") + store.append("df2", df2[:10]) + store.append("df2", df2[10:]) + tm.assert_frame_equal(store["df2"], df2) + + # datetimes + df3 = df.copy() + df3["E"] = np.nan + _maybe_remove(store, "df3") + store.append("df3", df3[:10]) + store.append("df3", df3[10:]) + tm.assert_frame_equal(store["df3"], df3) + + +def test_append_all_nans(setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + {"A1": np.random.randn(20), "A2": np.random.randn(20)}, + index=np.arange(20), + ) + df.loc[0:15, :] = np.nan + + # nan some entire rows (dropna=True) + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df[-4:]) + + # nan some entire rows (dropna=False) + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) 
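# The dropna keyword used in this test controls whether rows that are
# entirely NaN get written on append. A minimal sketch of both behaviours;
# the file name and keys are illustrative only:
import numpy as np
import pandas as pd

df = pd.DataFrame({"A1": np.random.randn(4), "A2": np.random.randn(4)})
df.iloc[:2] = np.nan  # make the first two rows all-NaN

with pd.HDFStore("dropna_demo.h5", mode="w") as store:
    store.append("kept", df, dropna=False)   # all four rows are written
    store.append("pruned", df, dropna=True)  # all-NaN rows are skipped
    assert len(store["kept"]) == 4
    assert len(store["pruned"]) == 2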
+ store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) + + # tests the option io.hdf.dropna_table + with pd.option_context("io.hdf.dropna_table", False): + _maybe_remove(store, "df3") + store.append("df3", df[:10]) + store.append("df3", df[10:]) + tm.assert_frame_equal(store["df3"], df) + + with pd.option_context("io.hdf.dropna_table", True): + _maybe_remove(store, "df4") + store.append("df4", df[:10]) + store.append("df4", df[10:]) + tm.assert_frame_equal(store["df4"], df[-4:]) + + # nan some entire rows (string are still written!) + df = DataFrame( + { + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + }, + index=np.arange(20), + ) + + df.loc[0:15, :] = np.nan + + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df) + + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) + + # nan some entire rows (but since we have dates they are still + # written!) + df = DataFrame( + { + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) + + df.loc[0:15, :] = np.nan + + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df) + + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) + + +def test_append_frame_column_oriented(setup_path): + with ensure_clean_store(setup_path) as store: + + # column oriented + df = tm.makeTimeDataFrame() + df.index = df.index._with_freq(None) # freq doesn't round-trip + + _maybe_remove(store, "df1") + store.append("df1", df.iloc[:, :2], axes=["columns"]) + store.append("df1", df.iloc[:, 2:]) + tm.assert_frame_equal(store["df1"], df) + + result = store.select("df1", "columns=A") + expected = df.reindex(columns=["A"]) + tm.assert_frame_equal(expected, result) + + # selection on the non-indexable + result = store.select("df1", ("columns=A", "index=df.index[0:4]")) + expected = df.reindex(columns=["A"], index=df.index[0:4]) + tm.assert_frame_equal(expected, result) + + # this isn't supported + msg = re.escape( + "passing a filterable condition to a non-table indexer " + "[Filter: Not Initialized]" + ) + with pytest.raises(TypeError, match=msg): + store.select("df1", "columns=A and index>df.index[4]") + + +def test_append_with_different_block_ordering(setup_path): + + # GH 4096; using same frames, but different block orderings + with ensure_clean_store(setup_path) as store: + + for i in range(10): + + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) + df["index"] = range(10) + df["index"] += i * 10 + df["int64"] = Series([1] * len(df), dtype="int64") + df["int16"] = Series([1] * len(df), dtype="int16") + + if i % 2 == 0: + del df["int64"] + df["int64"] = Series([1] * len(df), dtype="int64") + if i % 3 == 0: + a = df.pop("A") + df["A"] = a + + df.set_index("index", inplace=True) + + store.append("df", df) + + # test a different ordering but with more fields (like invalid + # combinations) + with ensure_clean_store(setup_path) as store: + + df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64") + df["int64"] = Series([1] * len(df), 
dtype="int64") + df["int16"] = Series([1] * len(df), dtype="int16") + store.append("df", df) + + # store additional fields in different blocks + df["int16_2"] = Series([1] * len(df), dtype="int16") + msg = re.escape( + "cannot match existing table structure for [int16] on appending data" + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + # store multiple additional fields in different blocks + df["float_3"] = Series([1.0] * len(df), dtype="float64") + msg = re.escape( + "cannot match existing table structure for [A,B] on appending data" + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + +def test_append_with_strings(setup_path): + + with ensure_clean_store(setup_path) as store: + with catch_warnings(record=True): + + def check_col(key, name, size): + assert ( + getattr(store.get_storer(key).table.description, name).itemsize + == size + ) + + # avoid truncation on elements + df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) + store.append("df_big", df) + tm.assert_frame_equal(store.select("df_big"), df) + check_col("df_big", "values_block_1", 15) + + # appending smaller string ok + df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]]) + store.append("df_big", df2) + expected = concat([df, df2]) + tm.assert_frame_equal(store.select("df_big"), expected) + check_col("df_big", "values_block_1", 15) + + # avoid truncation on elements + df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) + store.append("df_big2", df, min_itemsize={"values": 50}) + tm.assert_frame_equal(store.select("df_big2"), df) + check_col("df_big2", "values_block_1", 50) + + # bigger string on next append + store.append("df_new", df) + df_new = DataFrame( + [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]] + ) + msg = ( + r"Trying to store a string with len \[26\] in " + r"\[values_block_1\] column but\n" + r"this column has a limit of \[15\]!\n" + "Consider using min_itemsize to preset the sizes on these " + "columns" + ) + with pytest.raises(ValueError, match=msg): + store.append("df_new", df_new) + + # min_itemsize on Series index (GH 11412) + df = tm.makeMixedDataFrame().set_index("C") + store.append("ss", df["B"], min_itemsize={"index": 4}) + tm.assert_series_equal(store.select("ss"), df["B"]) + + # same as above, with data_columns=True + store.append("ss2", df["B"], data_columns=True, min_itemsize={"index": 4}) + tm.assert_series_equal(store.select("ss2"), df["B"]) + + # min_itemsize in index without appending (GH 10381) + store.put("ss3", df, format="table", min_itemsize={"index": 6}) + # just make sure there is a longer string: + df2 = df.copy().reset_index().assign(C="longer").set_index("C") + store.append("ss3", df2) + tm.assert_frame_equal(store.select("ss3"), concat([df, df2])) + + # same as above, with a Series + store.put("ss4", df["B"], format="table", min_itemsize={"index": 6}) + store.append("ss4", df2["B"]) + tm.assert_series_equal(store.select("ss4"), concat([df["B"], df2["B"]])) + + # with nans + _maybe_remove(store, "df") + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df.loc[df.index[1:4], "string"] = np.nan + df["string2"] = "bar" + df.loc[df.index[4:8], "string2"] = np.nan + df["string3"] = "bah" + df.loc[df.index[1:], "string3"] = np.nan + store.append("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + with ensure_clean_store(setup_path) as store: + + df = DataFrame({"A": "foo", "B": "bar"}, index=range(10)) + + # a min_itemsize that creates a data_column + _maybe_remove(store, 
"df") + store.append("df", df, min_itemsize={"A": 200}) + check_col("df", "A", 200) + assert store.get_storer("df").data_columns == ["A"] + + # a min_itemsize that creates a data_column2 + _maybe_remove(store, "df") + store.append("df", df, data_columns=["B"], min_itemsize={"A": 200}) + check_col("df", "A", 200) + assert store.get_storer("df").data_columns == ["B", "A"] + + # a min_itemsize that creates a data_column2 + _maybe_remove(store, "df") + store.append("df", df, data_columns=["B"], min_itemsize={"values": 200}) + check_col("df", "B", 200) + check_col("df", "values_block_0", 200) + assert store.get_storer("df").data_columns == ["B"] + + # infer the .typ on subsequent appends + _maybe_remove(store, "df") + store.append("df", df[:5], min_itemsize=200) + store.append("df", df[5:], min_itemsize=200) + tm.assert_frame_equal(store["df"], df) + + # invalid min_itemsize keys + df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"]) + _maybe_remove(store, "df") + msg = re.escape( + "min_itemsize has the key [foo] which is not an axis or data_column" + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df, min_itemsize={"foo": 20, "foobar": 20}) + + +def test_append_with_empty_string(setup_path): + + with ensure_clean_store(setup_path) as store: + + # with all empty strings (GH 12242) + df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]}) + store.append("df", df[:-1], min_itemsize={"x": 1}) + store.append("df", df[-1:], min_itemsize={"x": 1}) + tm.assert_frame_equal(store.select("df"), df) + + +def test_append_with_data_columns(setup_path): + + with ensure_clean_store(setup_path) as store: + df = tm.makeTimeDataFrame() + df.iloc[0, df.columns.get_loc("B")] = 1.0 + _maybe_remove(store, "df") + store.append("df", df[:2], data_columns=["B"]) + store.append("df", df[2:]) + tm.assert_frame_equal(store["df"], df) + + # check that we have indices created + assert store._handle.root.df.table.cols.index.is_indexed is True + assert store._handle.root.df.table.cols.B.is_indexed is True + + # data column searching + result = store.select("df", "B>0") + expected = df[df.B > 0] + tm.assert_frame_equal(result, expected) + + # data column searching (with an indexable and a data_columns) + result = store.select("df", "B>0 and index>df.index[3]") + df_new = df.reindex(index=df.index[4:]) + expected = df_new[df_new.B > 0] + tm.assert_frame_equal(result, expected) + + # data column selection with a string data_column + df_new = df.copy() + df_new["string"] = "foo" + df_new.loc[df_new.index[1:4], "string"] = np.nan + df_new.loc[df_new.index[5:6], "string"] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"]) + result = store.select("df", "string='foo'") + expected = df_new[df_new.string == "foo"] + tm.assert_frame_equal(result, expected) + + # using min_itemsize and a data column + def check_col(key, name, size): + assert ( + getattr(store.get_storer(key).table.description, name).itemsize == size + ) + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"], min_itemsize={"string": 30}) + check_col("df", "string", 30) + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"], min_itemsize=30) + check_col("df", "string", 30) + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"], min_itemsize={"values": 30}) + check_col("df", "string", 30) + + with ensure_clean_store(setup_path) as store: + df_new["string2"] = 
"foobarbah" + df_new["string_block1"] = "foobarbah1" + df_new["string_block2"] = "foobarbah2" + _maybe_remove(store, "df") + store.append( + "df", + df_new, + data_columns=["string", "string2"], + min_itemsize={"string": 30, "string2": 40, "values": 50}, + ) + check_col("df", "string", 30) + check_col("df", "string2", 40) + check_col("df", "values_block_1", 50) + + with ensure_clean_store(setup_path) as store: + # multiple data columns + df_new = df.copy() + df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 + df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 + df_new["string"] = "foo" + + sl = df_new.columns.get_loc("string") + df_new.iloc[1:4, sl] = np.nan + df_new.iloc[5:6, sl] = "bar" + + df_new["string2"] = "foo" + sl = df_new.columns.get_loc("string2") + df_new.iloc[2:5, sl] = np.nan + df_new.iloc[7:8, sl] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) + result = store.select("df", "string='foo' and string2='foo' and A>0 and B<0") + expected = df_new[ + (df_new.string == "foo") + & (df_new.string2 == "foo") + & (df_new.A > 0) + & (df_new.B < 0) + ] + tm.assert_frame_equal(result, expected, check_freq=False) + # FIXME: 2020-05-07 freq check randomly fails in the CI + + # yield an empty frame + result = store.select("df", "string='foo' and string2='cool'") + expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] + tm.assert_frame_equal(result, expected) + + with ensure_clean_store(setup_path) as store: + # doc example + df_dc = df.copy() + df_dc["string"] = "foo" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" + df_dc["string2"] = "cool" + df_dc["datetime"] = Timestamp("20010102") + df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan + + _maybe_remove(store, "df_dc") + store.append( + "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] + ) + result = store.select("df_dc", "B>0") + + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected) + + result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] + tm.assert_frame_equal(result, expected, check_freq=False) + # FIXME: 2020-12-07 intermittent build failures here with freq of + # None instead of BDay(4) + + with ensure_clean_store(setup_path) as store: + # doc example part 2 + np.random.seed(1234) + index = date_range("1/1/2000", periods=8) + df_dc = DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"]) + df_dc["string"] = "foo" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" + df_dc[["B", "C"]] = df_dc[["B", "C"]].abs() + df_dc["string2"] = "cool" + + # on-disk operations + store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) + + result = store.select("df_dc", "B>0") + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected) + + result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] + tm.assert_frame_equal(result, expected) + + +def test_append_hierarchical(tmp_path, setup_path, multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + df.columns.name = None + + with ensure_clean_store(setup_path) as store: + store.append("mi", df) + result = store.select("mi") + tm.assert_frame_equal(result, df) + + # GH 3748 + result = store.select("mi", columns=["A", "B"]) + 
expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + path = tmp_path / "test.hdf" + df.to_hdf(path, "df", format="table") + result = read_hdf(path, "df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +def test_append_misc(setup_path): + + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + store.append("df", df, chunksize=1) + result = store.select("df") + tm.assert_frame_equal(result, df) + + store.append("df1", df, expectedrows=10) + result = store.select("df1") + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("chunksize", [10, 200, 1000]) +def test_append_misc_chunksize(setup_path, chunksize): + # more chunksize in append tests + df = tm.makeDataFrame() + df["string"] = "foo" + df["float322"] = 1.0 + df["float322"] = df["float322"].astype("float32") + df["bool"] = df["float322"] > 0 + df["time1"] = Timestamp("20130101") + df["time2"] = Timestamp("20130102") + with ensure_clean_store(setup_path, mode="w") as store: + store.append("obj", df, chunksize=chunksize) + result = store.select("obj") + tm.assert_frame_equal(result, df) + + +def test_append_misc_empty_frame(setup_path): + # empty frame, GH4273 + with ensure_clean_store(setup_path) as store: + + # 0 len + df_empty = DataFrame(columns=list("ABC")) + store.append("df", df_empty) + with pytest.raises(KeyError, match="'No object named df in the file'"): + store.select("df") + + # repeated append of 0/non-zero frames + df = DataFrame(np.random.rand(10, 3), columns=list("ABC")) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + store.append("df", df_empty) + tm.assert_frame_equal(store.select("df"), df) + + # store + df = DataFrame(columns=list("ABC")) + store.put("df2", df) + tm.assert_frame_equal(store.select("df2"), df) + + +# TODO(ArrayManager) currently we rely on falling back to BlockManager, but +# the conversion from AM->BM converts the invalid object dtype column into +# a datetime64 column no longer raising an error +@td.skip_array_manager_not_yet_implemented +def test_append_raise(setup_path): + + with ensure_clean_store(setup_path) as store: + + # test append with invalid input to get good error messages + + # list in column + df = tm.makeDataFrame() + df["invalid"] = [["a"]] * len(df) + assert df.dtypes["invalid"] == np.object_ + msg = re.escape( + """Cannot serialize the column [invalid] +because its data contents are not [string] but [mixed] object dtype""" + ) + with pytest.raises(TypeError, match=msg): + store.append("df", df) + + # multiple invalid columns + df["invalid2"] = [["a"]] * len(df) + df["invalid3"] = [["a"]] * len(df) + with pytest.raises(TypeError, match=msg): + store.append("df", df) + + # datetime with embedded nans as object + df = tm.makeDataFrame() + s = Series(datetime.datetime(2001, 1, 2), index=df.index) + s = s.astype(object) + s[0:5] = np.nan + df["invalid"] = s + assert df.dtypes["invalid"] == np.object_ + msg = "too many timezones in this block, create separate data columns" + with pytest.raises(TypeError, match=msg): + store.append("df", df) + + # directly ndarray + msg = "value must be None, Series, or DataFrame" + with pytest.raises(TypeError, match=msg): + store.append("df", np.arange(10)) + + # series directly + msg = re.escape( + "cannot properly create the storer for: " + "[group->df,value->]" + ) + with pytest.raises(TypeError, match=msg): + store.append("df", Series(np.arange(10))) + + # appending an incompatible table + df = 
tm.makeDataFrame() + store.append("df", df) + + df["foo"] = "foo" + msg = re.escape( + "invalid combination of [non_index_axes] on appending data " + "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table " + "[(1, ['A', 'B', 'C', 'D'])]" + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + # incompatible type (GH 41897) + _maybe_remove(store, "df") + df["foo"] = Timestamp("20130101") + store.append("df", df) + df["foo"] = "bar" + msg = re.escape( + "invalid combination of [values_axes] on appending data " + "[name->values_block_1,cname->values_block_1," + "dtype->bytes24,kind->string,shape->(1, 30)] " + "vs current table " + "[name->values_block_1,cname->values_block_1," + "dtype->datetime64,kind->datetime64,shape->None]" + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + +def test_append_with_timedelta(setup_path): + # GH 3577 + # append timedelta + + df = DataFrame( + { + "A": Timestamp("20130101"), + "B": [ + Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10) + ], + } + ) + df["C"] = df["A"] - df["B"] + df.loc[3:5, "C"] = np.nan + + with ensure_clean_store(setup_path) as store: + + # table + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + result = store.select("df") + tm.assert_frame_equal(result, df) + + result = store.select("df", where="C<100000") + tm.assert_frame_equal(result, df) + + result = store.select("df", where="C0", "B>0"], selector="df1" + ) + expected = df[(df.A > 0) & (df.B > 0)] + tm.assert_frame_equal(result, expected) + + +def test_append_to_multiple_dropna(setup_path): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan + df = concat([df1, df2], axis=1) + + with ensure_clean_store(setup_path) as store: + + # dropna=True should guarantee rows are synchronized + store.append_to_multiple( + {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True + ) + result = store.select_as_multiple(["df1", "df2"]) + expected = df.dropna() + tm.assert_frame_equal(result, expected) + tm.assert_index_equal(store.select("df1").index, store.select("df2").index) + + +def test_append_to_multiple_dropna_false(setup_path): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan + df = concat([df1, df2], axis=1) + + with ensure_clean_store(setup_path) as store, pd.option_context( + "io.hdf.dropna_table", True + ): + # dropna=False shouldn't synchronize row indexes + store.append_to_multiple( + {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False + ) + + msg = "all tables must have exactly the same nrows!" 
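# append_to_multiple, exercised in these tests, splits one frame's columns
# across several tables, and select_as_multiple joins them back using a
# selector table for the row coordinates. A minimal sketch; the file name
# and keys are illustrative only:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(8, 4), columns=list("ABCD"))
with pd.HDFStore("multi_demo.h5", mode="w") as store:
    # columns A and B go to "df1", the remaining columns to "df2";
    # "df1" is the selector whose rows drive the where clause
    store.append_to_multiple({"df1": ["A", "B"], "df2": None}, df, selector="df1")
    result = store.select_as_multiple(
        ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
    )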
+ with pytest.raises(ValueError, match=msg): + store.select_as_multiple(["df1a", "df2a"]) + + assert not store.select("df1a").index.equals(store.select("df2a").index) + + +def test_append_to_multiple_min_itemsize(setup_path): + # GH 11238 + df = DataFrame( + { + "IX": np.arange(1, 21), + "Num": np.arange(1, 21), + "BigNum": np.arange(1, 21) * 88, + "Str": ["a" for _ in range(20)], + "LongStr": ["abcde" for _ in range(20)], + } + ) + expected = df.iloc[[0]] + + with ensure_clean_store(setup_path) as store: + store.append_to_multiple( + { + "index": ["IX"], + "nums": ["Num", "BigNum"], + "strs": ["Str", "LongStr"], + }, + df.iloc[[0]], + "index", + min_itemsize={"Str": 10, "LongStr": 100, "Num": 2}, + ) + result = store.select_as_multiple(["index", "nums", "strs"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py new file mode 100644 index 0000000000000..7c2ab9b4f6ec0 --- /dev/null +++ b/pandas/tests/io/pytables/test_categorical.py @@ -0,0 +1,219 @@ +import numpy as np +import pytest + +from pandas import ( + Categorical, + DataFrame, + Series, + _testing as tm, + concat, + read_hdf, +) +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_store, +) + +pytestmark = [ + pytest.mark.single_cpu, +] + + +def test_categorical(setup_path): + + with ensure_clean_store(setup_path) as store: + + # Basic + _maybe_remove(store, "s") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s", s, format="table") + result = store.select("s") + tm.assert_series_equal(s, result) + + _maybe_remove(store, "s_ordered") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) + ) + store.append("s_ordered", s, format="table") + result = store.select("s_ordered") + tm.assert_series_equal(s, result) + + _maybe_remove(store, "df") + df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) + store.append("df", df, format="table") + result = store.select("df") + tm.assert_frame_equal(result, df) + + # Dtypes + _maybe_remove(store, "si") + s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category") + store.append("si", s) + result = store.select("si") + tm.assert_series_equal(result, s) + + _maybe_remove(store, "si2") + s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category") + store.append("si2", s) + result = store.select("si2") + tm.assert_series_equal(result, s) + + # Multiple + _maybe_remove(store, "df2") + df2 = df.copy() + df2["s2"] = Series(list("abcdefg")).astype("category") + store.append("df2", df2) + result = store.select("df2") + tm.assert_frame_equal(result, df2) + + # Make sure the metadata is OK + info = store.info() + assert "/df2 " in info + # df2._mgr.blocks[0] and df2._mgr.blocks[2] are Categorical + assert "/df2/meta/values_block_0/meta" in info + assert "/df2/meta/values_block_2/meta" in info + + # unordered + _maybe_remove(store, "s2") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s2", s, format="table") + result = store.select("s2") + tm.assert_series_equal(result, s) + + # Query + _maybe_remove(store, "df3") + store.append("df3", df, data_columns=["s"]) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["b", "c"])] + result = 
store.select("df3", where=['s = ["b","c"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["d"])] + result = store.select("df3", where=['s in ["d"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["f"])] + result = store.select("df3", where=['s in ["f"]']) + tm.assert_frame_equal(result, expected) + + # Appending with same categories is ok + store.append("df3", df) + + df = concat([df, df]) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) + tm.assert_frame_equal(result, expected) + + # Appending must have the same categories + df3 = df.copy() + df3["s"] = df3["s"].cat.remove_unused_categories() + + msg = "cannot append a categorical with different categories to the existing" + with pytest.raises(ValueError, match=msg): + store.append("df3", df3) + + # Remove, and make sure meta data is removed (its a recursive + # removal so should be). + result = store.select("df3/meta/s/meta") + assert result is not None + store.remove("df3") + + with pytest.raises( + KeyError, match="'No object named df3/meta/s/meta in the file'" + ): + store.select("df3/meta/s/meta") + + +def test_categorical_conversion(tmp_path, setup_path): + + # GH13322 + # Check that read_hdf with categorical columns doesn't return rows if + # where criteria isn't met. + obsids = ["ESP_012345_6789", "ESP_987654_3210"] + imgids = ["APF00006np", "APF0001imm"] + data = [4.3, 9.8] + + # Test without categories + df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data}) + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + path = tmp_path / setup_path + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") + tm.assert_frame_equal(result, expected) + + # Test with categories + df.obsids = df.obsids.astype("category") + df.imgids = df.imgids.astype("category") + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + path = tmp_path / setup_path + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") + tm.assert_frame_equal(result, expected) + + +def test_categorical_nan_only_columns(tmp_path, setup_path): + # GH18413 + # Check that read_hdf with categorical columns with NaN-only values can + # be read back. + df = DataFrame( + { + "a": ["a", "b", "c", np.nan], + "b": [np.nan, np.nan, np.nan, np.nan], + "c": [1, 2, 3, 4], + "d": Series([None] * 4, dtype=object), + } + ) + df["a"] = df.a.astype("category") + df["b"] = df.b.astype("category") + df["d"] = df.b.astype("category") + expected = df + path = tmp_path / setup_path + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "where, df, expected", + [ + ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})), + ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})), + ], +) +def test_convert_value( + tmp_path, setup_path, where: str, df: DataFrame, expected: DataFrame +): + # GH39420 + # Check that read_hdf with categorical columns can filter by where condition. 
+ df.col = df.col.astype("category") + max_widths = {"col": 1} + categorical_values = sorted(df.col.unique()) + expected.col = expected.col.astype("category") + expected.col = expected.col.cat.set_categories(categorical_values) + + path = tmp_path / setup_path + df.to_hdf(path, "df", format="table", min_itemsize=max_widths) + result = read_hdf(path, where=where) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py new file mode 100644 index 0000000000000..7f71d2666c9ae --- /dev/null +++ b/pandas/tests/io/pytables/test_compat.py @@ -0,0 +1,75 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + +tables = pytest.importorskip("tables") + + +@pytest.fixture +def pytables_hdf5_file(tmp_path): + """ + Use PyTables to create a simple HDF5 file. + """ + table_schema = { + "c0": tables.Time64Col(pos=0), + "c1": tables.StringCol(5, pos=1), + "c2": tables.Int64Col(pos=2), + } + + t0 = 1_561_105_000.0 + + testsamples = [ + {"c0": t0, "c1": "aaaaa", "c2": 1}, + {"c0": t0 + 1, "c1": "bbbbb", "c2": 2}, + {"c0": t0 + 2, "c1": "ccccc", "c2": 10**5}, + {"c0": t0 + 3, "c1": "ddddd", "c2": 4_294_967_295}, + ] + + objname = "pandas_test_timeseries" + + path = tmp_path / "written_with_pytables.h5" + with tables.open_file(path, mode="w") as f: + t = f.create_table("/", name=objname, description=table_schema) + for sample in testsamples: + for key, value in sample.items(): + t.row[key] = value + t.row.append() + + yield path, objname, pd.DataFrame(testsamples) + + +class TestReadPyTablesHDF5: + """ + A group of tests which covers reading HDF5 files written by plain PyTables + (not written by pandas). + + Was introduced for regression-testing issue 11188. + """ + + def test_read_complete(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + result = pd.read_hdf(path, key=objname) + expected = df + tm.assert_frame_equal(result, expected) + + def test_read_with_start(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, start=1) + expected = df[1:].reset_index(drop=True) + tm.assert_frame_equal(result, expected) + + def test_read_with_stop(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, stop=1) + expected = df[:1].reset_index(drop=True) + tm.assert_frame_equal(result, expected) + + def test_read_with_startstop(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, start=1, stop=2) + expected = df[1:2].reset_index(drop=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py new file mode 100644 index 0000000000000..870458e93689f --- /dev/null +++ b/pandas/tests/io/pytables/test_complex.py @@ -0,0 +1,200 @@ +from warnings import catch_warnings + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm +from pandas.tests.io.pytables.common import ensure_clean_store + +from pandas.io.pytables import read_hdf + + +def test_complex_fixed(tmp_path, setup_path): + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) + + 
path = tmp_path / setup_path + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) + path = tmp_path / setup_path + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + +def test_complex_table(tmp_path, setup_path): + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) + + path = tmp_path / setup_path + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) + + path = tmp_path / setup_path + df.to_hdf(path, "df", format="table", mode="w") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + +def test_complex_mixed_fixed(tmp_path, setup_path): + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) + path = tmp_path / setup_path + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + +def test_complex_mixed_table(tmp_path, setup_path): + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["A", "B"]) + result = store.select("df", where="A>2") + tm.assert_frame_equal(df.loc[df.A > 2], result) + + path = tmp_path / setup_path + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + +def test_complex_across_dimensions_fixed(tmp_path, setup_path): + with catch_warnings(record=True): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) + + objs = [s, df] + comps = [tm.assert_series_equal, tm.assert_frame_equal] + for obj, comp in zip(objs, comps): + path = tmp_path / setup_path + obj.to_hdf(path, "obj", format="fixed") + reread = read_hdf(path, "obj") + comp(obj, reread) + + +def test_complex_across_dimensions(tmp_path, setup_path): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) + + with catch_warnings(record=True): + + objs = [df] + comps = [tm.assert_frame_equal] + for obj, comp in zip(objs, comps): + path = tmp_path / setup_path + obj.to_hdf(path, "obj", format="table") + reread = read_hdf(path, "obj") + comp(obj, reread) + + +def test_complex_indexing_error(setup_path): + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + {"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128}, + index=list("abcd"), + ) + + msg = ( + "Columns containing complex values can be stored " + "but cannot be 
indexed when using table format. " + "Either use fixed format, set index=False, " + "or do not include the columns containing complex " + "values to data_columns when initializing the table." + ) + + with ensure_clean_store(setup_path) as store: + with pytest.raises(TypeError, match=msg): + store.append("df", df, data_columns=["C"]) + + +def test_complex_series_error(tmp_path, setup_path): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + + msg = ( + "Columns containing complex values can be stored " + "but cannot be indexed when using table format. " + "Either use fixed format, set index=False, " + "or do not include the columns containing complex " + "values to data_columns when initializing the table." + ) + + path = tmp_path / setup_path + with pytest.raises(TypeError, match=msg): + s.to_hdf(path, "obj", format="t") + + path = tmp_path / setup_path + s.to_hdf(path, "obj", format="t", index=False) + reread = read_hdf(path, "obj") + tm.assert_series_equal(s, reread) + + +def test_complex_append(setup_path): + df = DataFrame( + {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)} + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["b"]) + store.append("df", df) + result = store.select("df") + tm.assert_frame_equal(pd.concat([df, df], axis=0), result) diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py new file mode 100644 index 0000000000000..7629e8ca7dfc2 --- /dev/null +++ b/pandas/tests/io/pytables/test_errors.py @@ -0,0 +1,236 @@ +import datetime +from io import BytesIO +import re +from warnings import catch_warnings + +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + DataFrame, + HDFStore, + MultiIndex, + _testing as tm, + date_range, + read_hdf, +) +from pandas.tests.io.pytables.common import ensure_clean_store + +from pandas.io.pytables import ( + Term, + _maybe_adjust_name, +) + +pytestmark = pytest.mark.single_cpu + + +def test_pass_spec_to_storer(setup_path): + + df = tm.makeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("df", df) + msg = ( + "cannot pass a column specification when reading a Fixed format " + "store. this store must be selected in its entirety" + ) + with pytest.raises(TypeError, match=msg): + store.select("df", columns=["A"]) + msg = ( + "cannot pass a where specification when reading from a Fixed " + "format store. 
this store must be selected in its entirety" + ) + with pytest.raises(TypeError, match=msg): + store.select("df", where=[("columns=A")]) + + +def test_table_index_incompatible_dtypes(setup_path): + df1 = DataFrame({"a": [1, 2, 3]}) + df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3)) + + with ensure_clean_store(setup_path) as store: + store.put("frame", df1, format="table") + msg = re.escape("incompatible kind in col [integer - datetime64]") + with pytest.raises(TypeError, match=msg): + store.put("frame", df2, format="table", append=True) + + +def test_unimplemented_dtypes_table_columns(setup_path): + + with ensure_clean_store(setup_path) as store: + + dtypes = [("date", datetime.date(2001, 1, 2))] + + # currently not supported dtypes #### + for n, f in dtypes: + df = tm.makeDataFrame() + df[n] = f + msg = re.escape(f"[{n}] is not implemented as a table column") + with pytest.raises(TypeError, match=msg): + store.append(f"df1_{n}", df) + + # frame + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["datetime1"] = datetime.date(2001, 1, 2) + df = df._consolidate() + + with ensure_clean_store(setup_path) as store: + # this fails because we have a date in the object block...... + msg = re.escape( + """Cannot serialize the column [datetime1] +because its data contents are not [string] but [date] object dtype""" + ) + with pytest.raises(TypeError, match=msg): + store.append("df_unimplemented", df) + + +def test_invalid_terms(tmp_path, setup_path): + + with ensure_clean_store(setup_path) as store: + + with catch_warnings(record=True): + + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df.loc[df.index[0:4], "string"] = "bar" + + store.put("df", df, format="table") + + # some invalid terms + msg = re.escape( + "__init__() missing 1 required positional argument: 'where'" + ) + with pytest.raises(TypeError, match=msg): + Term() + + # more invalid + msg = re.escape( + "cannot process expression [df.index[3]], " + "[2000-01-06 00:00:00] is not a valid condition" + ) + with pytest.raises(ValueError, match=msg): + store.select("df", "df.index[3]") + + msg = "invalid syntax" + with pytest.raises(SyntaxError, match=msg): + store.select("df", "index>") + + # from the docs + path = tmp_path / setup_path + dfq = DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=date_range("20130101", periods=10), + ) + dfq.to_hdf(path, "dfq", format="table", data_columns=True) + + # check ok + read_hdf(path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']") + read_hdf(path, "dfq", where="A>0 or C>0") + + # catch the invalid reference + path = tmp_path / setup_path + dfq = DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=date_range("20130101", periods=10), + ) + dfq.to_hdf(path, "dfq", format="table") + + msg = ( + r"The passed where expression: A>0 or C>0\n\s*" + r"contains an invalid variable reference\n\s*" + r"all of the variable references must be a reference to\n\s*" + r"an axis \(e.g. 
'index' or 'columns'\), or a data_column\n\s*"
+        r"The currently defined references are: index,columns\n"
+    )
+    with pytest.raises(ValueError, match=msg):
+        read_hdf(path, "dfq", where="A>0 or C>0")
+
+
+def test_append_with_diff_col_name_types_raises_value_error(setup_path):
+    df = DataFrame(np.random.randn(10, 1))
+    df2 = DataFrame({"a": np.random.randn(10)})
+    df3 = DataFrame({(1, 2): np.random.randn(10)})
+    df4 = DataFrame({("1", 2): np.random.randn(10)})
+    df5 = DataFrame({("1", 2, object): np.random.randn(10)})
+
+    with ensure_clean_store(setup_path) as store:
+        name = f"df_{tm.rands(10)}"
+        store.append(name, df)
+
+        for d in (df2, df3, df4, df5):
+            msg = re.escape(
+                "cannot match existing table structure for [0] on appending data"
+            )
+            with pytest.raises(ValueError, match=msg):
+                store.append(name, d)
+
+
+def test_invalid_complib(setup_path):
+    df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
+    with tm.ensure_clean(setup_path) as path:
+        msg = r"complib only supports \[.*\] compression."
+        with pytest.raises(ValueError, match=msg):
+            df.to_hdf(path, "df", complib="foolib")
+
+
+@pytest.mark.parametrize(
+    "idx",
+    [
+        date_range("2019", freq="D", periods=3, tz="UTC"),
+        CategoricalIndex(list("abc")),
+    ],
+)
+def test_to_hdf_multiindex_extension_dtype(idx, tmp_path, setup_path):
+    # GH 7775
+    mi = MultiIndex.from_arrays([idx, idx])
+    df = DataFrame(0, index=mi, columns=["a"])
+    path = tmp_path / setup_path
+    with pytest.raises(NotImplementedError, match="Saving a MultiIndex"):
+        df.to_hdf(path, "df")
+
+
+def test_unsupported_hdf_file_error(datapath):
+    # GH 9539
+    data_path = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5")
+    message = (
+        r"Dataset\(s\) incompatible with Pandas data types, "
+        "not table, or no datasets found in HDF5 file."
+    )
+
+    with pytest.raises(ValueError, match=message):
+        read_hdf(data_path)
+
+
+def test_read_hdf_errors(setup_path, tmp_path):
+    df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
+
+    path = tmp_path / setup_path
+    msg = r"File [\S]* does not exist"
+    with pytest.raises(OSError, match=msg):
+        read_hdf(path, "key")
+
+    df.to_hdf(path, "df")
+    store = HDFStore(path, mode="r")
+    store.close()
+
+    msg = "The HDFStore must be open for reading."
+    with pytest.raises(OSError, match=msg):
+        read_hdf(store, "df")
+
+
+def test_read_hdf_generic_buffer_errors():
+    msg = "Support for generic buffers has not been implemented."
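+    # Reminder on the pattern used throughout this module: pytest.raises()
+    # treats ``match`` as a regular expression searched against the exception
+    # message, which is why literal messages containing metacharacters are
+    # wrapped in re.escape() elsewhere. A minimal sketch (hypothetical message):
+    #
+    #     with pytest.raises(ValueError, match=re.escape("bad key [x]")):
+    #         raise ValueError("bad key [x]")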
+ with pytest.raises(NotImplementedError, match=msg): + read_hdf(BytesIO(b""), "df") + + +@pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"]) +def test_maybe_adjust_name_bad_version_raises(bad_version): + msg = "Version is incorrect, expected sequence of 3 integers" + with pytest.raises(ValueError, match=msg): + _maybe_adjust_name("values_block_0", version=bad_version) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py new file mode 100644 index 0000000000000..19a92163c6dd2 --- /dev/null +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -0,0 +1,446 @@ +import os + +import numpy as np +import pytest + +from pandas.compat import is_platform_little_endian +from pandas.errors import ( + ClosedFileError, + PossibleDataLossError, +) + +from pandas import ( + DataFrame, + HDFStore, + Series, + _testing as tm, + read_hdf, +) +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_store, + tables, +) + +from pandas.io import pytables +from pandas.io.pytables import Term + +pytestmark = pytest.mark.single_cpu + + +@pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) +def test_mode(setup_path, tmp_path, mode): + + df = tm.makeTimeDataFrame() + msg = r"[\S]* does not exist" + path = tmp_path / setup_path + + # constructor + if mode in ["r", "r+"]: + with pytest.raises(OSError, match=msg): + HDFStore(path, mode=mode) + + else: + with HDFStore(path, mode=mode) as store: + assert store._handle.mode == mode + + path = tmp_path / setup_path + + # context + if mode in ["r", "r+"]: + with pytest.raises(OSError, match=msg): + with HDFStore(path, mode=mode) as store: + pass + else: + with HDFStore(path, mode=mode) as store: + assert store._handle.mode == mode + + path = tmp_path / setup_path + + # conv write + if mode in ["r", "r+"]: + with pytest.raises(OSError, match=msg): + df.to_hdf(path, "df", mode=mode) + df.to_hdf(path, "df", mode="w") + else: + df.to_hdf(path, "df", mode=mode) + + # conv read + if mode in ["w"]: + msg = ( + "mode w is not allowed while performing a read. " + r"Allowed modes are r, r\+ and a." + ) + with pytest.raises(ValueError, match=msg): + read_hdf(path, "df", mode=mode) + else: + result = read_hdf(path, "df", mode=mode) + tm.assert_frame_equal(result, df) + + +def test_default_mode(tmp_path, setup_path): + # read_hdf uses default mode + df = tm.makeTimeDataFrame() + path = tmp_path / setup_path + df.to_hdf(path, "df", mode="w") + result = read_hdf(path, "df") + tm.assert_frame_equal(result, df) + + +def test_reopen_handle(tmp_path, setup_path): + + path = tmp_path / setup_path + + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() + + msg = ( + r"Re-opening the file \[[\S]*\] with mode \[a\] will delete the " + "current file!" 
+ ) + # invalid mode change + with pytest.raises(PossibleDataLossError, match=msg): + store.open("w") + + store.close() + assert not store.is_open + + # truncation ok here + store.open("w") + assert store.is_open + assert len(store) == 0 + store.close() + assert not store.is_open + + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() + + # reopen as read + store.open("r") + assert store.is_open + assert len(store) == 1 + assert store._mode == "r" + store.close() + assert not store.is_open + + # reopen as append + store.open("a") + assert store.is_open + assert len(store) == 1 + assert store._mode == "a" + store.close() + assert not store.is_open + + # reopen as append (again) + store.open("a") + assert store.is_open + assert len(store) == 1 + assert store._mode == "a" + store.close() + assert not store.is_open + + +def test_open_args(setup_path): + + with tm.ensure_clean(setup_path) as path: + + df = tm.makeDataFrame() + + # create an in memory store + store = HDFStore( + path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 + ) + store["df"] = df + store.append("df2", df) + + tm.assert_frame_equal(store["df"], df) + tm.assert_frame_equal(store["df2"], df) + + store.close() + + # the file should not have actually been written + assert not os.path.exists(path) + + +def test_flush(setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store.flush() + store.flush(fsync=True) + + +def test_complibs_default_settings(tmp_path, setup_path): + # GH15943 + df = tm.makeDataFrame() + + # Set complevel and check if complib is automatically set to + # default value + tmpfile = tmp_path / setup_path + df.to_hdf(tmpfile, "df", complevel=9) + result = read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 9 + assert node.filters.complib == "zlib" + + # Set complib and check to see if compression is disabled + tmpfile = tmp_path / setup_path + df.to_hdf(tmpfile, "df", complib="zlib") + result = read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 0 + assert node.filters.complib is None + + # Check if not setting complib or complevel results in no compression + tmpfile = tmp_path / setup_path + df.to_hdf(tmpfile, "df") + result = read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 0 + assert node.filters.complib is None + + +def test_complibs_default_settings_override(tmp_path, setup_path): + # Check if file-defaults can be overridden on a per table basis + df = tm.makeDataFrame() + tmpfile = tmp_path / setup_path + store = HDFStore(tmpfile) + store.append("dfc", df, complevel=9, complib="blosc") + store.append("df", df) + store.close() + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 0 + assert node.filters.complib is None + for node in h5file.walk_nodes(where="/dfc", classname="Leaf"): + assert node.filters.complevel == 9 + assert node.filters.complib == "blosc" + + +def test_complibs(tmp_path, setup_path): + # GH14478 + df = tm.makeDataFrame() + + # 
Build a list of all (complib, complevel) tuples to test
+    all_complibs = tables.filters.all_complibs
+    # Remove lzo if it's not available on this platform
+    if not tables.which_lib_version("lzo"):
+        all_complibs.remove("lzo")
+    # Remove bzip2 if it's not available on this platform
+    if not tables.which_lib_version("bzip2"):
+        all_complibs.remove("bzip2")
+
+    all_levels = range(0, 10)
+    all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels]
+
+    for (lib, lvl) in all_tests:
+        tmpfile = tmp_path / setup_path
+        gname = "foo"
+
+        # Write and read file to see if data is consistent
+        df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl)
+        result = read_hdf(tmpfile, gname)
+        tm.assert_frame_equal(result, df)
+
+        # Open file and check metadata for correct amount of compression
+        with tables.open_file(tmpfile, mode="r") as h5table:
+            for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"):
+                assert node.filters.complevel == lvl
+                if lvl == 0:
+                    assert node.filters.complib is None
+                else:
+                    assert node.filters.complib == lib
+
+
+@pytest.mark.skipif(
+    not is_platform_little_endian(), reason="platform is not little endian"
+)
+def test_encoding(setup_path):
+
+    with ensure_clean_store(setup_path) as store:
+        df = DataFrame({"A": "foo", "B": "bar"}, index=range(5))
+        df.loc[2, "A"] = np.nan
+        df.loc[3, "B"] = np.nan
+        _maybe_remove(store, "df")
+        store.append("df", df, encoding="ascii")
+        tm.assert_frame_equal(store["df"], df)
+
+        expected = df.reindex(columns=["A"])
+        result = store.select("df", Term("columns=A", encoding="ascii"))
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "val",
+    [
+        [b"E\xc9, 17", b"", b"a", b"b", b"c"],
+        [b"E\xc9, 17", b"a", b"b", b"c"],
+        [b"EE, 17", b"", b"a", b"b", b"c"],
+        [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"],
+        [b"", b"a", b"b", b"c"],
+        [b"\xf8\xfc", b"a", b"b", b"c"],
+        [b"A\xf8\xfc", b"", b"a", b"b", b"c"],
+        [np.nan, b"", b"b", b"c"],
+        [b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
+    ],
+)
+@pytest.mark.parametrize("dtype", ["category", object])
+def test_latin_encoding(tmp_path, setup_path, dtype, val):
+    enc = "latin-1"
+    nan_rep = ""
+    key = "data"
+
+    val = [x.decode(enc) if isinstance(x, bytes) else x for x in val]
+    ser = Series(val, dtype=dtype)
+
+    store = tmp_path / setup_path
+    ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep)
+    retr = read_hdf(store, key)
+
+    s_nan = ser.replace(nan_rep, np.nan)
+
+    tm.assert_series_equal(s_nan, retr)
+
+
+def test_multiple_open_close(tmp_path, setup_path):
+    # gh-4409: open & close multiple times
+
+    path = tmp_path / setup_path
+
+    df = tm.makeDataFrame()
+    df.to_hdf(path, "df", mode="w", format="table")
+
+    # single
+    store = HDFStore(path)
+    assert "CLOSED" not in store.info()
+    assert store.is_open
+
+    store.close()
+    assert "CLOSED" in store.info()
+    assert not store.is_open
+
+    path = tmp_path / setup_path
+
+    if pytables._table_file_open_policy_is_strict:
+        # multiples
+        store1 = HDFStore(path)
+        msg = (
+            r"The file [\S]* is already opened\. Please close it before "
+            r"reopening in write mode\."
+ ) + with pytest.raises(ValueError, match=msg): + HDFStore(path) + + store1.close() + else: + + # multiples + store1 = HDFStore(path) + store2 = HDFStore(path) + + assert "CLOSED" not in store1.info() + assert "CLOSED" not in store2.info() + assert store1.is_open + assert store2.is_open + + store1.close() + assert "CLOSED" in store1.info() + assert not store1.is_open + assert "CLOSED" not in store2.info() + assert store2.is_open + + store2.close() + assert "CLOSED" in store1.info() + assert "CLOSED" in store2.info() + assert not store1.is_open + assert not store2.is_open + + # nested close + store = HDFStore(path, mode="w") + store.append("df", df) + + store2 = HDFStore(path) + store2.append("df2", df) + store2.close() + assert "CLOSED" in store2.info() + assert not store2.is_open + + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + # double closing + store = HDFStore(path, mode="w") + store.append("df", df) + + store2 = HDFStore(path) + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + store2.close() + assert "CLOSED" in store2.info() + assert not store2.is_open + + # ops on a closed store + path = tmp_path / setup_path + + df = tm.makeDataFrame() + df.to_hdf(path, "df", mode="w", format="table") + + store = HDFStore(path) + store.close() + + msg = r"[\S]* file is not open!" + with pytest.raises(ClosedFileError, match=msg): + store.keys() + + with pytest.raises(ClosedFileError, match=msg): + "df" in store + + with pytest.raises(ClosedFileError, match=msg): + len(store) + + with pytest.raises(ClosedFileError, match=msg): + store["df"] + + with pytest.raises(ClosedFileError, match=msg): + store.select("df") + + with pytest.raises(ClosedFileError, match=msg): + store.get("df") + + with pytest.raises(ClosedFileError, match=msg): + store.append("df2", df) + + with pytest.raises(ClosedFileError, match=msg): + store.put("df3", df) + + with pytest.raises(ClosedFileError, match=msg): + store.get_storer("df2") + + with pytest.raises(ClosedFileError, match=msg): + store.remove("df2") + + with pytest.raises(ClosedFileError, match=msg): + store.select("df") + + msg = "'HDFStore' object has no attribute 'df'" + with pytest.raises(AttributeError, match=msg): + store.df + + +def test_fspath(): + with tm.ensure_clean("foo.h5") as path: + with HDFStore(path) as store: + assert os.fspath(store) == str(path) diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py new file mode 100644 index 0000000000000..dff7e2144d3c2 --- /dev/null +++ b/pandas/tests/io/pytables/test_keys.py @@ -0,0 +1,79 @@ +import pytest + +from pandas import ( + DataFrame, + HDFStore, + _testing as tm, +) +from pandas.tests.io.pytables.common import ( + ensure_clean_store, + tables, +) + +pytestmark = pytest.mark.single_cpu + + +def test_keys(setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() + + assert len(store) == 3 + expected = {"/a", "/b", "/c"} + assert set(store.keys()) == expected + assert set(store) == expected + + +def test_non_pandas_keys(tmp_path, setup_path): + class Table1(tables.IsDescription): + value1 = tables.Float32Col() + + class Table2(tables.IsDescription): + value2 = tables.Float32Col() + + class Table3(tables.IsDescription): + value3 = tables.Float32Col() + + path = tmp_path / setup_path + with tables.open_file(path, mode="w") as h5file: + group = h5file.create_group("/", "group") + 
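+        # Each IsDescription subclass above defines one native PyTables table
+        # with a single float32 column; writing three of them under /group
+        # gives HDFStore.keys(include="native") three non-pandas nodes to
+        # report.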
h5file.create_table(group, "table1", Table1, "Table 1") + h5file.create_table(group, "table2", Table2, "Table 2") + h5file.create_table(group, "table3", Table3, "Table 3") + with HDFStore(path) as store: + assert len(store.keys(include="native")) == 3 + expected = {"/group/table1", "/group/table2", "/group/table3"} + assert set(store.keys(include="native")) == expected + assert set(store.keys(include="pandas")) == set() + for name in expected: + df = store.get(name) + assert len(df.columns) == 1 + + +def test_keys_illegal_include_keyword_value(setup_path): + with ensure_clean_store(setup_path) as store: + with pytest.raises( + ValueError, + match="`include` should be either 'pandas' or 'native' but is 'illegal'", + ): + store.keys(include="illegal") + + +def test_keys_ignore_hdf_softlink(setup_path): + + # GH 20523 + # Puts a softlink into HDF file and rereads + + with ensure_clean_store(setup_path) as store: + + df = DataFrame({"A": range(5), "B": range(5)}) + store.put("df", df) + + assert store.keys() == ["/df"] + + store._handle.create_soft_link(store._handle.root, "symlink", "df") + + # Should ignore the softlink + assert store.keys() == ["/df"] diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py new file mode 100644 index 0000000000000..8cff9e65ce23b --- /dev/null +++ b/pandas/tests/io/pytables/test_put.py @@ -0,0 +1,372 @@ +import datetime +import re +from warnings import ( + catch_warnings, + simplefilter, +) + +import numpy as np +import pytest + +from pandas._libs.tslibs import Timestamp + +import pandas as pd +from pandas import ( + DataFrame, + HDFStore, + Index, + MultiIndex, + Series, + _testing as tm, + concat, +) +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_store, +) +from pandas.util import _test_decorators as td + +pytestmark = pytest.mark.single_cpu + + +def test_format_type(tmp_path, setup_path): + df = DataFrame({"A": [1, 2]}) + with HDFStore(tmp_path / setup_path) as store: + store.put("a", df, format="fixed") + store.put("b", df, format="table") + + assert store.get_storer("a").format_type == "fixed" + assert store.get_storer("b").format_type == "table" + + +def test_format_kwarg_in_constructor(tmp_path, setup_path): + # GH 13291 + + msg = "format is not a defined argument for HDFStore" + + with pytest.raises(ValueError, match=msg): + HDFStore(tmp_path / setup_path, format="table") + + +def test_api_default_format(tmp_path, setup_path): + + # default_format option + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + + with pd.option_context("io.hdf.default_format", "fixed"): + _maybe_remove(store, "df") + store.put("df", df) + assert not store.get_storer("df").is_table + + msg = "Can only append to Tables" + with pytest.raises(ValueError, match=msg): + store.append("df2", df) + + with pd.option_context("io.hdf.default_format", "table"): + _maybe_remove(store, "df") + store.put("df", df) + assert store.get_storer("df").is_table + + _maybe_remove(store, "df2") + store.append("df2", df) + assert store.get_storer("df").is_table + + path = tmp_path / setup_path + df = tm.makeDataFrame() + + with pd.option_context("io.hdf.default_format", "fixed"): + df.to_hdf(path, "df") + with HDFStore(path) as store: + assert not store.get_storer("df").is_table + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df2", append=True) + + with pd.option_context("io.hdf.default_format", "table"): + df.to_hdf(path, "df3") + with HDFStore(path) as store: + assert 
store.get_storer("df3").is_table + df.to_hdf(path, "df4", append=True) + with HDFStore(path) as store: + assert store.get_storer("df4").is_table + + +def test_put(setup_path): + + with ensure_clean_store(setup_path) as store: + + ts = tm.makeTimeSeries() + df = tm.makeTimeDataFrame() + store["a"] = ts + store["b"] = df[:10] + store["foo/bar/bah"] = df[:10] + store["foo"] = df[:10] + store["/foo"] = df[:10] + store.put("c", df[:10], format="table") + + # not OK, not a table + msg = "Can only append to Tables" + with pytest.raises(ValueError, match=msg): + store.put("b", df[10:], append=True) + + # node does not currently exist, test _is_table_type returns False + # in this case + _maybe_remove(store, "f") + with pytest.raises(ValueError, match=msg): + store.put("f", df[10:], append=True) + + # can't put to a table (use append instead) + with pytest.raises(ValueError, match=msg): + store.put("c", df[10:], append=True) + + # overwrite table + store.put("c", df[:10], format="table", append=False) + tm.assert_frame_equal(df[:10], store["c"]) + + +def test_put_string_index(setup_path): + + with ensure_clean_store(setup_path) as store: + + index = Index([f"I am a very long string index: {i}" for i in range(20)]) + s = Series(np.arange(20), index=index) + df = DataFrame({"A": s, "B": s}) + + store["a"] = s + tm.assert_series_equal(store["a"], s) + + store["b"] = df + tm.assert_frame_equal(store["b"], df) + + # mixed length + index = Index( + ["abcdefghijklmnopqrstuvwxyz1234567890"] + + [f"I am a very long string index: {i}" for i in range(20)] + ) + s = Series(np.arange(21), index=index) + df = DataFrame({"A": s, "B": s}) + store["a"] = s + tm.assert_series_equal(store["a"], s) + + store["b"] = df + tm.assert_frame_equal(store["b"], df) + + +def test_put_compression(setup_path): + + with ensure_clean_store(setup_path) as store: + df = tm.makeTimeDataFrame() + + store.put("c", df, format="table", complib="zlib") + tm.assert_frame_equal(store["c"], df) + + # can't compress if format='fixed' + msg = "Compression not supported on Fixed format stores" + with pytest.raises(ValueError, match=msg): + store.put("b", df, format="fixed", complib="zlib") + + +@td.skip_if_windows +def test_put_compression_blosc(setup_path): + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + + # can't compress if format='fixed' + msg = "Compression not supported on Fixed format stores" + with pytest.raises(ValueError, match=msg): + store.put("b", df, format="fixed", complib="blosc") + + store.put("c", df, format="table", complib="blosc") + tm.assert_frame_equal(store["c"], df) + + +def test_put_mixed_type(setup_path): + df = tm.makeTimeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[df.index[3:6], ["obj1"]] = np.nan + df = df._consolidate() + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + # PerformanceWarning + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + store.put("df", df) + + expected = store.get("df") + tm.assert_frame_equal(expected, df) + + +@pytest.mark.parametrize( + "format, index", + [ + ["table", tm.makeFloatIndex], + ["table", tm.makeStringIndex], + ["table", tm.makeIntIndex], + ["table", 
tm.makeDateIndex], + ["fixed", tm.makeFloatIndex], + ["fixed", tm.makeStringIndex], + ["fixed", tm.makeIntIndex], + ["fixed", tm.makeDateIndex], + ["table", tm.makePeriodIndex], # GH#7796 + ["fixed", tm.makePeriodIndex], + ], +) +def test_store_index_types(setup_path, format, index): + # GH5386 + # test storing various index types + + with ensure_clean_store(setup_path) as store: + + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) + df.index = index(len(df)) + + _maybe_remove(store, "df") + store.put("df", df, format=format) + tm.assert_frame_equal(df, store["df"]) + + +def test_column_multiindex(setup_path): + # GH 4710 + # recreate multi-indexes properly + + index = MultiIndex.from_tuples( + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"] + ) + df = DataFrame(np.arange(12).reshape(3, 4), columns=index) + expected = df.set_axis(df.index.to_numpy()) + + with ensure_clean_store(setup_path) as store: + + store.put("df", df) + tm.assert_frame_equal( + store["df"], expected, check_index_type=True, check_column_type=True + ) + + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) + + msg = re.escape("cannot use a multi-index on axis [1] with data_columns ['A']") + with pytest.raises(ValueError, match=msg): + store.put("df2", df, format="table", data_columns=["A"]) + msg = re.escape("cannot use a multi-index on axis [1] with data_columns True") + with pytest.raises(ValueError, match=msg): + store.put("df3", df, format="table", data_columns=True) + + # appending multi-column on existing table (see GH 6167) + with ensure_clean_store(setup_path) as store: + store.append("df2", df) + store.append("df2", df) + + tm.assert_frame_equal(store["df2"], concat((df, df))) + + # non_index_axes name + df = DataFrame(np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo")) + expected = df.set_axis(df.index.to_numpy()) + + with ensure_clean_store(setup_path) as store: + + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) + + +def test_store_multiindex(setup_path): + + # validate multi-index names + # GH 5527 + with ensure_clean_store(setup_path) as store: + + def make_index(names=None): + return MultiIndex.from_tuples( + [ + (datetime.datetime(2013, 12, d), s, t) + for d in range(1, 3) + for s in range(2) + for t in range(3) + ], + names=names, + ) + + # no names + _maybe_remove(store, "df") + df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index()) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + # partial names + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", None, None]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + # series + _maybe_remove(store, "s") + s = Series(np.zeros(12), index=make_index(["date", None, None])) + store.append("s", s) + xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) + tm.assert_series_equal(store.select("s"), xp) + + # dup with column + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "a", "t"]), + ) + msg = "duplicate names/columns in the multi-index when storing as a table" + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + # dup within level + _maybe_remove(store, "df") + df = DataFrame( + 
np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "date", "date"]), + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df) + + # fully names + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "s", "t"]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + +@pytest.mark.parametrize("format", ["fixed", "table"]) +def test_store_periodindex(tmp_path, setup_path, format): + # GH 7796 + # test of PeriodIndex in HDFStore + df = DataFrame( + np.random.randn(5, 1), index=pd.period_range("20220101", freq="M", periods=5) + ) + + path = tmp_path / setup_path + df.to_hdf(path, "df", mode="w", format=format) + expected = pd.read_hdf(path, "df") + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/pytables/test_pytables_missing.py b/pandas/tests/io/pytables/test_pytables_missing.py new file mode 100644 index 0000000000000..9adb0a6d227da --- /dev/null +++ b/pandas/tests/io/pytables/test_pytables_missing.py @@ -0,0 +1,14 @@ +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + + +@td.skip_if_installed("tables") +def test_pytables_raises(): + df = pd.DataFrame({"A": [1, 2]}) + with pytest.raises(ImportError, match="tables"): + with tm.ensure_clean("foo.h5") as path: + df.to_hdf(path, "df") diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py new file mode 100644 index 0000000000000..6d92c15f1ea10 --- /dev/null +++ b/pandas/tests/io/pytables/test_read.py @@ -0,0 +1,344 @@ +from contextlib import closing +from pathlib import Path +import re + +import numpy as np +import pytest + +from pandas._libs.tslibs import Timestamp +from pandas.compat import is_platform_windows + +import pandas as pd +from pandas import ( + DataFrame, + HDFStore, + Index, + Series, + _testing as tm, + read_hdf, +) +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_store, +) +from pandas.util import _test_decorators as td + +from pandas.io.pytables import TableIterator + +pytestmark = pytest.mark.single_cpu + + +def test_read_missing_key_close_store(tmp_path, setup_path): + # GH 25766 + path = tmp_path / setup_path + df = DataFrame({"a": range(2), "b": range(2)}) + df.to_hdf(path, "k1") + + with pytest.raises(KeyError, match="'No object named k2 in the file'"): + read_hdf(path, "k2") + + # smoke test to test that file is properly closed after + # read with KeyError before another write + df.to_hdf(path, "k2") + + +def test_read_missing_key_opened_store(tmp_path, setup_path): + # GH 28699 + path = tmp_path / setup_path + df = DataFrame({"a": range(2), "b": range(2)}) + df.to_hdf(path, "k1") + + with HDFStore(path, "r") as store: + + with pytest.raises(KeyError, match="'No object named k2 in the file'"): + read_hdf(store, "k2") + + # Test that the file is still open after a KeyError and that we can + # still read from it. 
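+        # read_hdf() accepts an already-open HDFStore as well as a path, and
+        # passing a store must not let the KeyError above close the underlying
+        # file handle, or the read below would fail.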
+ read_hdf(store, "k1") + + +def test_read_column(setup_path): + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + # GH 17912 + # HDFStore.select_column should raise a KeyError + # exception if the key is not a valid store + with pytest.raises(KeyError, match="No object named df in the file"): + store.select_column("df", "index") + + store.append("df", df) + # error + with pytest.raises( + KeyError, match=re.escape("'column [foo] not found in the table'") + ): + store.select_column("df", "foo") + + msg = re.escape("select_column() got an unexpected keyword argument 'where'") + with pytest.raises(TypeError, match=msg): + store.select_column("df", "index", where=["index>5"]) + + # valid + result = store.select_column("df", "index") + tm.assert_almost_equal(result.values, Series(df.index).values) + assert isinstance(result, Series) + + # not a data indexable column + msg = re.escape( + "column [values_block_0] can not be extracted individually; " + "it is not data indexable" + ) + with pytest.raises(ValueError, match=msg): + store.select_column("df", "values_block_0") + + # a data column + df2 = df.copy() + df2["string"] = "foo" + store.append("df2", df2, data_columns=["string"]) + result = store.select_column("df2", "string") + tm.assert_almost_equal(result.values, df2["string"].values) + + # a data column with NaNs, result excludes the NaNs + df3 = df.copy() + df3["string"] = "foo" + df3.loc[df3.index[4:6], "string"] = np.nan + store.append("df3", df3, data_columns=["string"]) + result = store.select_column("df3", "string") + tm.assert_almost_equal(result.values, df3["string"].values) + + # start/stop + result = store.select_column("df3", "string", start=2) + tm.assert_almost_equal(result.values, df3["string"].values[2:]) + + result = store.select_column("df3", "string", start=-2) + tm.assert_almost_equal(result.values, df3["string"].values[-2:]) + + result = store.select_column("df3", "string", stop=2) + tm.assert_almost_equal(result.values, df3["string"].values[:2]) + + result = store.select_column("df3", "string", stop=-2) + tm.assert_almost_equal(result.values, df3["string"].values[:-2]) + + result = store.select_column("df3", "string", start=2, stop=-2) + tm.assert_almost_equal(result.values, df3["string"].values[2:-2]) + + result = store.select_column("df3", "string", start=-2, stop=2) + tm.assert_almost_equal(result.values, df3["string"].values[-2:2]) + + # GH 10392 - make sure column name is preserved + df4 = DataFrame({"A": np.random.randn(10), "B": "foo"}) + store.append("df4", df4, data_columns=True) + expected = df4["B"] + result = store.select_column("df4", "B") + tm.assert_series_equal(result, expected) + + +def test_pytables_native_read(datapath): + with ensure_clean_store( + datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" + ) as store: + d2 = store["detector/readout"] + assert isinstance(d2, DataFrame) + + +@pytest.mark.skipif(is_platform_windows(), reason="native2 read fails oddly on windows") +def test_pytables_native2_read(datapath): + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r" + ) as store: + str(store) + d1 = store["detector"] + assert isinstance(d1, DataFrame) + + +def test_legacy_table_fixed_format_read_py2(datapath): + # GH 24510 + # legacy table with fixed format written in Python 2 + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" + ) as store: + result = store.select("df") + 
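+        # The expected frame below is hard-coded: the legacy file was written
+        # once under Python 2 and ships with the test data, so it cannot be
+        # regenerated on the fly here.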
expected = DataFrame( + [[1, 2, 3, "D"]], + columns=["A", "B", "C", "D"], + index=Index(["ABC"], name="INDEX_NAME"), + ) + tm.assert_frame_equal(expected, result) + + +def test_legacy_table_fixed_format_read_datetime_py2(datapath): + # GH 31750 + # legacy table with fixed format and datetime64 column written in Python 2 + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"), + mode="r", + ) as store: + result = store.select("df") + expected = DataFrame( + [[Timestamp("2020-02-06T18:00")]], + columns=["A"], + index=Index(["date"]), + ) + tm.assert_frame_equal(expected, result) + + +def test_legacy_table_read_py2(datapath): + # issue: 24925 + # legacy table written in Python 2 + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r" + ) as store: + result = store.select("table") + + expected = DataFrame({"a": ["a", "b"], "b": [2, 3]}) + tm.assert_frame_equal(expected, result) + + +def test_read_hdf_open_store(tmp_path, setup_path): + # GH10330 + # No check for non-string path_or-buf, and no test of open store + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) + + path = tmp_path / setup_path + df.to_hdf(path, "df", mode="w") + direct = read_hdf(path, "df") + with HDFStore(path, mode="r") as store: + indirect = read_hdf(store, "df") + tm.assert_frame_equal(direct, indirect) + assert store.is_open + + +def test_read_hdf_iterator(tmp_path, setup_path): + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) + + path = tmp_path / setup_path + df.to_hdf(path, "df", mode="w", format="t") + direct = read_hdf(path, "df") + iterator = read_hdf(path, "df", iterator=True) + with closing(iterator.store): + assert isinstance(iterator, TableIterator) + indirect = next(iterator.__iter__()) + tm.assert_frame_equal(direct, indirect) + + +def test_read_nokey(tmp_path, setup_path): + # GH10443 + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + + # Categorical dtype not supported for "fixed" format. So no need + # to test with that dtype in the dataframe here. + path = tmp_path / setup_path + df.to_hdf(path, "df", mode="a") + reread = read_hdf(path) + tm.assert_frame_equal(df, reread) + df.to_hdf(path, "df2", mode="a") + + msg = "key must be provided when HDF5 file contains multiple datasets." + with pytest.raises(ValueError, match=msg): + read_hdf(path) + + +def test_read_nokey_table(tmp_path, setup_path): + # GH13231 + df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")}) + + path = tmp_path / setup_path + df.to_hdf(path, "df", mode="a", format="table") + reread = read_hdf(path) + tm.assert_frame_equal(df, reread) + df.to_hdf(path, "df2", mode="a", format="table") + + msg = "key must be provided when HDF5 file contains multiple datasets." + with pytest.raises(ValueError, match=msg): + read_hdf(path) + + +def test_read_nokey_empty(tmp_path, setup_path): + path = tmp_path / setup_path + store = HDFStore(path) + store.close() + msg = re.escape( + "Dataset(s) incompatible with Pandas data types, not table, or no " + "datasets found in HDF5 file." 
+ ) + with pytest.raises(ValueError, match=msg): + read_hdf(path) + + +def test_read_from_pathlib_path(tmp_path, setup_path): + + # GH11773 + expected = DataFrame( + np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") + ) + filename = tmp_path / setup_path + path_obj = Path(filename) + + expected.to_hdf(path_obj, "df", mode="a") + actual = read_hdf(path_obj, "df") + + tm.assert_frame_equal(expected, actual) + + +@td.skip_if_no("py.path") +def test_read_from_py_localpath(tmp_path, setup_path): + + # GH11773 + from py.path import local as LocalPath + + expected = DataFrame( + np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") + ) + filename = tmp_path / setup_path + path_obj = LocalPath(filename) + + expected.to_hdf(path_obj, "df", mode="a") + actual = read_hdf(path_obj, "df") + + tm.assert_frame_equal(expected, actual) + + +@pytest.mark.parametrize("format", ["fixed", "table"]) +def test_read_hdf_series_mode_r(tmp_path, format, setup_path): + # GH 16583 + # Tests that reading a Series saved to an HDF file + # still works if a mode='r' argument is supplied + series = tm.makeFloatSeries() + path = tmp_path / setup_path + series.to_hdf(path, key="data", format=format) + result = read_hdf(path, key="data", mode="r") + tm.assert_series_equal(result, series) + + +def test_read_py2_hdf_file_in_py3(datapath): + # GH 16781 + + # tests reading a PeriodIndex DataFrame written in Python2 in Python3 + + # the file was generated in Python 2.7 like so: + # + # df = DataFrame([1.,2,3], index=pd.PeriodIndex( + # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) + # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') + + expected = DataFrame( + [1.0, 2, 3], + index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), + ) + + with ensure_clean_store( + datapath( + "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" + ), + mode="r", + ) as store: + result = store["p"] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_retain_attributes.py b/pandas/tests/io/pytables/test_retain_attributes.py new file mode 100644 index 0000000000000..3043cd3604e58 --- /dev/null +++ b/pandas/tests/io/pytables/test_retain_attributes.py @@ -0,0 +1,105 @@ +from warnings import catch_warnings + +import pytest + +from pandas._libs.tslibs import Timestamp + +from pandas import ( + DataFrame, + Series, + _testing as tm, + date_range, + read_hdf, +) +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_store, +) + +pytestmark = pytest.mark.single_cpu + + +def test_retain_index_attributes(setup_path): + + # GH 3499, losing frequency info on index recreation + df = DataFrame( + {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} + ) + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "data") + store.put("data", df, format="table") + + result = store.get("data") + tm.assert_frame_equal(df, result) + + for attr in ["freq", "tz", "name"]: + for idx in ["index", "columns"]: + assert getattr(getattr(df, idx), attr, None) == getattr( + getattr(result, idx), attr, None + ) + + # try to append a table with a different frequency + with catch_warnings(record=True): + df2 = DataFrame( + { + "A": Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + } + ) + store.append("data", df2) + + assert store.get_storer("data").info["index"]["freq"] is None + + # this is ok + _maybe_remove(store, "df2") + df2 = DataFrame( + { + "A": Series( + range(3), + index=[ 
+ Timestamp("20010101"), + Timestamp("20010102"), + Timestamp("20020101"), + ], + ) + } + ) + store.append("df2", df2) + df3 = DataFrame( + {"A": Series(range(3), index=date_range("2002-1-1", periods=3, freq="D"))} + ) + store.append("df2", df3) + + +def test_retain_index_attributes2(tmp_path, setup_path): + path = tmp_path / setup_path + + with catch_warnings(record=True): + + df = DataFrame( + {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} + ) + df.to_hdf(path, "data", mode="w", append=True) + df2 = DataFrame( + {"A": Series(range(3), index=date_range("2002-1-1", periods=3, freq="D"))} + ) + + df2.to_hdf(path, "data", append=True) + + idx = date_range("2000-1-1", periods=3, freq="H") + idx.name = "foo" + df = DataFrame({"A": Series(range(3), index=idx)}) + df.to_hdf(path, "data", mode="w", append=True) + + assert read_hdf(path, "data").index.name == "foo" + + with catch_warnings(record=True): + + idx2 = date_range("2001-1-1", periods=3, freq="H") + idx2.name = "bar" + df2 = DataFrame({"A": Series(range(3), index=idx2)}) + df2.to_hdf(path, "data", append=True) + + assert read_hdf(path, "data").index.name is None diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py new file mode 100644 index 0000000000000..5c7c4f9ce0b75 --- /dev/null +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -0,0 +1,557 @@ +import datetime +import re +from warnings import ( + catch_warnings, + simplefilter, +) + +import numpy as np +import pytest + +from pandas._libs.tslibs import Timestamp +from pandas.compat import is_platform_windows + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, + _testing as tm, + bdate_range, + read_hdf, +) +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_store, +) +from pandas.util import _test_decorators as td + +_default_compressor = "blosc" + + +pytestmark = pytest.mark.single_cpu + + +def test_conv_read_write(): + with tm.ensure_clean() as path: + + def roundtrip(key, obj, **kwargs): + obj.to_hdf(path, key, **kwargs) + return read_hdf(path, key) + + o = tm.makeTimeSeries() + tm.assert_series_equal(o, roundtrip("series", o)) + + o = tm.makeStringSeries() + tm.assert_series_equal(o, roundtrip("string_series", o)) + + o = tm.makeDataFrame() + tm.assert_frame_equal(o, roundtrip("frame", o)) + + # table + df = DataFrame({"A": range(5), "B": range(5)}) + df.to_hdf(path, "table", append=True) + result = read_hdf(path, "table", where=["index>2"]) + tm.assert_frame_equal(df[df.index > 2], result) + + +def test_long_strings(setup_path): + + # GH6166 + df = DataFrame( + {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["a"]) + + result = store.select("df") + tm.assert_frame_equal(df, result) + + +def test_api(tmp_path, setup_path): + + # GH4584 + # API issue when to_hdf doesn't accept append AND format args + path = tmp_path / setup_path + + df = tm.makeDataFrame() + df.iloc[:10].to_hdf(path, "df", append=True, format="table") + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + # append to False + df.iloc[:10].to_hdf(path, "df", append=False, format="table") + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + +def test_api_append(tmp_path, setup_path): + path = tmp_path / setup_path + + df = tm.makeDataFrame() + 
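+    # Presumably append=True with no explicit format falls back to
+    # format="table" (the only format that supports appending), so mixing the
+    # implicit and explicit spellings below should yield one identical table.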
df.iloc[:10].to_hdf(path, "df", append=True) + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + # append to False + df.iloc[:10].to_hdf(path, "df", append=False, format="table") + df.iloc[10:].to_hdf(path, "df", append=True) + tm.assert_frame_equal(read_hdf(path, "df"), df) + + +def test_api_2(tmp_path, setup_path): + path = tmp_path / setup_path + + df = tm.makeDataFrame() + df.to_hdf(path, "df", append=False, format="fixed") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + df.to_hdf(path, "df", append=False, format="f") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + df.to_hdf(path, "df", append=False) + tm.assert_frame_equal(read_hdf(path, "df"), df) + + df.to_hdf(path, "df") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + with ensure_clean_store(setup_path) as store: + + df = tm.makeDataFrame() + + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=True, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + tm.assert_frame_equal(store.select("df"), df) + + # append to False + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + tm.assert_frame_equal(store.select("df"), df) + + # formats + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + tm.assert_frame_equal(store.select("df"), df) + + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format=None) + tm.assert_frame_equal(store.select("df"), df) + + +def test_api_invalid(tmp_path, setup_path): + path = tmp_path / setup_path + # Invalid. 
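+    # Both spellings of the fixed format ("f" and "fixed") reject append=True
+    # with a ValueError, and an unrecognized format string raises a TypeError.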
+ df = tm.makeDataFrame() + + msg = "Can only append to Tables" + + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df", append=True, format="f") + + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df", append=True, format="fixed") + + msg = r"invalid HDFStore format specified \[foo\]" + + with pytest.raises(TypeError, match=msg): + df.to_hdf(path, "df", append=True, format="foo") + + with pytest.raises(TypeError, match=msg): + df.to_hdf(path, "df", append=False, format="foo") + + # File path doesn't exist + path = "" + msg = f"File {path} does not exist" + + with pytest.raises(FileNotFoundError, match=msg): + read_hdf(path, "df") + + +def test_get(setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + left = store.get("a") + right = store["a"] + tm.assert_series_equal(left, right) + + left = store.get("/a") + right = store["/a"] + tm.assert_series_equal(left, right) + + with pytest.raises(KeyError, match="'No object named b in the file'"): + store.get("b") + + +def test_put_integer(setup_path): + # non-date, non-string index + df = DataFrame(np.random.randn(50, 100)) + _check_roundtrip(df, tm.assert_frame_equal, setup_path) + + +def test_table_values_dtypes_roundtrip(setup_path): + + with ensure_clean_store(setup_path) as store: + df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") + store.append("df_f8", df1) + tm.assert_series_equal(df1.dtypes, store["df_f8"].dtypes) + + df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8") + store.append("df_i8", df2) + tm.assert_series_equal(df2.dtypes, store["df_i8"].dtypes) + + # incompatible dtype + msg = re.escape( + "invalid combination of [values_axes] on appending data " + "[name->values_block_0,cname->values_block_0," + "dtype->float64,kind->float,shape->(1, 3)] vs " + "current table [name->values_block_0," + "cname->values_block_0,dtype->int64,kind->integer," + "shape->None]" + ) + with pytest.raises(ValueError, match=msg): + store.append("df_i8", df1) + + # check creation/storage/retrieval of float32 (a bit hacky to + # actually create them thought) + df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"]) + store.append("df_f4", df1) + tm.assert_series_equal(df1.dtypes, store["df_f4"].dtypes) + assert df1.dtypes[0] == "float32" + + # check with mixed dtypes + df1 = DataFrame( + { + c: Series(np.random.randint(5), dtype=c) + for c in ["float32", "float64", "int32", "int64", "int16", "int8"] + } + ) + df1["string"] = "foo" + df1["float322"] = 1.0 + df1["float322"] = df1["float322"].astype("float32") + df1["bool"] = df1["float32"] > 0 + df1["time1"] = Timestamp("20130101") + df1["time2"] = Timestamp("20130102") + + store.append("df_mixed_dtypes1", df1) + result = store.select("df_mixed_dtypes1").dtypes.value_counts() + result.index = [str(i) for i in result.index] + expected = Series( + { + "float32": 2, + "float64": 1, + "int32": 1, + "bool": 1, + "int16": 1, + "int8": 1, + "int64": 1, + "object": 1, + "datetime64[ns]": 2, + } + ) + result = result.sort_index() + expected = expected.sort_index() + tm.assert_series_equal(result, expected) + + +@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") +def test_series(setup_path): + + s = tm.makeStringSeries() + _check_roundtrip(s, tm.assert_series_equal, path=setup_path) + + ts = tm.makeTimeSeries() + _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) + + ts2 = Series(ts.index, Index(ts.index, dtype=object)) + _check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) + + ts3 = 
Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) + _check_roundtrip( + ts3, tm.assert_series_equal, path=setup_path, check_index_type=False + ) + + +def test_float_index(setup_path): + + # GH #454 + index = np.random.randn(10) + s = Series(np.random.randn(10), index=index) + _check_roundtrip(s, tm.assert_series_equal, path=setup_path) + + +def test_tuple_index(setup_path): + + # GH #492 + col = np.arange(10) + idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)] + data = np.random.randn(30).reshape((3, 10)) + DF = DataFrame(data, index=idx, columns=col) + + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + _check_roundtrip(DF, tm.assert_frame_equal, path=setup_path) + + +@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") +def test_index_types(setup_path): + with catch_warnings(record=True): + values = np.random.randn(2) + + func = lambda lhs, rhs: tm.assert_series_equal(lhs, rhs, check_index_type=True) + + with catch_warnings(record=True): + ser = Series(values, [0, "y"]) + _check_roundtrip(ser, func, path=setup_path) + + with catch_warnings(record=True): + ser = Series(values, [datetime.datetime.today(), 0]) + _check_roundtrip(ser, func, path=setup_path) + + with catch_warnings(record=True): + ser = Series(values, ["y", 0]) + _check_roundtrip(ser, func, path=setup_path) + + with catch_warnings(record=True): + ser = Series(values, [datetime.date.today(), "a"]) + _check_roundtrip(ser, func, path=setup_path) + + with catch_warnings(record=True): + ser = Series(values, [0, "y"]) + _check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, [datetime.datetime.today(), 0]) + _check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, ["y", 0]) + _check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, [datetime.date.today(), "a"]) + _check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, [1.23, "b"]) + _check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, [1, 1.53]) + _check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, [1, 5]) + _check_roundtrip(ser, func, path=setup_path) + + ser = Series( + values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)] + ) + _check_roundtrip(ser, func, path=setup_path) + + +def test_timeseries_preepoch(setup_path, request): + + dr = bdate_range("1/1/1940", "1/1/1960") + ts = Series(np.random.randn(len(dr)), index=dr) + try: + _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) + except OverflowError: + if is_platform_windows(): + request.node.add_marker( + pytest.mark.xfail("known failure on some windows platforms") + ) + raise + + +@pytest.mark.parametrize( + "compression", [False, pytest.param(True, marks=td.skip_if_windows)] +) +def test_frame(compression, setup_path): + + df = tm.makeDataFrame() + + # put in some random NAs + df.values[0, 0] = np.nan + df.values[5, 3] = np.nan + + _check_roundtrip_table( + df, tm.assert_frame_equal, path=setup_path, compression=compression + ) + _check_roundtrip( + df, tm.assert_frame_equal, path=setup_path, compression=compression + ) + + tdf = tm.makeTimeDataFrame() + _check_roundtrip( + tdf, tm.assert_frame_equal, path=setup_path, compression=compression + ) + + with ensure_clean_store(setup_path) as store: + # not consolidated + df["foo"] = np.random.randn(len(df)) + store["df"] = df + recons = store["df"] + assert recons._mgr.is_consolidated() + + # empty + _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) + + +def 
test_empty_series_frame(setup_path): + s0 = Series(dtype=object) + s1 = Series(name="myseries", dtype=object) + df0 = DataFrame() + df1 = DataFrame(index=["a", "b", "c"]) + df2 = DataFrame(columns=["d", "e", "f"]) + + _check_roundtrip(s0, tm.assert_series_equal, path=setup_path) + _check_roundtrip(s1, tm.assert_series_equal, path=setup_path) + _check_roundtrip(df0, tm.assert_frame_equal, path=setup_path) + _check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) + _check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) + + +@pytest.mark.parametrize("dtype", [np.int64, np.float64, object, "m8[ns]", "M8[ns]"]) +def test_empty_series(dtype, setup_path): + s = Series(dtype=dtype) + _check_roundtrip(s, tm.assert_series_equal, path=setup_path) + + +def test_can_serialize_dates(setup_path): + + rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")] + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + + +def test_store_hierarchical(setup_path, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) + _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) + + # check that the names are stored + with ensure_clean_store(setup_path) as store: + store["frame"] = frame + recons = store["frame"] + tm.assert_frame_equal(recons, frame) + + +@pytest.mark.parametrize( + "compression", [False, pytest.param(True, marks=td.skip_if_windows)] +) +def test_store_mixed(compression, setup_path): + def _make_one(): + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["int1"] = 1 + df["int2"] = 2 + return df._consolidate() + + df1 = _make_one() + df2 = _make_one() + + _check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) + _check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) + + with ensure_clean_store(setup_path) as store: + store["obj"] = df1 + tm.assert_frame_equal(store["obj"], df1) + store["obj"] = df2 + tm.assert_frame_equal(store["obj"], df2) + + # check that can store Series of all of these types + _check_roundtrip( + df1["obj1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, + ) + _check_roundtrip( + df1["bool1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, + ) + _check_roundtrip( + df1["int1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, + ) + + +def _check_roundtrip(obj, comparator, path, compression=False, **kwargs): + + options = {} + if compression: + options["complib"] = _default_compressor + + with ensure_clean_store(path, "w", **options) as store: + store["obj"] = obj + retrieved = store["obj"] + comparator(retrieved, obj, **kwargs) + + +def _check_roundtrip_table(obj, comparator, path, compression=False): + options = {} + if compression: + options["complib"] = _default_compressor + + with ensure_clean_store(path, "w", **options) as store: + store.put("obj", obj, format="table") + retrieved = store["obj"] + + comparator(retrieved, obj) + + +def test_unicode_index(setup_path): + + unicode_values = ["\u03c3", "\u03c3\u03c3"] + + # PerformanceWarning + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + s = Series(np.random.randn(len(unicode_values)), unicode_values) + _check_roundtrip(s, tm.assert_series_equal, path=setup_path) 
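+    # _check_roundtrip stores via store["obj"] = s and reads it back; an
+    # equivalent sketch using the public API would be (names hypothetical):
+    #
+    #     s.to_hdf(path, "s_unicode")
+    #     tm.assert_series_equal(read_hdf(path, "s_unicode"), s)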
+ + +def test_unicode_longer_encoded(setup_path): + # GH 11234 + char = "\u0394" + df = DataFrame({"A": [char]}) + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table", encoding="utf-8") + result = store.get("df") + tm.assert_frame_equal(result, df) + + df = DataFrame({"A": ["a", char], "B": ["b", "b"]}) + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table", encoding="utf-8") + result = store.get("df") + tm.assert_frame_equal(result, df) + + +def test_store_datetime_mixed(setup_path): + + df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) + ts = tm.makeTimeSeries() + df["d"] = ts.index[:3] + _check_roundtrip(df, tm.assert_frame_equal, path=setup_path) + + +def test_round_trip_equals(tmp_path, setup_path): + # GH 9330 + df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) + + path = tmp_path / setup_path + df.to_hdf(path, "df", format="table") + other = read_hdf(path, "df") + tm.assert_frame_equal(df, other) + assert df.equals(other) + assert other.equals(df) diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py new file mode 100644 index 0000000000000..b0c9b85e7ad05 --- /dev/null +++ b/pandas/tests/io/pytables/test_select.py @@ -0,0 +1,973 @@ +from warnings import catch_warnings + +import numpy as np +import pytest + +from pandas._libs.tslibs import Timestamp + +import pandas as pd +from pandas import ( + DataFrame, + HDFStore, + Index, + MultiIndex, + Series, + _testing as tm, + bdate_range, + concat, + date_range, + isna, + read_hdf, +) +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_store, +) + +from pandas.io.pytables import Term + +pytestmark = pytest.mark.single_cpu + + +def test_select_columns_in_where(setup_path): + + # GH 6169 + # recreate multi-indexes when columns is passed + # in the `where` argument + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo_name", "bar_name"], + ) + + # With a DataFrame + df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table") + expected = df[["A"]] + + tm.assert_frame_equal(store.select("df", columns=["A"]), expected) + + tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected) + + # With a Series + s = Series(np.random.randn(10), index=index, name="A") + with ensure_clean_store(setup_path) as store: + store.put("s", s, format="table") + tm.assert_series_equal(store.select("s", where="columns=['A']"), s) + + +def test_select_with_dups(setup_path): + + # single dtypes + df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) + df.index = date_range("20130101 9:30", periods=10, freq="T") + + with ensure_clean_store(setup_path) as store: + store.append("df", df) + + result = store.select("df") + expected = df + tm.assert_frame_equal(result, expected, by_blocks=True) + + result = store.select("df", columns=df.columns) + expected = df + tm.assert_frame_equal(result, expected, by_blocks=True) + + result = store.select("df", columns=["A"]) + expected = df.loc[:, ["A"]] + tm.assert_frame_equal(result, expected) + + # dups across dtypes + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) + df.index = 
date_range("20130101 9:30", periods=10, freq="T") + + with ensure_clean_store(setup_path) as store: + store.append("df", df) + + result = store.select("df") + expected = df + tm.assert_frame_equal(result, expected, by_blocks=True) + + result = store.select("df", columns=df.columns) + expected = df + tm.assert_frame_equal(result, expected, by_blocks=True) + + expected = df.loc[:, ["A"]] + result = store.select("df", columns=["A"]) + tm.assert_frame_equal(result, expected, by_blocks=True) + + expected = df.loc[:, ["B", "A"]] + result = store.select("df", columns=["B", "A"]) + tm.assert_frame_equal(result, expected, by_blocks=True) + + # duplicates on both index and columns + with ensure_clean_store(setup_path) as store: + store.append("df", df) + store.append("df", df) + + expected = df.loc[:, ["B", "A"]] + expected = concat([expected, expected]) + result = store.select("df", columns=["B", "A"]) + tm.assert_frame_equal(result, expected, by_blocks=True) + + +def test_select(setup_path): + + with ensure_clean_store(setup_path) as store: + + with catch_warnings(record=True): + + # select with columns= + df = tm.makeTimeDataFrame() + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # equivalently + result = store.select("df", [("columns=['A', 'B']")]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # with a data column + _maybe_remove(store, "df") + store.append("df", df, data_columns=["A"]) + result = store.select("df", ["A > 0"], columns=["A", "B"]) + expected = df[df.A > 0].reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # all a data columns + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + result = store.select("df", ["A > 0"], columns=["A", "B"]) + expected = df[df.A > 0].reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # with a data column, but different columns + _maybe_remove(store, "df") + store.append("df", df, data_columns=["A"]) + result = store.select("df", ["A > 0"], columns=["C", "D"]) + expected = df[df.A > 0].reindex(columns=["C", "D"]) + tm.assert_frame_equal(expected, result) + + +def test_select_dtypes(setup_path): + + with ensure_clean_store(setup_path) as store: + # with a Timestamp data column (GH #2637) + df = DataFrame( + { + "ts": bdate_range("2012-01-01", periods=300), + "A": np.random.randn(300), + } + ) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["ts", "A"]) + + result = store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] + tm.assert_frame_equal(expected, result) + + # bool columns (GH #2849) + df = DataFrame(np.random.randn(5, 2), columns=["A", "B"]) + df["object"] = "foo" + df.loc[4:5, "object"] = "bar" + df["boolv"] = df["A"] > 0 + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + + expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa:E712 + for v in [True, "true", 1]: + result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) + tm.assert_frame_equal(expected, result) + + expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa:E712 + for v in [False, "false", 0]: + result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) + tm.assert_frame_equal(expected, result) + + # integer index + df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) + 
_maybe_remove(store, "df_int") + store.append("df_int", df) + result = store.select("df_int", "index<10 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) + tm.assert_frame_equal(expected, result) + + # float index + df = DataFrame( + { + "A": np.random.rand(20), + "B": np.random.rand(20), + "index": np.arange(20, dtype="f8"), + } + ) + _maybe_remove(store, "df_float") + store.append("df_float", df) + result = store.select("df_float", "index<10.0 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) + tm.assert_frame_equal(expected, result) + + with ensure_clean_store(setup_path) as store: + + # floats w/o NaN + df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + + store.append("df1", df, data_columns=True) + result = store.select("df1", where="values>2.0") + expected = df[df["values"] > 2.0] + tm.assert_frame_equal(expected, result) + + # floats with NaN + df.iloc[0] = np.nan + expected = df[df["values"] > 2.0] + + store.append("df2", df, data_columns=True, index=False) + result = store.select("df2", where="values>2.0") + tm.assert_frame_equal(expected, result) + + # https://github.com/PyTables/PyTables/issues/282 + # bug in selection when 0th row has a np.nan and an index + # store.append('df3',df,data_columns=True) + # result = store.select( + # 'df3', where='values>2.0') + # tm.assert_frame_equal(expected, result) + + # not in first position float with NaN ok too + df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + + df.iloc[1] = np.nan + expected = df[df["values"] > 2.0] + + store.append("df4", df, data_columns=True) + result = store.select("df4", where="values>2.0") + tm.assert_frame_equal(expected, result) + + # test selection with comparison against numpy scalar + # GH 11283 + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + + expected = df[df["A"] > 0] + + store.append("df", df, data_columns=True) + np_zero = np.float64(0) # noqa:F841 + result = store.select("df", where=["A>np_zero"]) + tm.assert_frame_equal(expected, result) + + +def test_select_with_many_inputs(setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + { + "ts": bdate_range("2012-01-01", periods=300), + "A": np.random.randn(300), + "B": range(300), + "users": ["a"] * 50 + + ["b"] * 50 + + ["c"] * 100 + + [f"a{i:03d}" for i in range(100)], + } + ) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["ts", "A", "B", "users"]) + + # regular select + result = store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] + tm.assert_frame_equal(expected, result) + + # small selector + result = store.select("df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']") + expected = df[ + (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"]) + ] + tm.assert_frame_equal(expected, result) + + # big selector along the columns + selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)] + result = store.select("df", "ts>=Timestamp('2012-02-01') and users=selector") + expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] + tm.assert_frame_equal(expected, result) + + selector = range(100, 200) + result = store.select("df", "B=selector") + expected = df[df.B.isin(selector)] + tm.assert_frame_equal(expected, result) + assert len(result) == 100 + + # big selector along the index 
+ selector = Index(df.ts[0:100].values) + result = store.select("df", "ts=selector") + expected = df[df.ts.isin(selector.values)] + tm.assert_frame_equal(expected, result) + assert len(result) == 100 + + +def test_select_iterator(tmp_path, setup_path): + + # single table + with ensure_clean_store(setup_path) as store: + + df = tm.makeTimeDataFrame(500) + _maybe_remove(store, "df") + store.append("df", df) + + expected = store.select("df") + + results = list(store.select("df", iterator=True)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + results = list(store.select("df", chunksize=100)) + assert len(results) == 5 + result = concat(results) + tm.assert_frame_equal(expected, result) + + results = list(store.select("df", chunksize=150)) + result = concat(results) + tm.assert_frame_equal(result, expected) + + path = tmp_path / setup_path + + df = tm.makeTimeDataFrame(500) + df.to_hdf(path, "df_non_table") + + msg = "can only use an iterator or chunksize on a table" + with pytest.raises(TypeError, match=msg): + read_hdf(path, "df_non_table", chunksize=100) + + with pytest.raises(TypeError, match=msg): + read_hdf(path, "df_non_table", iterator=True) + + path = tmp_path / setup_path + + df = tm.makeTimeDataFrame(500) + df.to_hdf(path, "df", format="table") + + results = list(read_hdf(path, "df", chunksize=100)) + result = concat(results) + + assert len(results) == 5 + tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, read_hdf(path, "df")) + + # multiple + + with ensure_clean_store(setup_path) as store: + + df1 = tm.makeTimeDataFrame(500) + store.append("df1", df1, data_columns=True) + df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format) + df2["foo"] = "bar" + store.append("df2", df2) + + df = concat([df1, df2], axis=1) + + # full selection + expected = store.select_as_multiple(["df1", "df2"], selector="df1") + results = list( + store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150) + ) + result = concat(results) + tm.assert_frame_equal(expected, result) + + +def test_select_iterator_complete_8014(setup_path): + + # GH 8014 + # using iterator and where clause + chunksize = 1e4 + + # no iterator + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[-1] + + # select w/o iteration and no where clause works + result = store.select("df") + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, single term, begin + # of range, works + where = f"index >= '{beg_dt}'" + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, single term, end + # of range, works + where = f"index <= '{end_dt}'" + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, inclusive range, + # works + where = f"index >= '{beg_dt}' & index <= '{end_dt}'" + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # with iterator, full range + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[-1] + + # select w/iterator and no where clause works + results = list(store.select("df", chunksize=chunksize)) + result = concat(results) + 
tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, single term, begin of range + where = f"index >= '{beg_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, single term, end of range + where = f"index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, inclusive range + where = f"index >= '{beg_dt}' & index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + +def test_select_iterator_non_complete_8014(setup_path): + + # GH 8014 + # using iterator and where clause + chunksize = 1e4 + + # with iterator, non complete range + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[1] + end_dt = expected.index[-2] + + # select w/iterator and where clause, single term, begin of range + where = f"index >= '{beg_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[expected.index >= beg_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, single term, end of range + where = f"index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[expected.index <= end_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, inclusive range + where = f"index >= '{beg_dt}' & index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] + tm.assert_frame_equal(rexpected, result) + + # with iterator, empty where + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + end_dt = expected.index[-1] + + # select w/iterator and where clause, single term, begin of range + where = f"index > '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + assert 0 == len(results) + + +def test_select_iterator_many_empty_frames(setup_path): + + # GH 8014 + # using iterator and where clause can return many empty + # frames. 
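+    # Note: with a where clause the iterator is expected to yield only
+    # chunks that actually contain matching rows, so a range confined to
+    # the first chunk should produce a single frame and an empty selection
+    # should produce none; the assertions below verify exactly that.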
+ chunksize = 10_000 + + # with iterator, range limited to the first chunk + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100000, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[chunksize - 1] + + # select w/iterator and where clause, single term, begin of range + where = f"index >= '{beg_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[expected.index >= beg_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, single term, end of range + where = f"index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + + assert len(results) == 1 + result = concat(results) + rexpected = expected[expected.index <= end_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, inclusive range + where = f"index >= '{beg_dt}' & index <= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + + # should be 1, is 10 + assert len(results) == 1 + result = concat(results) + rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause which selects + # *nothing*. + # + # To be consistent with Python idiom I suggest this should + # return [] e.g. `for e in []: print True` never prints + # True. + + where = f"index <= '{beg_dt}' & index >= '{end_dt}'" + results = list(store.select("df", where=where, chunksize=chunksize)) + + # should be [] + assert len(results) == 0 + + +def test_frame_select(setup_path): + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") + date = df.index[len(df) // 2] + + crit1 = Term("index>=date") + assert crit1.env.scope["date"] == date + + crit2 = "columns=['A', 'D']" + crit3 = "columns=A" + + result = store.select("frame", [crit1, crit2]) + expected = df.loc[date:, ["A", "D"]] + tm.assert_frame_equal(result, expected) + + result = store.select("frame", [crit3]) + expected = df.loc[:, ["A"]] + tm.assert_frame_equal(result, expected) + + # invalid terms + df = tm.makeTimeDataFrame() + store.append("df_time", df) + msg = "could not convert string to Timestamp" + with pytest.raises(ValueError, match=msg): + store.select("df_time", "index>0") + + # can't select if not written as table + # store['frame'] = df + # with pytest.raises(ValueError): + # store.select('frame', [crit1, crit2]) + + +def test_frame_select_complex(setup_path): + # select via complex criteria + + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df.loc[df.index[0:4], "string"] = "bar" + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table", data_columns=["string"]) + + # empty + result = store.select("df", 'index>df.index[3] & string="bar"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")] + tm.assert_frame_equal(result, expected) + + result = store.select("df", 'index>df.index[3] & string="foo"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")] + tm.assert_frame_equal(result, expected) + + # or + result = store.select("df", 'index>df.index[3] | string="bar"') + expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")] + tm.assert_frame_equal(result, expected) + + result = store.select( + "df", '(index>df.index[3] & index<=df.index[6]) | string="bar"' + ) + 
expected = df.loc[ + ((df.index > df.index[3]) & (df.index <= df.index[6])) + | (df.string == "bar") + ] + tm.assert_frame_equal(result, expected) + + # invert + result = store.select("df", 'string!="bar"') + expected = df.loc[df.string != "bar"] + tm.assert_frame_equal(result, expected) + + # invert not implemented in numexpr :( + msg = "cannot use an invert condition when passing to numexpr" + with pytest.raises(NotImplementedError, match=msg): + store.select("df", '~(string="bar")') + + # invert ok for filters + result = store.select("df", "~(columns=['A','B'])") + expected = df.loc[:, df.columns.difference(["A", "B"])] + tm.assert_frame_equal(result, expected) + + # in + result = store.select("df", "index>df.index[3] & columns in ['A','B']") + expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +def test_frame_select_complex2(tmp_path): + + pp = tmp_path / "params.hdf" + hh = tmp_path / "hist.hdf" + + # use non-trivial selection criteria + params = DataFrame({"A": [1, 1, 2, 2, 3]}) + params.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"]) + + selection = read_hdf(pp, "df", where="A=[2,3]") + hist = DataFrame( + np.random.randn(25, 1), + columns=["data"], + index=MultiIndex.from_tuples( + [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"] + ), + ) + + hist.to_hdf(hh, "df", mode="w", format="table") + + expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") + + # scope with list like + l0 = selection.index.tolist() # noqa:F841 + with HDFStore(hh) as store: + result = store.select("df", where="l1=l0") + tm.assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=l0") + tm.assert_frame_equal(result, expected) + + # index + index = selection.index # noqa:F841 + result = read_hdf(hh, "df", where="l1=index") + tm.assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=selection.index") + tm.assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=selection.index.tolist()") + tm.assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=list(selection.index)") + tm.assert_frame_equal(result, expected) + + # scope with index + with HDFStore(hh) as store: + result = store.select("df", where="l1=index") + tm.assert_frame_equal(result, expected) + + result = store.select("df", where="l1=selection.index") + tm.assert_frame_equal(result, expected) + + result = store.select("df", where="l1=selection.index.tolist()") + tm.assert_frame_equal(result, expected) + + result = store.select("df", where="l1=list(selection.index)") + tm.assert_frame_equal(result, expected) + + +def test_invalid_filtering(setup_path): + + # can't use more than one filter (atm) + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table") + + msg = "unable to collapse Joint Filters" + # not implemented + with pytest.raises(NotImplementedError, match=msg): + store.select("df", "columns=['A'] | columns=['B']") + + # in theory we could deal with this + with pytest.raises(NotImplementedError, match=msg): + store.select("df", "columns=['A','B'] & columns=['C']") + + +def test_string_select(setup_path): + # GH 2973 + with ensure_clean_store(setup_path) as store: + + df = tm.makeTimeDataFrame() + + # test string ==/!= + df["x"] = "none" + df.loc[df.index[2:7], "x"] = "" + + store.append("df", df, data_columns=["x"]) + + result = store.select("df", "x=none") + expected = df[df.x == "none"] + 
tm.assert_frame_equal(result, expected) + + result = store.select("df", "x!=none") + expected = df[df.x != "none"] + tm.assert_frame_equal(result, expected) + + df2 = df.copy() + df2.loc[df2.x == "", "x"] = np.nan + + store.append("df2", df2, data_columns=["x"]) + result = store.select("df2", "x!=none") + expected = df2[isna(df2.x)] + tm.assert_frame_equal(result, expected) + + # int ==/!= + df["int"] = 1 + df.loc[df.index[2:7], "int"] = 2 + + store.append("df3", df, data_columns=["int"]) + + result = store.select("df3", "int=2") + expected = df[df.int == 2] + tm.assert_frame_equal(result, expected) + + result = store.select("df3", "int!=2") + expected = df[df.int != 2] + tm.assert_frame_equal(result, expected) + + +def test_select_as_multiple(setup_path): + + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df2["foo"] = "bar" + + with ensure_clean_store(setup_path) as store: + + msg = "keys must be a list/tuple" + # no tables stored + with pytest.raises(TypeError, match=msg): + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") + + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) + + # exceptions + with pytest.raises(TypeError, match=msg): + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") + + with pytest.raises(TypeError, match=msg): + store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") + + msg = "'No object named df3 in the file'" + with pytest.raises(KeyError, match=msg): + store.select_as_multiple( + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) + + with pytest.raises(KeyError, match=msg): + store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") + + with pytest.raises(KeyError, match="'No object named df4 in the file'"): + store.select_as_multiple( + ["df1", "df2"], where=["A>0", "B>0"], selector="df4" + ) + + # default select + result = store.select("df1", ["A>0", "B>0"]) + expected = store.select_as_multiple( + ["df1"], where=["A>0", "B>0"], selector="df1" + ) + tm.assert_frame_equal(result, expected) + expected = store.select_as_multiple("df1", where=["A>0", "B>0"], selector="df1") + tm.assert_frame_equal(result, expected) + + # multiple + result = store.select_as_multiple( + ["df1", "df2"], where=["A>0", "B>0"], selector="df1" + ) + expected = concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected, check_freq=False) + # FIXME: 2021-01-20 this is failing with freq None vs 4B on some builds + + # multiple (diff selector) + result = store.select_as_multiple( + ["df1", "df2"], where="index>df2.index[4]", selector="df2" + ) + expected = concat([df1, df2], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) + + # test exception for diff rows + store.append("df3", tm.makeTimeDataFrame(nper=50)) + msg = "all tables must have exactly the same nrows!" 
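+        # select_as_multiple takes row coordinates from the selector table
+        # and applies them positionally to every other table, which is why
+        # tables with mismatched nrows are rejected below.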
+ with pytest.raises(ValueError, match=msg): + store.select_as_multiple( + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) + + +def test_nan_selection_bug_4858(setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + df.iloc[0] = np.nan + + expected = DataFrame( + {"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]}, + index=[3, 4, 5], + ) + + # write w/o the index on that particular column + store.append("df", df, data_columns=True, index=["cols"]) + result = store.select("df", where="values>2.0") + tm.assert_frame_equal(result, expected) + + +def test_query_with_nested_special_character(setup_path): + df = DataFrame( + { + "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"], + "b": [1, 2, 3, 4, 5, 6, 7, 8], + } + ) + expected = df[df.a == "test & test"] + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + result = store.select("test", 'a = "test & test"') + tm.assert_frame_equal(expected, result) + + +def test_query_long_float_literal(setup_path): + # GH 14241 + df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) + + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + + cutoff = 1000000000.0006 + result = store.select("test", f"A < {cutoff:.4f}") + assert result.empty + + cutoff = 1000000000.0010 + result = store.select("test", f"A > {cutoff:.4f}") + expected = df.loc[[1, 2], :] + tm.assert_frame_equal(expected, result) + + exact = 1000000000.0011 + result = store.select("test", f"A == {exact:.4f}") + expected = df.loc[[1], :] + tm.assert_frame_equal(expected, result) + + +def test_query_compare_column_type(setup_path): + # GH 15492 + df = DataFrame( + { + "date": ["2014-01-01", "2014-01-02"], + "real_date": date_range("2014-01-01", periods=2), + "float": [1.1, 1.2], + "int": [1, 2], + }, + columns=["date", "real_date", "float", "int"], + ) + + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + + ts = Timestamp("2014-01-01") # noqa:F841 + result = store.select("test", where="real_date > ts") + expected = df.loc[[1], :] + tm.assert_frame_equal(expected, result) + + for op in ["<", ">", "=="]: + # non strings to string column always fail + for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]: + query = f"date {op} v" + msg = f"Cannot compare {v} of type {type(v)} to string column" + with pytest.raises(TypeError, match=msg): + store.select("test", where=query) + + # strings to other columns must be convertible to type + v = "a" + for col in ["int", "float", "real_date"]: + query = f"{col} {op} v" + if col == "real_date": + msg = 'Given date string "a" not likely a datetime' + else: + msg = "could not convert string to " + with pytest.raises(ValueError, match=msg): + store.select("test", where=query) + + for v, col in zip( + ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"] + ): + query = f"{col} {op} v" + result = store.select("test", where=query) + + if op == "==": + expected = df.loc[[0], :] + elif op == ">": + expected = df.loc[[1], :] + else: + expected = df.loc[[], :] + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) +def test_select_empty_where(tmp_path, where): + # GH26610 + + df = DataFrame([1, 2, 3]) + path = tmp_path / "empty_where.h5" + with 
HDFStore(path) as store: + store.put("df", df, "t") + result = read_hdf(store, "df", where=where) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py new file mode 100644 index 0000000000000..06684f076aefe --- /dev/null +++ b/pandas/tests/io/pytables/test_store.py @@ -0,0 +1,1018 @@ +import datetime as dt +import hashlib +import os +import tempfile +import time +from warnings import ( + catch_warnings, + simplefilter, +) + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + concat, + date_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_store, + safe_close, +) + +_default_compressor = "blosc" + +from pandas.io.pytables import ( + HDFStore, + read_hdf, +) + +pytestmark = pytest.mark.single_cpu + + +def test_context(setup_path): + with tm.ensure_clean(setup_path) as path: + try: + with HDFStore(path) as tbl: + raise ValueError("blah") + except ValueError: + pass + with tm.ensure_clean(setup_path) as path: + with HDFStore(path) as tbl: + tbl["a"] = tm.makeDataFrame() + assert len(tbl) == 1 + assert type(tbl["a"]) == DataFrame + + +def test_no_track_times(tmp_path, setup_path): + + # GH 32682 + # enables to set track_times (see `pytables` `create_table` documentation) + + def checksum(filename, hash_factory=hashlib.md5, chunk_num_blocks=128): + h = hash_factory() + with open(filename, "rb") as f: + for chunk in iter(lambda: f.read(chunk_num_blocks * h.block_size), b""): + h.update(chunk) + return h.digest() + + def create_h5_and_return_checksum(tmp_path, track_times): + path = tmp_path / setup_path + df = DataFrame({"a": [1]}) + + with HDFStore(path, mode="w") as hdf: + hdf.put( + "table", + df, + format="table", + data_columns=True, + index=None, + track_times=track_times, + ) + + return checksum(path) + + checksum_0_tt_false = create_h5_and_return_checksum(tmp_path, track_times=False) + checksum_0_tt_true = create_h5_and_return_checksum(tmp_path, track_times=True) + + # sleep is necessary to create h5 with different creation time + time.sleep(1) + + checksum_1_tt_false = create_h5_and_return_checksum(tmp_path, track_times=False) + checksum_1_tt_true = create_h5_and_return_checksum(tmp_path, track_times=True) + + # checksums are the same if track_time = False + assert checksum_0_tt_false == checksum_1_tt_false + + # checksums are NOT same if track_time = True + assert checksum_0_tt_true != checksum_1_tt_true + + +def test_iter_empty(setup_path): + + with ensure_clean_store(setup_path) as store: + # GH 12221 + assert list(store) == [] + + +def test_repr(setup_path): + + with ensure_clean_store(setup_path) as store: + repr(store) + store.info() + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() + + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = dt.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = dt.datetime(2001, 1, 3, 0, 0) + df.loc[df.index[3:6], ["obj1"]] = np.nan + df = df._consolidate() + + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + store["df"] = df + + # make a random group in hdf space + 
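+            # (a bare PyTables group rather than a pandas object; the repr
+            # and info calls below should still handle it gracefully)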
store._handle.create_group(store._handle.root, "bah") + + assert store.filename in repr(store) + assert store.filename in str(store) + store.info() + + # storers + with ensure_clean_store(setup_path) as store: + + df = tm.makeDataFrame() + store.append("df", df) + + s = store.get_storer("df") + repr(s) + str(s) + + +def test_contains(setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() + store["foo/bar"] = tm.makeDataFrame() + assert "a" in store + assert "b" in store + assert "c" not in store + assert "foo/bar" in store + assert "/foo/bar" in store + assert "/foo/b" not in store + assert "bar" not in store + + # gh-2694: tables.NaturalNameWarning + with catch_warnings(record=True): + store["node())"] = tm.makeDataFrame() + assert "node())" in store + + +def test_versioning(setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() + df = tm.makeTimeDataFrame() + _maybe_remove(store, "df1") + store.append("df1", df[:10]) + store.append("df1", df[10:]) + assert store.root.a._v_attrs.pandas_version == "0.15.2" + assert store.root.b._v_attrs.pandas_version == "0.15.2" + assert store.root.df1._v_attrs.pandas_version == "0.15.2" + + # write a file and wipe its versioning + _maybe_remove(store, "df2") + store.append("df2", df) + + # this is an error because its table_type is appendable, but no + # version info + store.get_node("df2")._v_attrs.pandas_version = None + + msg = "'NoneType' object has no attribute 'startswith'" + + with pytest.raises(Exception, match=msg): + store.select("df2") + + +@pytest.mark.parametrize( + "where, expected", + [ + ( + "/", + { + "": ({"first_group", "second_group"}, set()), + "/first_group": (set(), {"df1", "df2"}), + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ( + "/second_group", + { + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ], +) +def test_walk(where, expected): + # GH10143 + objs = { + "df1": DataFrame([1, 2, 3]), + "df2": DataFrame([4, 5, 6]), + "df3": DataFrame([6, 7, 8]), + "df4": DataFrame([9, 10, 11]), + "s1": Series([10, 9, 8]), + # Next 3 items aren't pandas objects and should be ignored + "a1": np.array([[1, 2, 3], [4, 5, 6]]), + "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), + "tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"), + } + + with ensure_clean_store("walk_groups.hdf", mode="w") as store: + store.put("/first_group/df1", objs["df1"]) + store.put("/first_group/df2", objs["df2"]) + store.put("/second_group/df3", objs["df3"]) + store.put("/second_group/s1", objs["s1"]) + store.put("/second_group/third_group/df4", objs["df4"]) + # Create non-pandas objects + store._handle.create_array("/first_group", "a1", objs["a1"]) + store._handle.create_table("/first_group", "tb1", obj=objs["tb1"]) + store._handle.create_table("/second_group", "tb2", obj=objs["tb2"]) + + assert len(list(store.walk(where=where))) == len(expected) + for path, groups, leaves in store.walk(where=where): + assert path in expected + expected_groups, expected_frames = expected[path] + assert expected_groups == set(groups) + assert expected_frames == set(leaves) + for leaf in leaves: + frame_path = "/".join([path, leaf]) + obj = store.get(frame_path) + if "df" in leaf: + tm.assert_frame_equal(obj, objs[leaf]) + else: + tm.assert_series_equal(obj, objs[leaf]) + + +def 
test_getattr(setup_path): + + with ensure_clean_store(setup_path) as store: + + s = tm.makeTimeSeries() + store["a"] = s + + # test attribute access + result = store.a + tm.assert_series_equal(result, s) + result = getattr(store, "a") + tm.assert_series_equal(result, s) + + df = tm.makeTimeDataFrame() + store["df"] = df + result = store.df + tm.assert_frame_equal(result, df) + + # errors + for x in ["d", "mode", "path", "handle", "complib"]: + msg = f"'HDFStore' object has no attribute '{x}'" + with pytest.raises(AttributeError, match=msg): + getattr(store, x) + + # not stores + for x in ["mode", "path", "handle", "complib"]: + getattr(store, f"_{x}") + + +def test_store_dropna(tmp_path, setup_path): + df_with_missing = DataFrame( + {"col1": [0.0, np.nan, 2.0], "col2": [1.0, np.nan, np.nan]}, + index=list("abc"), + ) + df_without_missing = DataFrame( + {"col1": [0.0, 2.0], "col2": [1.0, np.nan]}, index=list("ac") + ) + + # # Test to make sure defaults are to not drop. + # # Corresponding to Issue 9382 + path = tmp_path / setup_path + df_with_missing.to_hdf(path, "df", format="table") + reloaded = read_hdf(path, "df") + tm.assert_frame_equal(df_with_missing, reloaded) + + path = tmp_path / setup_path + df_with_missing.to_hdf(path, "df", format="table", dropna=False) + reloaded = read_hdf(path, "df") + tm.assert_frame_equal(df_with_missing, reloaded) + + path = tmp_path / setup_path + df_with_missing.to_hdf(path, "df", format="table", dropna=True) + reloaded = read_hdf(path, "df") + tm.assert_frame_equal(df_without_missing, reloaded) + + +def test_to_hdf_with_min_itemsize(tmp_path, setup_path): + + path = tmp_path / setup_path + + # min_itemsize in index with to_hdf (GH 10381) + df = tm.makeMixedDataFrame().set_index("C") + df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6}) + # just make sure there is a longer string: + df2 = df.copy().reset_index().assign(C="longer").set_index("C") + df2.to_hdf(path, "ss3", append=True, format="table") + tm.assert_frame_equal(read_hdf(path, "ss3"), concat([df, df2])) + + # same as above, with a Series + df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6}) + df2["B"].to_hdf(path, "ss4", append=True, format="table") + tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) + + +@pytest.mark.parametrize("format", ["fixed", "table"]) +def test_to_hdf_errors(tmp_path, format, setup_path): + + data = ["\ud800foo"] + ser = Series(data, index=Index(data)) + path = tmp_path / setup_path + # GH 20835 + ser.to_hdf(path, "table", format=format, errors="surrogatepass") + + result = read_hdf(path, "table", errors="surrogatepass") + tm.assert_series_equal(result, ser) + + +def test_create_table_index(setup_path): + + with ensure_clean_store(setup_path) as store: + + with catch_warnings(record=True): + + def col(t, column): + return getattr(store.get_storer(t).table.cols, column) + + # data columns + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df["string2"] = "bar" + store.append("f", df, data_columns=["string", "string2"]) + assert col("f", "index").is_indexed is True + assert col("f", "string").is_indexed is True + assert col("f", "string2").is_indexed is True + + # specify index=columns + store.append("f2", df, index=["string"], data_columns=["string", "string2"]) + assert col("f2", "index").is_indexed is False + assert col("f2", "string").is_indexed is True + assert col("f2", "string2").is_indexed is False + + # try to index a non-table + _maybe_remove(store, "f2") + store.put("f2", df) + msg = "cannot create 
table index on a Fixed format store" + with pytest.raises(TypeError, match=msg): + store.create_table_index("f2") + + +def test_create_table_index_data_columns_argument(setup_path): + # GH 28156 + + with ensure_clean_store(setup_path) as store: + + with catch_warnings(record=True): + + def col(t, column): + return getattr(store.get_storer(t).table.cols, column) + + # data columns + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df["string2"] = "bar" + store.append("f", df, data_columns=["string"]) + assert col("f", "index").is_indexed is True + assert col("f", "string").is_indexed is True + + msg = "'Cols' object has no attribute 'string2'" + with pytest.raises(AttributeError, match=msg): + col("f", "string2").is_indexed + + # try to index a col which isn't a data_column + msg = ( + "column string2 is not a data_column.\n" + "In order to read column string2 you must reload the dataframe \n" + "into HDFStore and include string2 with the data_columns argument." + ) + with pytest.raises(AttributeError, match=msg): + store.create_table_index("f", columns=["string2"]) + + +def test_mi_data_columns(setup_path): + # GH 14435 + idx = MultiIndex.from_arrays( + [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] + ) + df = DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=True) + + actual = store.select("df", where="id == 1") + expected = df.iloc[[1], :] + tm.assert_frame_equal(actual, expected) + + +def test_table_mixed_dtypes(setup_path): + + # frame + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = dt.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = dt.datetime(2001, 1, 3, 0, 0) + df.loc[df.index[3:6], ["obj1"]] = np.nan + df = df._consolidate() + + with ensure_clean_store(setup_path) as store: + store.append("df1_mixed", df) + tm.assert_frame_equal(store.select("df1_mixed"), df) + + +def test_calendar_roundtrip_issue(setup_path): + + # 8591 + # doc example from tseries holiday section + weekmask_egypt = "Sun Mon Tue Wed Thu" + holidays = [ + "2012-05-01", + dt.datetime(2013, 5, 1), + np.datetime64("2014-05-01"), + ] + bday_egypt = pd.offsets.CustomBusinessDay( + holidays=holidays, weekmask=weekmask_egypt + ) + mydt = dt.datetime(2013, 4, 30) + dts = date_range(mydt, periods=5, freq=bday_egypt) + + s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split())) + + with ensure_clean_store(setup_path) as store: + + store.put("fixed", s) + result = store.select("fixed") + tm.assert_series_equal(result, s) + + store.append("table", s) + result = store.select("table") + tm.assert_series_equal(result, s) + + +def test_remove(setup_path): + + with ensure_clean_store(setup_path) as store: + + ts = tm.makeTimeSeries() + df = tm.makeDataFrame() + store["a"] = ts + store["b"] = df + _maybe_remove(store, "a") + assert len(store) == 1 + tm.assert_frame_equal(df, store["b"]) + + _maybe_remove(store, "b") + assert len(store) == 0 + + # nonexistence + with pytest.raises( + KeyError, match="'No object named a_nonexistent_store in the file'" + ): + store.remove("a_nonexistent_store") + + # pathing + store["a"] = ts + store["b/foo"] = df + _maybe_remove(store, "foo") + _maybe_remove(store, "b/foo") + assert len(store) == 1 + + store["a"] = ts + 
store["b/foo"] = df + _maybe_remove(store, "b") + assert len(store) == 1 + + # __delitem__ + store["a"] = ts + store["b"] = df + del store["a"] + del store["b"] + assert len(store) == 0 + + +def test_same_name_scoping(setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame(np.random.randn(20, 2), index=date_range("20130101", periods=20)) + store.put("df", df, format="table") + expected = df[df.index > Timestamp("20130105")] + + result = store.select("df", "index>datetime.datetime(2013,1,5)") + tm.assert_frame_equal(result, expected) + + # changes what 'datetime' points to in the namespace where + # 'select' does the lookup + + # technically an error, but allow it + result = store.select("df", "index>datetime.datetime(2013,1,5)") + tm.assert_frame_equal(result, expected) + + result = store.select("df", "index>datetime(2013,1,5)") + tm.assert_frame_equal(result, expected) + + +def test_store_index_name(setup_path): + df = tm.makeDataFrame() + df.index.name = "foo" + + with ensure_clean_store(setup_path) as store: + store["frame"] = df + recons = store["frame"] + tm.assert_frame_equal(recons, df) + + +@pytest.mark.parametrize("table_format", ["table", "fixed"]) +def test_store_index_name_numpy_str(tmp_path, table_format, setup_path): + # GH #13492 + idx = Index( + pd.to_datetime([dt.date(2000, 1, 1), dt.date(2000, 1, 2)]), + name="cols\u05d2", + ) + idx1 = Index( + pd.to_datetime([dt.date(2010, 1, 1), dt.date(2010, 1, 2)]), + name="rows\u05d0", + ) + df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) + + # This used to fail, returning numpy strings instead of python strings. + path = tmp_path / setup_path + df.to_hdf(path, "df", format=table_format) + df2 = read_hdf(path, "df") + + tm.assert_frame_equal(df, df2, check_names=True) + + assert type(df2.index.name) == str + assert type(df2.columns.name) == str + + +def test_store_series_name(setup_path): + df = tm.makeDataFrame() + series = df["A"] + + with ensure_clean_store(setup_path) as store: + store["series"] = series + recons = store["series"] + tm.assert_series_equal(recons, series) + + +def test_overwrite_node(setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeDataFrame() + ts = tm.makeTimeSeries() + store["a"] = ts + + tm.assert_series_equal(store["a"], ts) + + +def test_coordinates(setup_path): + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df") + store.append("df", df) + + # all + c = store.select_as_coordinates("df") + assert (c.values == np.arange(len(df.index))).all() + + # get coordinates back & test vs frame + _maybe_remove(store, "df") + + df = DataFrame({"A": range(5), "B": range(5)}) + store.append("df", df) + c = store.select_as_coordinates("df", ["index<3"]) + assert (c.values == np.arange(3)).all() + result = store.select("df", where=c) + expected = df.loc[0:2, :] + tm.assert_frame_equal(result, expected) + + c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) + assert (c.values == np.arange(2) + 3).all() + result = store.select("df", where=c) + expected = df.loc[3:4, :] + tm.assert_frame_equal(result, expected) + assert isinstance(c, Index) + + # multiple tables + _maybe_remove(store, "df1") + _maybe_remove(store, "df2") + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) + + c = store.select_as_coordinates("df1", ["A>0", "B>0"]) + df1_result = 
store.select("df1", c) + df2_result = store.select("df2", c) + result = concat([df1_result, df2_result], axis=1) + + expected = concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected, check_freq=False) + # FIXME: 2021-01-18 on some (mostly windows) builds we get freq=None + # but expect freq="18B" + + # pass array/mask as the coordinates + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + np.random.randn(1000, 2), index=date_range("20000101", periods=1000) + ) + store.append("df", df) + c = store.select_column("df", "index") + where = c[DatetimeIndex(c).month == 5].index + expected = df.iloc[where] + + # locations + result = store.select("df", where=where) + tm.assert_frame_equal(result, expected) + + # boolean + result = store.select("df", where=where) + tm.assert_frame_equal(result, expected) + + # invalid + msg = ( + "where must be passed as a string, PyTablesExpr, " + "or list-like of PyTablesExpr" + ) + with pytest.raises(TypeError, match=msg): + store.select("df", where=np.arange(len(df), dtype="float64")) + + with pytest.raises(TypeError, match=msg): + store.select("df", where=np.arange(len(df) + 1)) + + with pytest.raises(TypeError, match=msg): + store.select("df", where=np.arange(len(df)), start=5) + + with pytest.raises(TypeError, match=msg): + store.select("df", where=np.arange(len(df)), start=5, stop=10) + + # selection with filter + selection = date_range("20000101", periods=500) + result = store.select("df", where="index in selection") + expected = df[df.index.isin(selection)] + tm.assert_frame_equal(result, expected) + + # list + df = DataFrame(np.random.randn(10, 2)) + store.append("df2", df) + result = store.select("df2", where=[0, 3, 5]) + expected = df.iloc[[0, 3, 5]] + tm.assert_frame_equal(result, expected) + + # boolean + where = [True] * 10 + where[-2] = False + result = store.select("df2", where=where) + expected = df.loc[where] + tm.assert_frame_equal(result, expected) + + # start/stop + result = store.select("df2", start=5, stop=10) + expected = df[5:10] + tm.assert_frame_equal(result, expected) + + +def test_start_stop_table(setup_path): + + with ensure_clean_store(setup_path) as store: + + # table + df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) + store.append("df", df) + + result = store.select("df", "columns=['A']", start=0, stop=5) + expected = df.loc[0:4, ["A"]] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select("df", "columns=['A']", start=30, stop=40) + assert len(result) == 0 + expected = df.loc[30:40, ["A"]] + tm.assert_frame_equal(result, expected) + + +def test_start_stop_multiple(setup_path): + + # GH 16209 + with ensure_clean_store(setup_path) as store: + + df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) + + store.append_to_multiple( + {"selector": ["foo"], "data": None}, df, selector="selector" + ) + result = store.select_as_multiple( + ["selector", "data"], selector="selector", start=0, stop=1 + ) + expected = df.loc[[0], ["foo", "bar"]] + tm.assert_frame_equal(result, expected) + + +def test_start_stop_fixed(setup_path): + + with ensure_clean_store(setup_path) as store: + + # fixed, GH 8287 + df = DataFrame( + {"A": np.random.rand(20), "B": np.random.rand(20)}, + index=date_range("20130101", periods=20), + ) + store.put("df", df) + + result = store.select("df", start=0, stop=5) + expected = df.iloc[0:5, :] + tm.assert_frame_equal(result, expected) + + result = store.select("df", start=5, stop=10) + expected 
= df.iloc[5:10, :] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select("df", start=30, stop=40) + expected = df.iloc[30:40, :] + tm.assert_frame_equal(result, expected) + + # series + s = df.A + store.put("s", s) + result = store.select("s", start=0, stop=5) + expected = s.iloc[0:5] + tm.assert_series_equal(result, expected) + + result = store.select("s", start=5, stop=10) + expected = s.iloc[5:10] + tm.assert_series_equal(result, expected) + + # sparse; not implemented + df = tm.makeDataFrame() + df.iloc[3:5, 1:3] = np.nan + df.iloc[8:10, -2] = np.nan + + +def test_select_filter_corner(setup_path): + + df = DataFrame(np.random.randn(50, 100)) + df.index = [f"{c:3d}" for c in df.index] + df.columns = [f"{c:3d}" for c in df.columns] + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") + + crit = "columns=df.columns[:75]" + result = store.select("frame", [crit]) + tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) + + crit = "columns=df.columns[:75:2]" + result = store.select("frame", [crit]) + tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) + + +def test_path_pathlib(): + df = tm.makeDataFrame() + + result = tm.round_trip_pathlib( + lambda p: df.to_hdf(p, "df"), lambda p: read_hdf(p, "df") + ) + tm.assert_frame_equal(df, result) + + +@pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)]) +def test_contiguous_mixed_data_table(start, stop, setup_path): + # GH 17021 + df = DataFrame( + { + "a": Series([20111010, 20111011, 20111012]), + "b": Series(["ab", "cd", "ab"]), + } + ) + + with ensure_clean_store(setup_path) as store: + store.append("test_dataset", df) + + result = store.select("test_dataset", start=start, stop=stop) + tm.assert_frame_equal(df[start:stop], result) + + +def test_path_pathlib_hdfstore(): + df = tm.makeDataFrame() + + def writer(path): + with HDFStore(path) as store: + df.to_hdf(store, "df") + + def reader(path): + with HDFStore(path) as store: + return read_hdf(store, "df") + + result = tm.round_trip_pathlib(writer, reader) + tm.assert_frame_equal(df, result) + + +def test_pickle_path_localpath(): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib( + lambda p: df.to_hdf(p, "df"), lambda p: read_hdf(p, "df") + ) + tm.assert_frame_equal(df, result) + + +def test_path_localpath_hdfstore(): + df = tm.makeDataFrame() + + def writer(path): + with HDFStore(path) as store: + df.to_hdf(store, "df") + + def reader(path): + with HDFStore(path) as store: + return read_hdf(store, "df") + + result = tm.round_trip_localpath(writer, reader) + tm.assert_frame_equal(df, result) + + +def test_copy(): + + with catch_warnings(record=True): + + def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): + if new_f is None: + fd, new_f = tempfile.mkstemp() + + try: + store = HDFStore(f, "r") + tstore = store.copy(new_f, keys=keys, propindexes=propindexes, **kwargs) + + # check keys + if keys is None: + keys = store.keys() + assert set(keys) == set(tstore.keys()) + + # check indices & nrows + for k in tstore.keys(): + if tstore.get_storer(k).is_table: + new_t = tstore.get_storer(k) + orig_t = store.get_storer(k) + + assert orig_t.nrows == new_t.nrows + + # check propindixes + if propindexes: + for a in orig_t.axes: + if a.is_indexed: + assert new_t[a.name].is_indexed + + finally: + safe_close(store) + safe_close(tstore) + try: + os.close(fd) + except (OSError, ValueError): + pass + os.remove(new_f) + + # new table + df = tm.makeDataFrame() + + with tm.ensure_clean() as path: 
+ with HDFStore(path) as st: + st.append("df", df, data_columns=["A"]) + do_copy(f=path) + do_copy(f=path, propindexes=False) + + +def test_duplicate_column_name(tmp_path, setup_path): + df = DataFrame(columns=["a", "a"], data=[[0, 0]]) + + path = tmp_path / setup_path + msg = "Columns index has to be unique for fixed format" + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df", format="fixed") + + df.to_hdf(path, "df", format="table") + other = read_hdf(path, "df") + + tm.assert_frame_equal(df, other) + assert df.equals(other) + assert other.equals(df) + + +def test_preserve_timedeltaindex_type(setup_path): + # GH9635 + df = DataFrame(np.random.normal(size=(10, 5))) + df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example") + + with ensure_clean_store(setup_path) as store: + + store["df"] = df + tm.assert_frame_equal(store["df"], df) + + +def test_columns_multiindex_modified(tmp_path, setup_path): + # BUG: 7212 + + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) + + data_columns = df.index.names + df.columns.tolist() + path = tmp_path / setup_path + df.to_hdf( + path, + "df", + mode="a", + append=True, + data_columns=data_columns, + index=False, + ) + cols2load = list("BCD") + cols2load_original = list(cols2load) + # GH#10055 make sure read_hdf call does not alter cols2load inplace + read_hdf(path, "df", columns=cols2load) + assert cols2load_original == cols2load + + +def test_to_hdf_with_object_column_names(tmp_path, setup_path): + # GH9057 + + types_should_fail = [ + tm.makeIntIndex, + tm.makeFloatIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ] + types_should_run = [ + tm.makeStringIndex, + tm.makeCategoricalIndex, + ] + + for index in types_should_fail: + df = DataFrame(np.random.randn(10, 2), columns=index(2)) + path = tmp_path / setup_path + with catch_warnings(record=True): + msg = "cannot have non-object label DataIndexableCol" + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df", format="table", data_columns=True) + + for index in types_should_run: + df = DataFrame(np.random.randn(10, 2), columns=index(2)) + path = tmp_path / setup_path + with catch_warnings(record=True): + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where=f"index = [{df.index[0]}]") + assert len(result) + + +def test_hdfstore_strides(setup_path): + # GH22073 + df = DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + with ensure_clean_store(setup_path) as store: + store.put("df", df) + assert df["a"].values.strides == store["df"]["a"].values.strides + + +def test_store_bool_index(tmp_path, setup_path): + # GH#48667 + df = DataFrame([[1]], columns=[True], index=Index([False], dtype="bool")) + expected = df.copy() + + # # Test to make sure defaults are to not drop. 
+ # # Corresponding to Issue 9382 + path = tmp_path / setup_path + df.to_hdf(path, "a") + result = read_hdf(path, "a") + tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py new file mode 100644 index 0000000000000..823d2875c5417 --- /dev/null +++ b/pandas/tests/io/pytables/test_subclass.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + +from pandas.io.pytables import ( + HDFStore, + read_hdf, +) + +pytest.importorskip("tables") + + +class TestHDFStoreSubclass: + # GH 33748 + def test_supported_for_subclass_dataframe(self, tmp_path): + data = {"a": [1, 2], "b": [3, 4]} + sdf = tm.SubclassedDataFrame(data, dtype=np.intp) + + expected = DataFrame(data, dtype=np.intp) + + path = tmp_path / "temp.h5" + sdf.to_hdf(path, "df") + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) + + path = tmp_path / "temp.h5" + with HDFStore(path) as store: + store.put("df", sdf) + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) + + def test_supported_for_subclass_series(self, tmp_path): + data = [1, 2, 3] + sser = tm.SubclassedSeries(data, dtype=np.intp) + + expected = Series(data, dtype=np.intp) + + path = tmp_path / "temp.h5" + sser.to_hdf(path, "ser") + result = read_hdf(path, "ser") + tm.assert_series_equal(result, expected) + + path = tmp_path / "temp.h5" + with HDFStore(path) as store: + store.put("ser", sser) + result = read_hdf(path, "ser") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_time_series.py b/pandas/tests/io/pytables/test_time_series.py new file mode 100644 index 0000000000000..6625984961c11 --- /dev/null +++ b/pandas/tests/io/pytables/test_time_series.py @@ -0,0 +1,66 @@ +import datetime + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + _testing as tm, +) +from pandas.tests.io.pytables.common import ensure_clean_store + +pytestmark = pytest.mark.single_cpu + + +def test_store_datetime_fractional_secs(setup_path): + + with ensure_clean_store(setup_path) as store: + dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) + series = Series([0], [dt]) + store["a"] = series + assert store["a"].index[0] == dt + + +def test_tseries_indices_series(setup_path): + + with ensure_clean_store(setup_path) as store: + idx = tm.makeDateIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store["a"] = ser + result = store["a"] + + tm.assert_series_equal(result, ser) + assert result.index.freq == ser.index.freq + tm.assert_class_equal(result.index, ser.index, obj="series index") + + idx = tm.makePeriodIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store["a"] = ser + result = store["a"] + + tm.assert_series_equal(result, ser) + assert result.index.freq == ser.index.freq + tm.assert_class_equal(result.index, ser.index, obj="series index") + + +def test_tseries_indices_frame(setup_path): + + with ensure_clean_store(setup_path) as store: + idx = tm.makeDateIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + store["a"] = df + result = store["a"] + + tm.assert_frame_equal(result, df) + assert result.index.freq == df.index.freq + tm.assert_class_equal(result.index, df.index, obj="dataframe index") + + idx = tm.makePeriodIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), idx) + store["a"] = df + result = store["a"] + + tm.assert_frame_equal(result, df) + assert result.index.freq == 
df.index.freq + tm.assert_class_equal(result.index, df.index, obj="dataframe index") diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py new file mode 100644 index 0000000000000..ba125ffd28581 --- /dev/null +++ b/pandas/tests/io/pytables/test_timezones.py @@ -0,0 +1,369 @@ +from datetime import ( + date, + timedelta, +) + +import numpy as np +import pytest + +from pandas._libs.tslibs.timezones import maybe_get_tz +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_store, +) + + +def _compare_with_tz(a, b): + tm.assert_frame_equal(a, b) + + # compare the zones on each element + for c in a.columns: + for i in a.index: + a_e = a.loc[i, c] + b_e = b.loc[i, c] + if not (a_e == b_e and a_e.tz == b_e.tz): + raise AssertionError(f"invalid tz comparison [{a_e}] [{b_e}]") + + +# use maybe_get_tz instead of dateutil.tz.gettz to handle the windows +# filename issues. +gettz_dateutil = lambda x: maybe_get_tz("dateutil/" + x) +gettz_pytz = lambda x: x + + +@pytest.mark.parametrize("gettz", [gettz_dateutil, gettz_pytz]) +def test_append_with_timezones(setup_path, gettz): + # as columns + + # Single-tzinfo, no DST transition + df_est = DataFrame( + { + "A": [ + Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) + + timedelta(hours=1) * i + for i in range(5) + ] + } + ) + + # frame with all columns having same tzinfo, but different sides + # of DST transition + df_crosses_dst = DataFrame( + { + "A": Timestamp("20130102", tz=gettz("US/Eastern")), + "B": Timestamp("20130603", tz=gettz("US/Eastern")), + }, + index=range(5), + ) + + df_mixed_tz = DataFrame( + { + "A": Timestamp("20130102", tz=gettz("US/Eastern")), + "B": Timestamp("20130102", tz=gettz("EET")), + }, + index=range(5), + ) + + df_different_tz = DataFrame( + { + "A": Timestamp("20130102", tz=gettz("US/Eastern")), + "B": Timestamp("20130102", tz=gettz("CET")), + }, + index=range(5), + ) + + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df_tz") + store.append("df_tz", df_est, data_columns=["A"]) + result = store["df_tz"] + _compare_with_tz(result, df_est) + tm.assert_frame_equal(result, df_est) + + # select with tz aware + expected = df_est[df_est.A >= df_est.A[3]] + result = store.select("df_tz", where="A>=df_est.A[3]") + _compare_with_tz(result, expected) + + # ensure we include dates in DST and STD time here. 
+ _maybe_remove(store, "df_tz") + store.append("df_tz", df_crosses_dst) + result = store["df_tz"] + _compare_with_tz(result, df_crosses_dst) + tm.assert_frame_equal(result, df_crosses_dst) + + msg = ( + r"invalid info for \[values_block_1\] for \[tz\], " + r"existing_value \[(dateutil/.*)?US/Eastern\] " + r"conflicts with new value \[(dateutil/.*)?EET\]" + ) + with pytest.raises(ValueError, match=msg): + store.append("df_tz", df_mixed_tz) + + # this is ok + _maybe_remove(store, "df_tz") + store.append("df_tz", df_mixed_tz, data_columns=["A", "B"]) + result = store["df_tz"] + _compare_with_tz(result, df_mixed_tz) + tm.assert_frame_equal(result, df_mixed_tz) + + # can't append with diff timezone + msg = ( + r"invalid info for \[B\] for \[tz\], " + r"existing_value \[(dateutil/.*)?EET\] " + r"conflicts with new value \[(dateutil/.*)?CET\]" + ) + with pytest.raises(ValueError, match=msg): + store.append("df_tz", df_different_tz) + + +@pytest.mark.parametrize("gettz", [gettz_dateutil, gettz_pytz]) +def test_append_with_timezones_as_index(setup_path, gettz): + # GH#4098 example + + dti = date_range("2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern")) + dti = dti._with_freq(None) # freq doesn't round-trip + + df = DataFrame({"A": Series(range(3), index=dti)}) + + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + +def test_roundtrip_tz_aware_index(setup_path): + # GH 17618 + time = Timestamp("2000-01-01 01:00:00", tz="US/Eastern") + df = DataFrame(data=[0], index=[time]) + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="fixed") + recons = store["frame"] + tm.assert_frame_equal(recons, df) + assert recons.index[0].value == 946706400000000000 + + +def test_store_index_name_with_tz(setup_path): + # GH 13884 + df = DataFrame({"A": [1, 2]}) + df.index = DatetimeIndex([1234567890123456787, 1234567890123456788]) + df.index = df.index.tz_localize("UTC") + df.index.name = "foo" + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") + recons = store["frame"] + tm.assert_frame_equal(recons, df) + + +def test_tseries_select_index_column(setup_path): + # GH7777 + # selecting a UTC datetimeindex column did + # not preserve UTC tzinfo set before storing + + # check that no tz still works + rng = date_range("1/1/2000", "1/30/2000") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == DatetimeIndex(result.values).tz + + # check utc + rng = date_range("1/1/2000", "1/30/2000", tz="UTC") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == result.dt.tz + + # double check non-utc + rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == result.dt.tz + + +def test_timezones_fixed_format_frame_non_empty(setup_path): + with ensure_clean_store(setup_path) as store: + + # index + rng = 
date_range("1/1/2000", "1/30/2000", tz="US/Eastern") + rng = rng._with_freq(None) # freq doesn't round-trip + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + store["df"] = df + result = store["df"] + tm.assert_frame_equal(result, df) + + # as data + # GH11411 + _maybe_remove(store, "df") + df = DataFrame( + { + "A": rng, + "B": rng.tz_convert("UTC").tz_localize(None), + "C": rng.tz_convert("CET"), + "D": range(len(rng)), + }, + index=rng, + ) + store["df"] = df + result = store["df"] + tm.assert_frame_equal(result, df) + + +def test_timezones_fixed_format_empty(setup_path, tz_aware_fixture, frame_or_series): + # GH 20594 + + dtype = pd.DatetimeTZDtype(tz=tz_aware_fixture) + + obj = Series(dtype=dtype, name="A") + if frame_or_series is DataFrame: + obj = obj.to_frame() + + with ensure_clean_store(setup_path) as store: + store["obj"] = obj + result = store["obj"] + tm.assert_equal(result, obj) + + +def test_timezones_fixed_format_series_nonempty(setup_path, tz_aware_fixture): + # GH 20594 + + dtype = pd.DatetimeTZDtype(tz=tz_aware_fixture) + + with ensure_clean_store(setup_path) as store: + s = Series([0], dtype=dtype) + store["s"] = s + result = store["s"] + tm.assert_series_equal(result, s) + + +def test_fixed_offset_tz(setup_path): + rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store["frame"] = frame + recons = store["frame"] + tm.assert_index_equal(recons.index, rng) + assert rng.tz == recons.index.tz + + +@td.skip_if_windows +def test_store_timezone(setup_path): + # GH2852 + # issue storing datetime.date with a timezone as it resets when read + # back in a new timezone + + # original method + with ensure_clean_store(setup_path) as store: + + today = date(2013, 9, 10) + df = DataFrame([1, 2, 3], index=[today, today, today]) + store["obj1"] = df + result = store["obj1"] + tm.assert_frame_equal(result, df) + + # with tz setting + with ensure_clean_store(setup_path) as store: + + with tm.set_timezone("EST5EDT"): + today = date(2013, 9, 10) + df = DataFrame([1, 2, 3], index=[today, today, today]) + store["obj1"] = df + + with tm.set_timezone("CST6CDT"): + result = store["obj1"] + + tm.assert_frame_equal(result, df) + + +def test_legacy_datetimetz_object(datapath): + # legacy from < 0.17.0 + # 8260 + expected = DataFrame( + { + "A": Timestamp("20130102", tz="US/Eastern"), + "B": Timestamp("20130603", tz="CET"), + }, + index=range(5), + ) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r" + ) as store: + result = store["df"] + tm.assert_frame_equal(result, expected) + + +def test_dst_transitions(setup_path): + # make sure we are not failing on transitions + with ensure_clean_store(setup_path) as store: + times = date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="H", + ambiguous="infer", + ) + times = times._with_freq(None) # freq doesn't round-trip + + for i in [times, times + pd.Timedelta("10min")]: + _maybe_remove(store, "df") + df = DataFrame({"A": range(len(i)), "B": i}, index=i) + store.append("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + +def test_read_with_where_tz_aware_index(tmp_path, setup_path): + # GH 11926 + periods = 10 + dts = date_range("20151201", periods=periods, freq="D", tz="UTC") + mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"]) + expected = DataFrame({"MYCOL": 0}, index=mi) + + key 
= "mykey" + path = tmp_path / setup_path + with pd.HDFStore(path) as store: + store.append(key, expected, format="table", append=True) + result = pd.read_hdf(path, key, where="DATE > 20151130") + tm.assert_frame_equal(result, expected) + + +def test_py2_created_with_datetimez(datapath): + # The test HDF5 file was created in Python 2, but could not be read in + # Python 3. + # + # GH26443 + index = [Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] + expected = DataFrame({"data": 123}, index=index) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" + ) as store: + result = store["key"] + tm.assert_frame_equal(result, expected) From 1305cf70077b4e66ecfc0351c4bb37bdbe653e30 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 17:03:26 +0100 Subject: [PATCH 09/14] Pin pytables to version that is supported for 3.11 --- ci/deps/actions-311.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 7d374dbb20e8b..4e101955c4dca 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -40,7 +40,7 @@ dependencies: - pandas-gbq - psycopg2 - pymysql - - pytables + - pytables>=3.8.0 - pyarrow - pyreadstat - python-snappy From ab4ea19436844d7e7211ffce45573cb74d564d48 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 17:04:21 +0100 Subject: [PATCH 10/14] Add comment --- ci/deps/actions-311.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 4e101955c4dca..e704f88c42de9 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -40,7 +40,7 @@ dependencies: - pandas-gbq - psycopg2 - pymysql - - pytables>=3.8.0 + - pytables>=3.8.0 # first version that supports 3.11 - pyarrow - pyreadstat - python-snappy From c19d143c1b85c65722e153c87308e6716338fc69 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 17:24:21 +0100 Subject: [PATCH 11/14] Remove pytables --- ci/deps/actions-311.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index e704f88c42de9..dd15a7637439b 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -1,4 +1,4 @@ -name: pandas-dev-test +name: pandas-dev channels: - conda-forge dependencies: @@ -40,7 +40,7 @@ dependencies: - pandas-gbq - psycopg2 - pymysql - - pytables>=3.8.0 # first version that supports 3.11 + # - pytables>=3.8.0 # first version that supports 3.11 - pyarrow - pyreadstat - python-snappy From 77eac07d52242250251dfd4f62f52d89e5ab4c54 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Jan 2023 22:51:22 +0100 Subject: [PATCH 12/14] Update actions-311.yaml --- ci/deps/actions-311.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index dd15a7637439b..062fa85fdf949 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -18,7 +18,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<1.24 + - numpy - pytz # optional dependencies From 7c9dc77a7fd764282b27f4d92592af75ef1b8a88 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 23:20:00 +0100 Subject: [PATCH 13/14] Add numba --- ci/deps/actions-311.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 062fa85fdf949..f1238c9c84d16 100644 
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -34,6 +34,7 @@ dependencies:
 - jinja2
 - lxml
 - matplotlib>=3.6.1
+ # - numba  # not compatible with 3.11
 - numexpr
 - openpyxl
 - odfpy

From ff06b2b4fc937052d6092dd995a9d8f65c3204ac Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 16 Jan 2023 01:14:33 +0100
Subject: [PATCH 14/14] Update actions-311.yaml

---
 ci/deps/actions-311.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index f1238c9c84d16..8e15c7b4740c5 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -18,7 +18,7 @@ dependencies:

 # required dependencies
 - python-dateutil
- - numpy
+ - numpy<1.24.0
 - pytz

 # optional dependencies
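
Note on the net effect of the series: ci/deps/actions-311.yaml ends up at python=3.11 with numpy pinned below 1.24.0, and with pytables and numba commented out until 3.11-compatible builds are available. The sketch below is a hypothetical local sanity check and is not part of the patch series itself; the file name check_env.py and the suggested motivation for the numpy pin (1.24 removed long-deprecated aliases such as np.bool and np.object) are assumptions, not claims made by the patches.

# check_env.py -- hypothetical sanity check, not part of this series.
# Assumes an environment created from the env file, e.g.:
#   conda env create -f ci/deps/actions-311.yaml
#   conda activate pandas-dev
import importlib.util
import sys

import numpy as np

# The env file pins python=3.11.
assert sys.version_info[:2] == (3, 11), sys.version

# PATCH 14 restores the numpy<1.24.0 pin (assumed motivation: numpy 1.24
# removed deprecated aliases such as np.bool and np.object).
assert tuple(int(p) for p in np.__version__.split(".")[:2]) < (1, 24), np.__version__

# pytables and numba are commented out in the env file, so neither
# should resolve in this environment.
for mod in ("tables", "numba"):
    assert importlib.util.find_spec(mod) is None, f"{mod} unexpectedly installed"

print("environment matches ci/deps/actions-311.yaml expectations")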