Skip to content

Commit dda4c1a

Browse files
jgehrcke authored and jreback committed
COMPAT: reading generic PyTables Table format fails with sub-selection (#26818)
1 parent 2243629 commit dda4c1a

File tree

4 files changed

+108
-7
lines changed

4 files changed

+108
-7
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -699,6 +699,7 @@ I/O
699699
- Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
700700
- Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
701701
- Bug in :meth:`DataFrame.to_html` where header numbers would ignore display options when rounding (:issue:`17280`)
702+
- Bug in :func:`read_hdf` where reading a table from an HDF5 file written directly with PyTables fails with a ``ValueError`` when using a sub-selection via the ``start`` or ``stop`` arguments (:issue:`11188`)
702703
- Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`)
703704
- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
704705
- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`)

pandas/io/pytables.py

+30-6
Original file line numberDiff line numberDiff line change
@@ -1624,7 +1624,8 @@ def infer(self, handler):
16241624
new_self.read_metadata(handler)
16251625
return new_self
16261626

1627-
def convert(self, values, nan_rep, encoding, errors):
1627+
def convert(self, values, nan_rep, encoding, errors, start=None,
1628+
stop=None):
16281629
""" set the values from this selection: take = take ownership """
16291630

16301631
# values is a recarray
@@ -1813,10 +1814,29 @@ class GenericIndexCol(IndexCol):
18131814
def is_indexed(self):
18141815
return False
18151816

1816-
def convert(self, values, nan_rep, encoding, errors):
1817-
""" set the values from this selection: take = take ownership """
1817+
def convert(self, values, nan_rep, encoding, errors, start=None,
1818+
stop=None):
1819+
""" set the values from this selection: take = take ownership
1820+
1821+
Parameters
1822+
----------
1823+
1824+
values : np.ndarray
1825+
nan_rep : str
1826+
encoding : str
1827+
errors : str
1828+
start : int, optional
1829+
Table row number: the start of the sub-selection.
1830+
stop : int, optional
1831+
Table row number: the end of the sub-selection. Values larger than
1832+
the underlying table's row count are normalized to that.
1833+
"""
1834+
1835+
start = start if start is not None else 0
1836+
stop = (min(stop, self.table.nrows)
1837+
if stop is not None else self.table.nrows)
1838+
self.values = Int64Index(np.arange(stop - start))
18181839

1819-
self.values = Int64Index(np.arange(self.table.nrows))
18201840
return self
18211841

18221842
def get_attr(self):
@@ -2159,7 +2179,8 @@ def validate_attr(self, append):
21592179
raise ValueError("appended items dtype do not match existing "
21602180
"items dtype in table!")
21612181

2162-
def convert(self, values, nan_rep, encoding, errors):
2182+
def convert(self, values, nan_rep, encoding, errors, start=None,
2183+
stop=None):
21632184
"""set the data from this selection (and convert to the correct dtype
21642185
if we can)
21652186
"""
@@ -3431,8 +3452,11 @@ def read_axes(self, where, **kwargs):
34313452
# convert the data
34323453
for a in self.axes:
34333454
a.set_info(self.info)
3455+
# `kwargs` may contain `start` and `stop` arguments if passed to
3456+
# `store.select()`. If set they determine the index size.
34343457
a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding,
3435-
errors=self.errors)
3458+
errors=self.errors, start=kwargs.get('start'),
3459+
stop=kwargs.get('stop'))
34363460

34373461
return True
34383462

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import pytest
2+
3+
import pandas as pd
4+
from pandas.tests.io.test_pytables import ensure_clean_path
5+
from pandas.util.testing import assert_frame_equal
6+
7+
tables = pytest.importorskip('tables')
8+
9+
10+
@pytest.fixture
11+
def pytables_hdf5_file():
12+
"""Use PyTables to create a simple HDF5 file."""
13+
14+
table_schema = {
15+
'c0': tables.Time64Col(pos=0),
16+
'c1': tables.StringCol(5, pos=1),
17+
'c2': tables.Int64Col(pos=2),
18+
}
19+
20+
t0 = 1561105000.0
21+
22+
testsamples = [
23+
{'c0': t0, 'c1': 'aaaaa', 'c2': 1},
24+
{'c0': t0 + 1, 'c1': 'bbbbb', 'c2': 2},
25+
{'c0': t0 + 2, 'c1': 'ccccc', 'c2': 10**5},
26+
{'c0': t0 + 3, 'c1': 'ddddd', 'c2': 4294967295},
27+
]
28+
29+
objname = 'pandas_test_timeseries'
30+
31+
with ensure_clean_path('written_with_pytables.h5') as path:
32+
# The `ensure_clean_path` context mgr removes the temp file upon exit.
33+
with tables.open_file(path, mode='w') as f:
34+
t = f.create_table('/', name=objname, description=table_schema)
35+
for sample in testsamples:
36+
for key, value in sample.items():
37+
t.row[key] = value
38+
t.row.append()
39+
40+
yield path, objname, pd.DataFrame(testsamples)
41+
42+
43+
class TestReadPyTablesHDF5:
44+
"""
45+
A group of tests which covers reading HDF5 files written by plain PyTables
46+
(not written by pandas).
47+
48+
Was introduced for regression-testing issue 11188.
49+
"""
50+
51+
def test_read_complete(self, pytables_hdf5_file):
52+
path, objname, df = pytables_hdf5_file
53+
result = pd.read_hdf(path, key=objname)
54+
expected = df
55+
assert_frame_equal(result, expected)
56+
57+
def test_read_with_start(self, pytables_hdf5_file):
58+
path, objname, df = pytables_hdf5_file
59+
# This is a regression test for pandas-dev/pandas/issues/11188
60+
result = pd.read_hdf(path, key=objname, start=1)
61+
expected = df[1:].reset_index(drop=True)
62+
assert_frame_equal(result, expected)
63+
64+
def test_read_with_stop(self, pytables_hdf5_file):
65+
path, objname, df = pytables_hdf5_file
66+
# This is a regression test for pandas-dev/pandas/issues/11188
67+
result = pd.read_hdf(path, key=objname, stop=1)
68+
expected = df[:1].reset_index(drop=True)
69+
assert_frame_equal(result, expected)
70+
71+
def test_read_with_startstop(self, pytables_hdf5_file):
72+
path, objname, df = pytables_hdf5_file
73+
# This is a regression test for pandas-dev/pandas/issues/11188
74+
result = pd.read_hdf(path, key=objname, start=1, stop=2)
75+
expected = df[1:2].reset_index(drop=True)
76+
assert_frame_equal(result, expected)

pandas/tests/io/test_pytables.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def ensure_clean_store(path, mode='a', complevel=None, complib=None,
105105
def ensure_clean_path(path):
106106
"""
107107
return essentially a named temporary file that is not opened
108-
and deleted on existing; if path is a list, then create and
108+
and deleted on exiting; if path is a list, then create and
109109
return list of filenames
110110
"""
111111
try:

0 commit comments

Comments
 (0)