COMPAT: reading generic PyTables Table format fails with sub-selection (#26818)

jgehrcke · jreback · commit dda4c1a89cb0 · 2019-06-21T10:19:43.000-04:00
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -699,6 +699,7 @@ I/O
 - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
 - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
 - Bug in :meth:`DataFrame.to_html` where header numbers would ignore display options when rounding (:issue:`17280`)
+- Bug in :func:`read_hdf` where reading a table from an HDF5 file written directly with PyTables fails with a ``ValueError`` when using a sub-selection via the ``start`` or ``stop`` arguments (:issue:`11188`)
 - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`)
 - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
 - Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -1624,7 +1624,8 @@ def infer(self, handler):
         new_self.read_metadata(handler)
         return new_self
 
-    def convert(self, values, nan_rep, encoding, errors):
+    def convert(self, values, nan_rep, encoding, errors, start=None,
+                stop=None):
         """ set the values from this selection: take = take ownership """
 
         # values is a recarray
@@ -1813,10 +1814,29 @@ class GenericIndexCol(IndexCol):
     def is_indexed(self):
         return False
 
-    def convert(self, values, nan_rep, encoding, errors):
-        """ set the values from this selection: take = take ownership """
+    def convert(self, values, nan_rep, encoding, errors, start=None,
+                stop=None):
+        """Build a zero-based integer index for the selected rows.
+
+        Parameters
+        ----------
+
+        values : np.ndarray
+        nan_rep : str
+        encoding : str
+        errors : str
+        start : int, optional
+            Table row number: the start of the sub-selection.
+        stop : int, optional
+            Table row number: the end of the sub-selection. Values larger than
+            the underlying table's row count are clamped to the row count.
+        """
+
+        start = start if start is not None else 0
+        stop = (min(stop, self.table.nrows)
+                if stop is not None else self.table.nrows)
+        self.values = Int64Index(np.arange(stop - start))
 
-        self.values = Int64Index(np.arange(self.table.nrows))
         return self
 
     def get_attr(self):
@@ -2159,7 +2179,8 @@ def validate_attr(self, append):
                 raise ValueError("appended items dtype do not match existing "
                                  "items dtype in table!")
 
-    def convert(self, values, nan_rep, encoding, errors):
+    def convert(self, values, nan_rep, encoding, errors, start=None,
+                stop=None):
         """set the data from this selection (and convert to the correct dtype
         if we can)
         """
@@ -3431,8 +3452,11 @@ def read_axes(self, where, **kwargs):
         # convert the data
         for a in self.axes:
             a.set_info(self.info)
+            # `kwargs` may contain `start` and `stop` arguments if passed to
+            # `store.select()`. If set, they determine the index size.
             a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding,
-                      errors=self.errors)
+                      errors=self.errors, start=kwargs.get('start'),
+                      stop=kwargs.get('stop'))
 
         return True
 
diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py
@@ -0,0 +1,76 @@
+import pytest
+
+import pandas as pd
+from pandas.tests.io.test_pytables import ensure_clean_path
+from pandas.util.testing import assert_frame_equal
+
+tables = pytest.importorskip('tables')
+
+
+@pytest.fixture
+def pytables_hdf5_file():
+    """Use PyTables to create a simple HDF5 file."""
+
+    table_schema = {
+        'c0': tables.Time64Col(pos=0),
+        'c1': tables.StringCol(5, pos=1),
+        'c2': tables.Int64Col(pos=2),
+    }
+
+    t0 = 1561105000.0
+
+    testsamples = [
+        {'c0': t0, 'c1': 'aaaaa', 'c2': 1},
+        {'c0': t0 + 1, 'c1': 'bbbbb', 'c2': 2},
+        {'c0': t0 + 2, 'c1': 'ccccc', 'c2': 10**5},
+        {'c0': t0 + 3, 'c1': 'ddddd', 'c2': 4294967295},
+    ]
+
+    objname = 'pandas_test_timeseries'
+
+    with ensure_clean_path('written_with_pytables.h5') as path:
+        # The `ensure_clean_path` context mgr removes the temp file upon exit.
+        with tables.open_file(path, mode='w') as f:
+            t = f.create_table('/', name=objname, description=table_schema)
+            for sample in testsamples:
+                for key, value in sample.items():
+                    t.row[key] = value
+                t.row.append()
+
+        yield path, objname, pd.DataFrame(testsamples)
+
+
+class TestReadPyTablesHDF5:
+    """
+    A group of tests which covers reading HDF5 files written by plain PyTables
+    (not written by pandas).
+
+    It was introduced as a regression test for issue 11188.
+    """
+
+    def test_read_complete(self, pytables_hdf5_file):
+        path, objname, df = pytables_hdf5_file
+        result = pd.read_hdf(path, key=objname)
+        expected = df
+        assert_frame_equal(result, expected)
+
+    def test_read_with_start(self, pytables_hdf5_file):
+        path, objname, df = pytables_hdf5_file
+        # This is a regression test for pandas-dev/pandas/issues/11188
+        result = pd.read_hdf(path, key=objname, start=1)
+        expected = df[1:].reset_index(drop=True)
+        assert_frame_equal(result, expected)
+
+    def test_read_with_stop(self, pytables_hdf5_file):
+        path, objname, df = pytables_hdf5_file
+        # This is a regression test for pandas-dev/pandas/issues/11188
+        result = pd.read_hdf(path, key=objname, stop=1)
+        expected = df[:1].reset_index(drop=True)
+        assert_frame_equal(result, expected)
+
+    def test_read_with_startstop(self, pytables_hdf5_file):
+        path, objname, df = pytables_hdf5_file
+        # This is a regression test for pandas-dev/pandas/issues/11188
+        result = pd.read_hdf(path, key=objname, start=1, stop=2)
+        expected = df[1:2].reset_index(drop=True)
+        assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
@@ -105,7 +105,7 @@ def ensure_clean_store(path, mode='a', complevel=None, complib=None,
 def ensure_clean_path(path):
     """
     return essentially a named temporary file that is not opened
-    and deleted on existing; if path is a list, then create and
+    and deleted on exiting; if path is a list, then create and
     return list of filenames
     """
     try: