pandas-dev · jreback · Aug 5, 2013 · Aug 5, 2013
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -2069,6 +2069,22 @@ These do not currently accept the ``where`` selector (coming soon)
    store.select_column('df_dc', 'index')
    store.select_column('df_dc', 'string')
 
+.. _io.hdf5-where_mask:
+
+**Selecting using a where mask**
+
+Sometimes your query can involve creating a list of rows to select. Usually this ``mask`` would
+be a resulting ``index`` from an indexing operation. This example selects the rows of
+a ``DatetimeIndex`` whose month is 5 (May).
+
+.. ipython:: python
+
+   df_mask = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000))
+   store.append('df_mask',df_mask)
+   c = store.select_column('df_mask','index')
+   where = c[DatetimeIndex(c).month==5].index
+   store.select('df_mask',where=where)
+
 **Replicating or**
 
 ``not`` and ``or`` conditions are unsupported at this time; however,

diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -81,6 +81,7 @@ pandas 0.13
       duplicate rows from a table (:issue:`4367`)
     - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
       be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
+    - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`)
 
 **Experimental Features**
 

diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -59,17 +59,19 @@ API changes
          store2.close()
          store2
 
+    - removed the ``_quiet`` attribute, replaced by a ``DuplicateWarning`` if retrieving
+      duplicate rows from a table (:issue:`4367`)
+    - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
+      be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
+    - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`).
+      See :ref:`here<io.hdf5-where_mask>` for an example.
+
       .. ipython:: python
          :suppress:
 
          import os
          os.remove(path)
 
-    - removed the ``_quiet`` attribute, replace by a ``DuplicateWarning`` if retrieving
-      duplicate rows from a table (:issue:`4367`)
-    - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
-      be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
-
 Enhancements
 ~~~~~~~~~~~~
 

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -744,7 +744,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, *
             dc = data_columns if k == selector else None
 
             # compute the val
-            val = value.reindex_axis(v, axis=axis, copy=False)
+            val = value.reindex_axis(v, axis=axis)
 
             self.append(k, val, data_columns=dc, **kwargs)
 
@@ -2674,7 +2674,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
 
         # reindex by our non_index_axes & compute data_columns
         for a in self.non_index_axes:
-            obj = obj.reindex_axis(a[1], axis=a[0], copy=False)
+            obj = obj.reindex_axis(a[1], axis=a[0])
 
         # figure out data_columns and get out blocks
         block_obj = self.get_object(obj).consolidate()
@@ -2684,10 +2684,10 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
             data_columns = self.validate_data_columns(data_columns, min_itemsize)
             if len(data_columns):
                 blocks = block_obj.reindex_axis(Index(axis_labels) - Index(
-                        data_columns), axis=axis, copy=False)._data.blocks
+                        data_columns), axis=axis)._data.blocks
                 for c in data_columns:
                     blocks.extend(block_obj.reindex_axis(
-                            [c], axis=axis, copy=False)._data.blocks)
+                            [c], axis=axis)._data.blocks)
 
         # reorder the blocks in the same order as the existing_table if we can
         if existing_table is not None:
@@ -2760,7 +2760,7 @@ def process_axes(self, obj, columns=None):
         for axis, labels in self.non_index_axes:
             if columns is not None:
                 labels = Index(labels) & Index(columns)
-            obj = obj.reindex_axis(labels, axis=axis, copy=False)
+            obj = obj.reindex_axis(labels, axis=axis)
 
         # apply the selection filters (but keep in the same order)
         if self.selection.filter:
@@ -3765,9 +3765,34 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
         self.terms = None
         self.coordinates = None
 
+        # a coordinate
         if isinstance(where, Coordinates):
             self.coordinates = where.values
-        else:
+
+        elif com.is_list_like(where):
+
+            # see if we were passed something coordinate-like (integer locations or a boolean mask)
+            try:
+                inferred = lib.infer_dtype(where)
+                if inferred=='integer' or inferred=='boolean':
+                    where = np.array(where)
+                    if where.dtype == np.bool_:
+                        start, stop = self.start, self.stop
+                        if start is None:
+                            start = 0
+                        if stop is None:
+                            stop = self.table.nrows
+                        self.coordinates = np.arange(start,stop)[where]
+                    elif issubclass(where.dtype.type,np.integer):
+                        if (self.start is not None and (where<self.start).any()) or (self.stop is not None and (where>=self.stop).any()):
+                            raise ValueError("where must have index locations >= start and < stop")
+                        self.coordinates = where
+
+            except:
+                pass
+
+        if self.coordinates is None:
+
             self.terms = self.generate(where)
 
             # create the numexpr & the filter

diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -11,7 +11,7 @@
 
 import pandas
 from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
-                    date_range, Index)
+                    date_range, Index, DatetimeIndex)
 from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf,
                                 IncompatibilityWarning, PerformanceWarning,
                                 AttributeConflictWarning, DuplicateWarning,
@@ -2535,6 +2535,43 @@ def test_coordinates(self):
             expected = expected[(expected.A > 0) & (expected.B > 0)]
             tm.assert_frame_equal(result, expected)
 
+        # pass array/mask as the coordinates
+        with ensure_clean(self.path) as store:
+
+            df = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000))
+            store.append('df',df)
+            c = store.select_column('df','index')
+            where = c[DatetimeIndex(c).month==5].index
+            expected = df.iloc[where]
+
+            # locations
+            result = store.select('df',where=where)
+            tm.assert_frame_equal(result,expected)
+
+            # boolean
+            result = store.select('df',where=where)
+            tm.assert_frame_equal(result,expected)
+
+            # invalid
+            self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df),dtype='float64'))
+            self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)+1))
+            self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5)
+            self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5,stop=10)
+
+            # list
+            df = DataFrame(np.random.randn(10,2))
+            store.append('df2',df)
+            result = store.select('df2',where=[0,3,5])
+            expected = df.iloc[[0,3,5]]
+            tm.assert_frame_equal(result,expected)
+
+            # boolean
+            where = [True] * 10
+            where[-2] = False
+            result = store.select('df2',where=where)
+            expected = df.loc[where]
+            tm.assert_frame_equal(result,expected)
+
     def test_append_to_multiple(self):
         df1 = tm.makeTimeDataFrame()
         df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)