diff --git a/doc/source/io.rst b/doc/source/io.rst
index 03afd37555b67..d51bf4c83ad0b 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2069,6 +2069,22 @@ These do not currently accept the ``where`` selector (coming soon)
    store.select_column('df_dc', 'index')
    store.select_column('df_dc', 'string')
 
+.. _io.hdf5-where_mask:
+
+**Selecting using a where mask**
+
+Sometimes your query can involve creating a list of rows to select. Usually this ``mask`` would
+be a resulting ``index`` from an indexing operation. This example selects the rows of a
+``DatetimeIndex`` where the month is 5.
+
+.. ipython:: python
+
+   df_mask = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000))
+   store.append('df_mask',df_mask)
+   c = store.select_column('df_mask','index')
+   where = c[DatetimeIndex(c).month==5].index
+   store.select('df_mask',where=where)
+
 **Replicating or**
 
 ``not`` and ``or`` conditions are unsupported at this time; however,
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 1aaaf1f8b5a14..35f422ccad9dc 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -81,6 +81,7 @@ pandas 0.13
     duplicate rows from a table (:issue:`4367`)
   - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
     be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
+  - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`)
 
 **Experimental Features**
 
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
index 0e2432a8b2b10..7da2f03ad4c74 100644
--- a/doc/source/v0.13.0.txt
+++ b/doc/source/v0.13.0.txt
@@ -59,17 +59,19 @@ API changes
     store2.close()
     store2
 
+  - removed the ``_quiet`` attribute, replaced by a ``DuplicateWarning`` if retrieving
+    duplicate rows from a table (:issue:`4367`)
+  - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
+    be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
+  - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`).
+    See :ref:`here <io.hdf5-where_mask>` for an example.
+
   .. ipython:: python
      :suppress:
 
      import os
      os.remove(path)
 
-  - removed the ``_quiet`` attribute, replace by a ``DuplicateWarning`` if retrieving
-    duplicate rows from a table (:issue:`4367`)
-  - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
-    be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
-
 Enhancements
 ~~~~~~~~~~~~
 
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 2f0374e60c955..9034007be2f6e 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -744,7 +744,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs):
             dc = data_columns if k == selector else None
 
             # compute the val
-            val = value.reindex_axis(v, axis=axis, copy=False)
+            val = value.reindex_axis(v, axis=axis)
 
             self.append(k, val, data_columns=dc, **kwargs)
 
@@ -2674,7 +2674,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
 
         # reindex by our non_index_axes & compute data_columns
         for a in self.non_index_axes:
-            obj = obj.reindex_axis(a[1], axis=a[0], copy=False)
+            obj = obj.reindex_axis(a[1], axis=a[0])
 
         # figure out data_columns and get out blocks
         block_obj = self.get_object(obj).consolidate()
@@ -2684,10 +2684,10 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
             data_columns = self.validate_data_columns(data_columns, min_itemsize)
             if len(data_columns):
                 blocks = block_obj.reindex_axis(Index(axis_labels) - Index(
-                    data_columns), axis=axis, copy=False)._data.blocks
+                    data_columns), axis=axis)._data.blocks
                 for c in data_columns:
                     blocks.extend(block_obj.reindex_axis(
-                        [c], axis=axis, copy=False)._data.blocks)
+                        [c], axis=axis)._data.blocks)
 
         # reorder the blocks in the same order as the existing_table if we can
         if existing_table is not None:
@@ -2760,7 +2760,7 @@ def process_axes(self, obj, columns=None):
         for axis, labels in self.non_index_axes:
             if columns is not None:
                 labels = Index(labels) & Index(columns)
-            obj = obj.reindex_axis(labels, axis=axis, copy=False)
+            obj = obj.reindex_axis(labels, axis=axis)
 
         # apply the selection filters (but keep in the same order)
         if self.selection.filter:
@@ -3765,9 +3765,34 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
         self.terms = None
         self.coordinates = None
 
+        # a coordinate
         if isinstance(where, Coordinates):
             self.coordinates = where.values
-        else:
+
+        elif com.is_list_like(where):
+
+            # see if we have a passed coordinate like
+            try:
+                inferred = lib.infer_dtype(where)
+                if inferred=='integer' or inferred=='boolean':
+                    where = np.array(where)
+                    if where.dtype == np.bool_:
+                        start, stop = self.start, self.stop
+                        if start is None:
+                            start = 0
+                        if stop is None:
+                            stop = self.table.nrows
+                        self.coordinates = np.arange(start,stop)[where]
+                    elif issubclass(where.dtype.type,np.integer):
+                        if (self.start is not None and (where<self.start).any()) or (self.stop is not None and (where>=self.stop).any()):
+                            raise ValueError("where must have index locations >= start and < stop")
+                        self.coordinates = where
+
+            except:
+                pass
+
+        if self.coordinates is None:
+
             self.terms = self.generate(where)
 
             # create the numexpr & the filter
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index a5c4cb49bead8..ec2dce753c6b5 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -11,7 +11,7 @@
 
 import pandas
 from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
-                    date_range, Index)
+                    date_range, Index, DatetimeIndex)
 from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf,
                                 IncompatibilityWarning, PerformanceWarning,
                                 AttributeConflictWarning, DuplicateWarning,
@@ -2535,6 +2535,43 @@ def test_coordinates(self):
             expected = expected[(expected.A > 0) & (expected.B > 0)]
             tm.assert_frame_equal(result, expected)
 
+        # pass array/mask as the coordinates
+        with ensure_clean(self.path) as store:
+
+            df = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000))
+            store.append('df',df)
+            c = store.select_column('df','index')
+            where = c[DatetimeIndex(c).month==5].index
+            expected = df.iloc[where]
+
+            # locations
+            result = store.select('df',where=where)
+            tm.assert_frame_equal(result,expected)
+
+            # boolean
+            result = store.select('df',where=DatetimeIndex(c).month==5)
+            tm.assert_frame_equal(result,expected)
+
+            # invalid
+            self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df),dtype='float64'))
+            self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)+1))
+            self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5)
+            self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5,stop=10)
+
+            # list
+            df = DataFrame(np.random.randn(10,2))
+            store.append('df2',df)
+            result = store.select('df2',where=[0,3,5])
+            expected = df.iloc[[0,3,5]]
+            tm.assert_frame_equal(result,expected)
+
+            # boolean
+            where = [True] * 10
+            where[-2] = False
+            result = store.select('df2',where=where)
+            expected = df.loc[where]
+            tm.assert_frame_equal(result,expected)
+
     def test_append_to_multiple(self):
         df1 = tm.makeTimeDataFrame()
         df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
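A minimal usage sketch of the behavior this patch enables, mirroring the new docs and tests above. It assumes a pandas build that includes the change; the ``store.h5`` path and the random frame are illustrative only.

    import numpy as np
    from pandas import DataFrame, DatetimeIndex, HDFStore, date_range

    # build a small table-format store to select from (illustrative data)
    store = HDFStore('store.h5', mode='w')
    df = DataFrame(np.random.randn(1000, 2), index=date_range('20000101', periods=1000))
    store.append('df', df)

    # integer locations: query a stored column, keep the positions of matching rows
    c = store.select_column('df', 'index')        # the stored index, as a Series
    locs = c[DatetimeIndex(c).month == 5].index   # row positions where month == 5
    result = store.select('df', where=locs)

    # boolean mask: one entry per table row; the True rows are returned
    mask = DatetimeIndex(c).month == 5
    result_mask = store.select('df', where=mask)

    store.close()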