Skip to content

ENH: allow where to be a list/array or a boolean mask of locations (GH4467) #4470

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 5, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2069,6 +2069,22 @@ These do not currently accept the ``where`` selector (coming soon)
store.select_column('df_dc', 'index')
store.select_column('df_dc', 'string')

.. _io.hdf5-where_mask:

**Selecting using a where mask**

Sometimes your query can involve creating a list of rows to select. Usually this ``mask`` would
be a resulting ``index`` from an indexing operation. This example selects the rows of
a ``DatetimeIndex`` whose month is 5 (May).

.. ipython:: python

df_mask = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000))
store.append('df_mask',df_mask)
c = store.select_column('df_mask','index')
where = c[DatetimeIndex(c).month==5].index
store.select('df_mask',where=where)

**Replicating or**

``not`` and ``or`` conditions are unsupported at this time; however,
Expand Down
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ pandas 0.13
duplicate rows from a table (:issue:`4367`)
- removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
- allow a passed locations array or mask as a ``where`` condition (:issue:`4467`)

**Experimental Features**

Expand Down
12 changes: 7 additions & 5 deletions doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,19 @@ API changes
store2.close()
store2

- removed the ``_quiet`` attribute, replaced by a ``DuplicateWarning`` if retrieving
duplicate rows from a table (:issue:`4367`)
- removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
- allow a passed locations array or mask as a ``where`` condition (:issue:`4467`).
See :ref:`here<io.hdf5-where_mask>` for an example.

.. ipython:: python
:suppress:

import os
os.remove(path)

- removed the ``_quiet`` attribute, replaced by a ``DuplicateWarning`` if retrieving
duplicate rows from a table (:issue:`4367`)
- removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will
be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)

Enhancements
~~~~~~~~~~~~

Expand Down
37 changes: 31 additions & 6 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, *
dc = data_columns if k == selector else None

# compute the val
val = value.reindex_axis(v, axis=axis, copy=False)
val = value.reindex_axis(v, axis=axis)

self.append(k, val, data_columns=dc, **kwargs)

Expand Down Expand Up @@ -2674,7 +2674,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,

# reindex by our non_index_axes & compute data_columns
for a in self.non_index_axes:
obj = obj.reindex_axis(a[1], axis=a[0], copy=False)
obj = obj.reindex_axis(a[1], axis=a[0])

# figure out data_columns and get out blocks
block_obj = self.get_object(obj).consolidate()
Expand All @@ -2684,10 +2684,10 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
data_columns = self.validate_data_columns(data_columns, min_itemsize)
if len(data_columns):
blocks = block_obj.reindex_axis(Index(axis_labels) - Index(
data_columns), axis=axis, copy=False)._data.blocks
data_columns), axis=axis)._data.blocks
for c in data_columns:
blocks.extend(block_obj.reindex_axis(
[c], axis=axis, copy=False)._data.blocks)
[c], axis=axis)._data.blocks)

# reorder the blocks in the same order as the existing_table if we can
if existing_table is not None:
Expand Down Expand Up @@ -2760,7 +2760,7 @@ def process_axes(self, obj, columns=None):
for axis, labels in self.non_index_axes:
if columns is not None:
labels = Index(labels) & Index(columns)
obj = obj.reindex_axis(labels, axis=axis, copy=False)
obj = obj.reindex_axis(labels, axis=axis)

# apply the selection filters (but keep in the same order)
if self.selection.filter:
Expand Down Expand Up @@ -3765,9 +3765,34 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
self.terms = None
self.coordinates = None

# a coordinate
if isinstance(where, Coordinates):
self.coordinates = where.values
else:

elif com.is_list_like(where):

# see if we have a passed coordinate like
try:
inferred = lib.infer_dtype(where)
if inferred=='integer' or inferred=='boolean':
where = np.array(where)
if where.dtype == np.bool_:
start, stop = self.start, self.stop
if start is None:
start = 0
if stop is None:
stop = self.table.nrows
self.coordinates = np.arange(start,stop)[where]
elif issubclass(where.dtype.type,np.integer):
if (self.start is not None and (where<self.start).any()) or (self.stop is not None and (where>=self.stop).any()):
raise ValueError("where must have index locations >= start and < stop")
self.coordinates = where

except:
pass

if self.coordinates is None:

self.terms = self.generate(where)

# create the numexpr & the filter
Expand Down
39 changes: 38 additions & 1 deletion pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import pandas
from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
date_range, Index)
date_range, Index, DatetimeIndex)
from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf,
IncompatibilityWarning, PerformanceWarning,
AttributeConflictWarning, DuplicateWarning,
Expand Down Expand Up @@ -2535,6 +2535,43 @@ def test_coordinates(self):
expected = expected[(expected.A > 0) & (expected.B > 0)]
tm.assert_frame_equal(result, expected)

# pass array/mask as the coordinates
with ensure_clean(self.path) as store:

df = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000))
store.append('df',df)
c = store.select_column('df','index')
where = c[DatetimeIndex(c).month==5].index
expected = df.iloc[where]

# locations
result = store.select('df',where=where)
tm.assert_frame_equal(result,expected)

# boolean
result = store.select('df',where=where)
tm.assert_frame_equal(result,expected)

# invalid
self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df),dtype='float64'))
self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)+1))
self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5)
self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5,stop=10)

# list
df = DataFrame(np.random.randn(10,2))
store.append('df2',df)
result = store.select('df2',where=[0,3,5])
expected = df.iloc[[0,3,5]]
tm.assert_frame_equal(result,expected)

# boolean
where = [True] * 10
where[-2] = False
result = store.select('df2',where=where)
expected = df.loc[where]
tm.assert_frame_equal(result,expected)

def test_append_to_multiple(self):
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
Expand Down