diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index a77e2c928abfa..6eb141930f274 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -231,22 +231,49 @@ Note, with the :ref:`advanced indexing ` ``ix`` method, you may select along more than one axis using boolean vectors combined with other indexing expressions. -Indexing a DataFrame with a boolean DataFrame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Where and Masking +~~~~~~~~~~~~~~~~~ -You may wish to set values on a DataFrame based on some boolean criteria -derived from itself or another DataFrame or set of DataFrames. This can be done -intuitively like so: +Selecting values from a DataFrame is accomplished in a similar manner to a Series. +You index the Frame with a boolean DataFrame of the same size. This is accomplished +via the method `where` under the hood. The returned view of the DataFrame is the +same size as the original. + +.. ipython:: python + + df < 0 + df[df < 0] + +In addition, `where` takes an optional `other` argument for replacement in the +returned copy. + +.. ipython:: python + + df.where(df < 0, -df) + +You may wish to set values on a DataFrame based on some boolean criteria. +This can be done intuitively like so: .. ipython:: python + df2 = df.copy() - df2 < 0 df2[df2 < 0] = 0 df2 -Note that such an operation requires that the boolean DataFrame is indexed -exactly the same. +Furthermore, `where` aligns the input boolean condition (ndarray or DataFrame), such that partial selection +with setting is possible. This is analogous to partial setting via `.ix` (but on the contents rather than the axis labels) + +.. ipython:: python + + df2 = df.copy() + df2[ df2[1:4] > 0 ] = 3 + df2 + +`DataFrame.mask` is the inverse boolean operation of `where`. + +.. ipython:: python + + df.mask(df >= 0) Take Methods diff --git a/doc/source/io.rst b/doc/source/io.rst index f74120ad7ef57..76bd123acf8aa 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1,3 +1,4 @@ + .. 
_io: .. currentmodule:: pandas @@ -812,8 +813,114 @@ In a current or later Python session, you can retrieve stored objects: os.remove('store.h5') -.. Storing in Table format -.. ~~~~~~~~~~~~~~~~~~~~~~~ +Storing in Table format +~~~~~~~~~~~~~~~~~~~~~~~ + +```HDFStore``` supports another *PyTables* format on disk, the *table* format. Conceptually a *table* is shaped +very much like a DataFrame, with rows and columns. A *table* may be appended to in the same or other sessions. +In addition, delete, query type operations are supported. You can create an index with ```create_table_index``` +after data is already in the table (this may become automatic in the future or an option on appending/putting a *table*). + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('store.h5') + +.. ipython:: python + + store = HDFStore('store.h5') + df1 = df[0:4] + df2 = df[4:] + store.append('df', df1) + store.append('df', df2) + + store.select('df') + + store.create_table_index('df') + store.handle.root.df.table + +.. ipython:: python + :suppress: + + store.close() + import os + os.remove('store.h5') + + +Querying objects stored in Table format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +`select` and `delete` operations have an optional criteria that can be specified to select/delete only +a subset of the data. This allows one to have a very large on-disk table and retrieve only a portion of the data. + +A query is specified using the `Term` class under the hood. + + - 'index' refers to the index of a DataFrame + - 'major_axis' and 'minor_axis' are supported indexers of the Panel + +The following are all valid terms. + +.. code-block:: python + + dict(field = 'index', op = '>', value = '20121114') + ('index', '>', '20121114') + 'index>20121114' + ('index', '>', datetime(2012,11,14)) + + ('index', ['20121114','20121115']) + ('major', Timestamp('2012/11/14')) + ('minor_axis', ['A','B']) + +Queries are built up (currently only *and* is supported) using a list. 
An example query for a panel might be specified as follows: + +.. code-block:: python + + ['major_axis>20121114', ('minor_axis', ['A','B']) ] + +This is roughly translated to: major_axis must be greater than the date 20121114 and the minor_axis must be A or B + +.. ipython:: python + + store = HDFStore('store.h5') + store.append('wp',wp) + store.select('wp',[ 'major_axis>20000102', ('minor_axis', ['A','B']) ]) + +Delete objects stored in Table format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + store.remove('wp', 'index>20000102' ) + store.select('wp') + +.. ipython:: python + :suppress: + + store.close() + import os + os.remove('store.h5') -.. Querying objects stored in Table format -.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Notes & Caveats +~~~~~~~~~~~~~~~ + + - Selection by items (the top level panel dimension) is not possible; you always get all of the items in the returned Panel + - Currently the sizes of the *column* items are governed by the first table creation + (this should be specified at creation time or use the largest available) - otherwise subsequent appends can truncate the column names + - Mixed-Type Panels/DataFrames are not currently supported - coming soon! 
+ - Once a *table* is created its items (Panel) / columns (DataFrame) are fixed; only exactly the same columns can be appended + - Appending to an already existing table will raise an exception if any of the indexers (index,major_axis or minor_axis) are strings + and they would be truncated because the column size is too small (you can pass ```min_itemsize``` to append to provide a larger fixed size + to compensate) + +Performance +~~~~~~~~~~~ + + - To delete a lot of data, it is sometimes better to erase the table and rewrite it (after say an indexing operation) + *PyTables* tends to increase the file size with deletions + - In general it is best to store Panels with the most frequently selected dimension in the minor axis and a time/date like dimension in the major axis + but this is not required, major_axis and minor_axis can be any valid Panel index + - No dimensions are currently indexed automagically (in the *PyTables* sense); these require an explicit call to ```create_table_index``` + - *Tables* offer better performance when compressed after writing them (as opposed to turning on compression at the very beginning) + use the pytables utilities ptrepack to rewrite the file (and also can change compression methods) + - Duplicate rows can be written, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) diff --git a/pandas/core/series.py b/pandas/core/series.py index a798915cb9681..d882a147f5395 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -562,6 +562,44 @@ def _get_values(self, indexer): except Exception: return self.values[indexer] + def where(self, cond, other=nan, inplace=False): + """ + Return a Series where cond is True; otherwise values are from other + + Parameters + ---------- + cond: boolean Series or array + other: scalar or Series + + Returns + ------- + wh: Series + """ + if not hasattr(cond, 'shape'): + raise ValueError('where requires an ndarray like object for its 
' + 'condition') + + if inplace: + self._set_with(~cond, other) + return self + + return self._get_values(cond).reindex_like(self).fillna(other) + + def mask(self, cond): + """ + Returns copy of self whose values are replaced with nan if the + inverted condition is True + + Parameters + ---------- + cond: boolean Series or array + + Returns + ------- + wh: Series + """ + return self.where(~cond, nan) + def __setitem__(self, key, value): try: try: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index af480b5a6457f..bc8967973808e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -7,6 +7,7 @@ from datetime import datetime, date import time +import re import numpy as np from pandas import ( @@ -67,12 +68,20 @@ # oh the troubles to reduce import time _table_mod = None +_table_supports_index = True def _tables(): global _table_mod + global _table_supports_index if _table_mod is None: import tables _table_mod = tables + + # version requirements + major, minor, subv = tables.__version__.split('.') + if major >= 2 and minor >= 3: + _table_supports_index = True + return _table_mod @@ -321,7 +330,7 @@ def select(self, key, where=None): return self._read_group(group, where) def put(self, key, value, table=False, append=False, - compression=None): + compression=None, **kwargs): """ Store object in HDFStore @@ -342,7 +351,7 @@ def put(self, key, value, table=False, append=False, be used. """ self._write_to_group(key, value, table=table, append=append, - comp=compression) + comp=compression, **kwargs) def _get_handler(self, op, kind): return getattr(self, '_%s_%s' % (op, kind)) @@ -370,7 +379,7 @@ def remove(self, key, where=None): if group is not None: self._delete_from_table(group, where) - def append(self, key, value): + def append(self, key, value, **kwargs): """ Append to Table in file. Node must already exist and be Table format. 
@@ -385,10 +394,58 @@ def append(self, key, value): Does *not* check if data being appended overlaps with existing data in the table, so be careful """ - self._write_to_group(key, value, table=True, append=True) + self._write_to_group(key, value, table=True, append=True, **kwargs) + + def create_table_index(self, key, columns = None, optlevel = None, kind = None): + """ + Create a pytables index on the specified columns + note: cannot index Time64Col() currently; PyTables must be >= 2.3.1 + + + Parameters + ---------- + key : object (the node to index) + columns : None or list_like (the columns to index - currently supports index/column) + optlevel: optimization level (defaults to 6) + kind : kind of index (defaults to 'medium') + + Exceptions + ---------- + raises if the node is not a table + + """ + + # version requirements + if not _table_supports_index: + raise Exception("PyTables >= 2.3 is required for table indexing") + + group = getattr(self.handle.root, key, None) + if group is None: return + + if not _is_table_type(group): + raise Exception("cannot create table index on a non-table") + + table = getattr(group, 'table', None) + if table is None: return + + if columns is None: + columns = ['index'] + if not isinstance(columns, (tuple,list)): + columns = [ columns ] + + kw = dict() + if optlevel is not None: + kw['optlevel'] = optlevel + if kind is not None: + kw['kind'] = kind + + for c in columns: + v = getattr(table.cols,c,None) + if v is not None and not v.is_indexed: + v.createIndex(**kw) def _write_to_group(self, key, value, table=False, append=False, - comp=None): + comp=None, **kwargs): root = self.handle.root if key not in root._v_children: group = self.handle.createGroup(root, key) @@ -400,7 +457,7 @@ def _write_to_group(self, key, value, table=False, append=False, kind = '%s_table' % kind handler = self._get_handler(op='write', kind=kind) wrapper = lambda value: handler(group, value, append=append, - comp=comp) + comp=comp, **kwargs) else: if append: raise 
ValueError('Can only append to Tables') @@ -530,7 +587,7 @@ def _read_block_manager(self, group): return BlockManager(blocks, axes) - def _write_frame_table(self, group, df, append=False, comp=None): + def _write_frame_table(self, group, df, append=False, comp=None, **kwargs): mat = df.values values = mat.reshape((1,) + mat.shape) @@ -540,7 +597,7 @@ def _write_frame_table(self, group, df, append=False, comp=None): self._write_table(group, items=['value'], index=df.index, columns=df.columns, - values=values, append=append, compression=comp) + values=values, append=append, compression=comp, **kwargs) def _write_wide(self, group, panel): panel._consolidate_inplace() @@ -549,10 +606,10 @@ def _write_wide(self, group, panel): def _read_wide(self, group, where=None): return Panel(self._read_block_manager(group)) - def _write_wide_table(self, group, panel, append=False, comp=None): + def _write_wide_table(self, group, panel, append=False, comp=None, **kwargs): self._write_table(group, items=panel.items, index=panel.major_axis, columns=panel.minor_axis, values=panel.values, - append=append, compression=comp) + append=append, compression=comp, **kwargs) def _read_wide_table(self, group, where=None): return self._read_panel_table(group, where) @@ -569,10 +626,10 @@ def _write_index(self, group, key, index): self._write_sparse_intindex(group, key, index) else: setattr(group._v_attrs, '%s_variety' % key, 'regular') - converted, kind, _ = _convert_index(index) - self._write_array(group, key, converted) + converted = _convert_index(index).set_name('index') + self._write_array(group, key, converted.values) node = getattr(group, key) - node._v_attrs.kind = kind + node._v_attrs.kind = converted.kind node._v_attrs.name = index.name if isinstance(index, (DatetimeIndex, PeriodIndex)): @@ -629,11 +686,11 @@ def _write_multi_index(self, group, key, index): index.labels, index.names)): # write the level - conv_level, kind, _ = _convert_index(lev) level_key = '%s_level%d' % (key, i) - 
self._write_array(group, level_key, conv_level) + conv_level = _convert_index(lev).set_name(level_key) + self._write_array(group, level_key, conv_level.values) node = getattr(group, level_key) - node._v_attrs.kind = kind + node._v_attrs.kind = conv_level.kind node._v_attrs.name = name # write the name @@ -738,22 +795,28 @@ def _write_array(self, group, key, value): getattr(group, key)._v_attrs.transposed = transposed def _write_table(self, group, items=None, index=None, columns=None, - values=None, append=False, compression=None): + values=None, append=False, compression=None, + min_itemsize = None, **kwargs): """ need to check for conform to the existing table: e.g. columns should match """ - # create dict of types - index_converted, index_kind, index_t = _convert_index(index) - columns_converted, cols_kind, col_t = _convert_index(columns) + + # create Col types + index_converted = _convert_index(index).set_name('index') + columns_converted = _convert_index(columns).set_name('column') # create the table if it doesn't exist (or get it if it does) if not append: if 'table' in group: self.handle.removeNode(group, 'table') + else: + # check that we are not truncating on our indicies + index_converted.maybe_set(min_itemsize = min_itemsize) + columns_converted.maybe_set(min_itemsize = min_itemsize) if 'table' not in group: # create the table - desc = {'index': index_t, - 'column': col_t, + desc = {'index' : index_converted.typ, + 'column': columns_converted.typ, 'values': _tables().FloatCol(shape=(len(values)))} options = {'name': 'table', @@ -775,16 +838,20 @@ def _write_table(self, group, items=None, index=None, columns=None, # the table must already exist table = getattr(group, 'table', None) + # check that we are not truncating on our indicies + index_converted.validate(table) + columns_converted.validate(table) + # check for backwards incompatibility if append: - existing_kind = table._v_attrs.index_kind - if existing_kind != index_kind: + existing_kind = 
getattr(table._v_attrs,'index_kind',None) + if existing_kind is not None and existing_kind != index_converted.kind: raise TypeError("incompatible kind in index [%s - %s]" % - (existing_kind, index_kind)) + (existing_kind, index_converted.kind)) # add kinds - table._v_attrs.index_kind = index_kind - table._v_attrs.columns_kind = cols_kind + table._v_attrs.index_kind = index_converted.kind + table._v_attrs.columns_kind = columns_converted.kind if append: existing_fields = getattr(table._v_attrs, 'fields', None) if (existing_fields is not None and @@ -916,35 +983,90 @@ def _read_panel_table(self, group, where=None): lp = DataFrame(new_values, index=new_index, columns=lp.columns) wp = lp.to_panel() - if sel.column_filter: - new_minor = sorted(set(wp.minor_axis) & sel.column_filter) + if sel.filter: + new_minor = sorted(set(wp.minor_axis) & sel.filter) wp = wp.reindex(minor=new_minor) return wp - def _delete_from_table(self, group, where = None): + def _delete_from_table(self, group, where): + """ delete rows from a group where condition is True """ table = getattr(group, 'table') # create the selection - s = Selection(table, where, table._v_attrs.index_kind) + s = Selection(table,where,table._v_attrs.index_kind) s.select_coords() # delete the rows in reverse order - l = list(s.values) - l.reverse() - for c in l: - table.removeRows(c) - self.handle.flush() - return len(s.values) + l = list(s.values) + ln = len(l) + + if ln: + + # if we can do a consecutive removal - do it! 
+ if l[0]+ln-1 == l[-1]: + table.removeRows(start = l[0], stop = l[-1]+1) + + # one by one + else: + l.reverse() + for c in l: + table.removeRows(c) + + self.handle.flush() + # return the number of rows removed + return ln + +class Col(object): + """ a column description class + + Parameters + ---------- + + values : the ndarray like converted values + kind : a string description of this type + typ : the pytables type + + """ + + def __init__(self, values, kind, typ, itemsize = None, **kwargs): + self.values = values + self.kind = kind + self.typ = typ + self.itemsize = itemsize + self.name = None + + def set_name(self, n): + self.name = n + return self + + def __iter__(self): + return iter(self.values) + + def maybe_set(self, min_itemsize = None, **kwargs): + """ maybe set a string col itemsize """ + if self.kind == 'string' and min_itemsize is not None: + if self.typ.itemsize < min_itemsize: + self.typ = _tables().StringCol(itemsize = min_itemsize, pos = getattr(self.typ,'pos',None)) + + def validate(self, table, **kwargs): + """ validate this column for string truncation (or reset to the max size) """ + if self.kind == 'string': + + # the current column name + t = getattr(table.description,self.name,None) + if t is not None: + if t.itemsize < self.itemsize: + raise Exception("[%s] column has a min_itemsize of [%s] but itemsize [%s] is required!" 
% (self.name,self.itemsize,t.itemsize)) def _convert_index(index): if isinstance(index, DatetimeIndex): converted = index.asi8 - return converted, 'datetime64', _tables().Int64Col() + return Col(converted, 'datetime64', _tables().Int64Col()) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() - return index.values, 'integer', atom + return Col(index.values, 'integer', atom) if isinstance(index, MultiIndex): raise Exception('MultiIndex not supported here!') @@ -955,33 +1077,33 @@ def _convert_index(index): if inferred_type == 'datetime64': converted = values.view('i8') - return converted, 'datetime64', _tables().Int64Col() + return Col(converted, 'datetime64', _tables().Int64Col()) elif inferred_type == 'datetime': converted = np.array([(time.mktime(v.timetuple()) + v.microsecond / 1E6) for v in values], dtype=np.float64) - return converted, 'datetime', _tables().Time64Col() + return Col(converted, 'datetime', _tables().Time64Col()) elif inferred_type == 'date': converted = np.array([time.mktime(v.timetuple()) for v in values], dtype=np.int32) - return converted, 'date', _tables().Time32Col() + return Col(converted, 'date', _tables().Time32Col()) elif inferred_type == 'string': converted = np.array(list(values), dtype=np.str_) itemsize = converted.dtype.itemsize - return converted, 'string', _tables().StringCol(itemsize) + return Col(converted, 'string', _tables().StringCol(itemsize), itemsize = itemsize) elif inferred_type == 'unicode': atom = _tables().ObjectAtom() - return np.asarray(values, dtype='O'), 'object', atom + return Col(np.asarray(values, dtype='O'), 'object', atom) elif inferred_type == 'integer': # take a guess for now, hope the values fit atom = _tables().Int64Col() - return np.asarray(values, dtype=np.int64), 'integer', atom + return Col(np.asarray(values, dtype=np.int64), 'integer', atom) elif inferred_type == 'floating': atom = _tables().Float64Col() - return np.asarray(values, dtype=np.float64), 'float', atom + return 
Col(np.asarray(values, dtype=np.float64), 'float', atom) else: # pragma: no cover atom = _tables().ObjectAtom() - return np.asarray(values, dtype='O'), 'object', atom + return Col(np.asarray(values, dtype='O'), 'object', atom) def _read_array(group, key): @@ -1088,6 +1210,151 @@ def _alias_to_class(alias): return _reverse_index_map.get(alias, Index) +class Term(object): + """ create a term object that holds a field, op, and value + + Parameters + ---------- + field : dict, string term expression, or the field to operate (must be a valid index/column type of DataFrame/Panel) + op : a valid op (defaults to '=') (optional) + >, >=, <, <=, =, != (not equal) are allowed + value : a value or list of values (required) + + Returns + ------- + a Term object + + Examples + -------- + Term(dict(field = 'index', op = '>', value = '20121114')) + Term('index', '20121114') + Term('index', '>', '20121114') + Term('index', ['20121114','20121114']) + Term('index', datetime(2012,11,14)) + Term('major>20121114') + Term('minor', ['A','B']) + + """ + + _ops = ['<','<=','>','>=','=','!='] + _search = re.compile("^(?P<field>\w+)(?P<op>%s)(?P<value>.+)$" % '|'.join(_ops)) + _index = ['index','major_axis','major'] + _column = ['column','minor_axis','minor'] + + def __init__(self, field, op = None, value = None, index_kind = None): + self.field = None + self.op = None + self.value = None + self.index_kind = index_kind + self.filter = None + self.condition = None + + # unpack lists/tuples in field + if isinstance(field,(tuple,list)): + f = field + field = f[0] + if len(f) > 1: + op = f[1] + if len(f) > 2: + value = f[2] + + # backwards compatible + if isinstance(field, dict): + self.field = field.get('field') + self.op = field.get('op') or '=' + self.value = field.get('value') + + # passed a term + elif isinstance(field,Term): + self.field = field.field + self.op = field.op + self.value = field.value + + # a string expression (or just the field) + elif isinstance(field,basestring): + + # is a term is passed + 
s = self._search.match(field) + if s is not None: + self.field = s.group('field') + self.op = s.group('op') + self.value = s.group('value') + + else: + self.field = field + + # is an op passed? + if isinstance(op, basestring) and op in self._ops: + self.op = op + self.value = value + else: + self.op = '=' + self.value = op + + else: + raise Exception("Term does not understand the supplied field [%s]" % field) + + # we have valid fields + if self.field is None or self.op is None or self.value is None: + raise Exception("Could not create this term [%s]" % str(self)) + + # valid field name + if self.field in self._index: + self.field = 'index' + elif self.field in self._column: + self.field = 'column' + else: + raise Exception("field is not a valid index/column for this term [%s]" % str(self)) + + # we have valid conditions + if self.op in ['>','>=','<','<=']: + if hasattr(self.value,'__iter__') and len(self.value) > 1: + raise Exception("an inequality condition cannot have multiple values [%s]" % str(self)) + + if not hasattr(self.value,'__iter__'): + self.value = [ self.value ] + + self.eval() + + def __str__(self): + return "field->%s,op->%s,value->%s" % (self.field,self.op,self.value) + + __repr__ = __str__ + + def eval(self): + """ set the numexpr expression for this term """ + + # convert values + values = [ self.convert_value(v) for v in self.value ] + + # equality conditions + if self.op in ['=','!=']: + + # too many values to create the expression? 
+ if len(values) <= 61: + self.condition = "(%s)" % ' | '.join([ "(%s == %s)" % (self.field,v[0]) for v in values]) + + # use a filter after reading + else: + self.filter = set([ v[1] for v in values ]) + + else: + + self.condition = '(%s %s %s)' % (self.field, self.op, values[0][0]) + + def convert_value(self, v): + + if self.field == 'index': + if self.index_kind == 'datetime64' : + return [lib.Timestamp(v).value, None] + elif isinstance(v, datetime): + return [time.mktime(v.timetuple()), None] + elif not isinstance(v, basestring): + return [str(v), None] + + # string quoting + return ["'" + v + "'", v] + class Selection(object): """ Carries out a selection operation on a tables.Table object. @@ -1095,72 +1362,43 @@ class Selection(object): Parameters ---------- table : tables.Table - where : list of dicts of the following form - - Comparison op - {'field' : 'index', - 'op' : '>=', - 'value' : value} + where : list of Terms (or convertable to) - Match single value - {'field' : 'index', - 'value' : v1} - - Match a set of values - {'field' : 'index', - 'value' : [v1, v2, v3]} """ def __init__(self, table, where=None, index_kind=None): - self.table = table - self.where = where + self.table = table + self.where = where self.index_kind = index_kind - self.column_filter = None - self.the_condition = None - self.conditions = [] - self.values = None - if where: - self.generate(where) + self.values = None + self.condition = None + self.filter = None + self.terms = self.generate(where) + + # create the numexpr & the filter + if self.terms: + conds = [ t.condition for t in self.terms if t.condition is not None ] + if len(conds): + self.condition = "(%s)" % ' & '.join(conds) + self.filter = set() + for t in self.terms: + if t.filter is not None: + self.filter |= t.filter def generate(self, where): - # and condictions - for c in where: - op = c.get('op', None) - value = c['value'] - field = c['field'] - - if field == 'index' and self.index_kind == 'datetime64': - val = 
lib.Timestamp(value).value - self.conditions.append('(%s %s %s)' % (field, op, val)) - elif field == 'index' and isinstance(value, datetime): - value = time.mktime(value.timetuple()) - self.conditions.append('(%s %s %s)' % (field, op, value)) - else: - self.generate_multiple_conditions(op, value, field) + """ generate and return the terms """ + if where is None: return None - if len(self.conditions): - self.the_condition = '(' + ' & '.join(self.conditions) + ')' + if not isinstance(where, (list,tuple)): + where = [ where ] - def generate_multiple_conditions(self, op, value, field): - - if op and op == 'in' or isinstance(value, (list, np.ndarray)): - if len(value) <= 61: - l = '(' + ' | '.join([ "(%s == '%s')" % (field, v) - for v in value]) + ')' - self.conditions.append(l) - else: - self.column_filter = set(value) - else: - if op is None: - op = '==' - self.conditions.append('(%s %s "%s")' % (field, op, value)) + return [ Term(c, index_kind = self.index_kind) for c in where ] def select(self): """ generate the selection """ - if self.the_condition: - self.values = self.table.readWhere(self.the_condition) - + if self.condition is not None: + self.values = self.table.readWhere(self.condition) else: self.values = self.table.read() @@ -1168,7 +1406,7 @@ def select_coords(self): """ generate the selection """ - self.values = self.table.getWhereList(self.the_condition) + self.values = self.table.getWhereList(self.condition) def _get_index_factory(klass): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 9442f274a7810..30bc9d4ed8ba1 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -10,10 +10,11 @@ from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, date_range, Index) -from pandas.io.pytables import HDFStore, get_store +from pandas.io.pytables import HDFStore, get_store, Term import pandas.util.testing as tm from pandas.tests.test_series import assert_series_equal from 
pandas.tests.test_frame import assert_frame_equal +from pandas import concat try: import tables @@ -142,10 +143,48 @@ def test_put_integer(self): def test_append(self): df = tm.makeTimeDataFrame() - self.store.put('c', df[:10], table=True) + self.store.append('c', df[:10]) self.store.append('c', df[10:]) tm.assert_frame_equal(self.store['c'], df) + self.store.put('d', df[:10], table=True) + self.store.append('d', df[10:]) + tm.assert_frame_equal(self.store['d'], df) + + def test_append_with_strings(self): + wp = tm.makePanel() + wp2 = wp.rename_axis(dict([ (x,"%s_extra" % x) for x in wp.minor_axis ]), axis = 2) + + self.store.append('s1', wp, min_itemsize = 20) + self.store.append('s1', wp2) + expected = concat([ wp, wp2], axis = 2) + expected = expected.reindex(minor_axis = sorted(expected.minor_axis)) + tm.assert_panel_equal(self.store['s1'], expected) + + # test truncation of bigger strings + self.store.append('s2', wp) + self.assertRaises(Exception, self.store.append, 's2', wp2) + + def test_create_table_index(self): + wp = tm.makePanel() + self.store.append('p5', wp) + self.store.create_table_index('p5') + + assert(self.store.handle.root.p5.table.cols.index.is_indexed == True) + assert(self.store.handle.root.p5.table.cols.column.is_indexed == False) + + df = tm.makeTimeDataFrame() + self.store.append('f', df[:10]) + self.store.append('f', df[10:]) + self.store.create_table_index('f') + + # create twice + self.store.create_table_index('f') + + # try to index a non-table + self.store.put('f2', df) + self.assertRaises(Exception, self.store.create_table_index, 'f2') + def test_append_diff_item_order(self): wp = tm.makePanel() wp1 = wp.ix[:, :10, :] @@ -177,11 +216,7 @@ def test_remove(self): self.assertEquals(len(self.store), 0) def test_remove_where_not_exist(self): - crit1 = { - 'field' : 'index', - 'op' : '>', - 'value' : 'foo' - } + crit1 = Term('index','>','foo') self.store.remove('a', where=[crit1]) def test_remove_crit(self): @@ -189,21 +224,60 @@ def 
test_remove_crit(self): self.store.put('wp', wp, table=True) date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = { - 'field' : 'index', - 'op' : '>', - 'value' : date - } - crit2 = { - 'field' : 'column', - 'value' : ['A', 'D'] - } + crit1 = Term('index','>',date) + crit2 = Term('column',['A', 'D']) self.store.remove('wp', where=[crit1]) self.store.remove('wp', where=[crit2]) result = self.store['wp'] expected = wp.truncate(after=date).reindex(minor=['B', 'C']) tm.assert_panel_equal(result, expected) + # test non-consecutive row removal + wp = tm.makePanel() + self.store.put('wp2', wp, table=True) + + date1 = wp.major_axis[1:3] + date2 = wp.major_axis[5] + date3 = [wp.major_axis[7],wp.major_axis[9]] + + crit1 = Term('index',date1) + crit2 = Term('index',date2) + crit3 = Term('index',date3) + + self.store.remove('wp2', where=[crit1]) + self.store.remove('wp2', where=[crit2]) + self.store.remove('wp2', where=[crit3]) + result = self.store['wp2'] + + ma = list(wp.major_axis) + for d in date1: + ma.remove(d) + ma.remove(date2) + for d in date3: + ma.remove(d) + expected = wp.reindex(major = ma) + tm.assert_panel_equal(result, expected) + + def test_terms(self): + + Term(dict(field = 'index', op = '>', value = '20121114')) + Term('index', '20121114') + Term('index', '>', '20121114') + Term('index', ['20121114','20121114']) + Term('index', datetime(2012,11,14)) + Term('index>20121114') + Term('major>20121114') + Term('major_axis>20121114') + Term('minor', ['A','B']) + Term('minor_axis', ['A','B']) + Term('column', ['A','B']) + + self.assertRaises(Exception, Term.__init__) + self.assertRaises(Exception, Term.__init__, 'blah') + self.assertRaises(Exception, Term.__init__, 'index') + self.assertRaises(Exception, Term.__init__, 'index', '==') + self.assertRaises(Exception, Term.__init__, 'index', '>', 5) + def test_series(self): s = tm.makeStringSeries() self._check_roundtrip(s, tm.assert_series_equal) @@ -528,15 +602,8 @@ def test_panel_select(self): 
self.store.put('wp', wp, table=True) date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = { - 'field' : 'index', - 'op' : '>=', - 'value' : date - } - crit2 = { - 'field' : 'column', - 'value' : ['A', 'D'] - } + crit1 = ('index','>=',date) + crit2 = ('column', ['A', 'D']) result = self.store.select('wp', [crit1, crit2]) expected = wp.truncate(before=date).reindex(minor=['A', 'D']) @@ -547,19 +614,9 @@ def test_frame_select(self): self.store.put('frame', df, table=True) date = df.index[len(df) // 2] - crit1 = { - 'field' : 'index', - 'op' : '>=', - 'value' : date - } - crit2 = { - 'field' : 'column', - 'value' : ['A', 'D'] - } - crit3 = { - 'field' : 'column', - 'value' : 'A' - } + crit1 = ('index','>=',date) + crit2 = ('column',['A', 'D']) + crit3 = ('column','A') result = self.store.select('frame', [crit1, crit2]) expected = df.ix[date:, ['A', 'D']] @@ -580,10 +637,7 @@ def test_select_filter_corner(self): df.columns = ['%.3d' % c for c in df.columns] self.store.put('frame', df, table=True) - crit = { - 'field' : 'column', - 'value' : df.columns[:75] - } + crit = Term('column', df.columns[:75]) result = self.store.select('frame', [crit]) tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 7422c925fd657..a48e66d38b1c4 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -939,6 +939,37 @@ def test_ix_getitem_iterator(self): result = self.series.ix[idx] assert_series_equal(result, self.series[:10]) + def test_where(self): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(cond).dropna() + rs2 = s[cond] + assert_series_equal(rs, rs2) + + rs = s.where(cond,-s) + assert_series_equal(rs, s.abs()) + + rs = s.where(cond) + assert(s.shape == rs.shape) + + self.assertRaises(ValueError, s.where, 1) + + def test_where_inplace(self): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.copy() + rs.where(cond,inplace=True) + 
assert_series_equal(rs.dropna(), s[cond]) + + def test_mask(self): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(cond, np.nan) + assert_series_equal(rs, s.mask(~cond)) + def test_ix_setitem(self): inds = self.series.index[[3,4,7]]