From 3b690d0dc0c68347d06decf4ea58d4d1ce61d50a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 15 Nov 2012 13:57:48 -0500 Subject: [PATCH 1/7] changes in pandas/io/pytables.py 1. added __str__ (to do __repr__) 2. row removal in tables is much faster if rows are consecutive 3. added Term class, refactored Selection (this is backdwards compatible) Term is a concise way of specifying conditions for queries, e.g. Term(dict(field = 'index', op = '>', value = '20121114')) Term('index', '20121114') Term('index', '>', '20121114') Term('index', ['20121114','20121114']) Term('index', datetime(2012,11,14)) Term('index>20121114') updated tests for same this should close GH #1996 --- pandas/io/pytables.py | 258 +++++++++++++++++++++++-------- pandas/io/tests/test_pytables.py | 92 ++++++----- 2 files changed, 245 insertions(+), 105 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index af480b5a6457f..224ac82237dfd 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -7,6 +7,7 @@ from datetime import datetime, date import time +import re import numpy as np from pandas import ( @@ -916,27 +917,40 @@ def _read_panel_table(self, group, where=None): lp = DataFrame(new_values, index=new_index, columns=lp.columns) wp = lp.to_panel() - if sel.column_filter: - new_minor = sorted(set(wp.minor_axis) & sel.column_filter) + if sel.filter: + new_minor = sorted(set(wp.minor_axis) & sel.filter) wp = wp.reindex(minor=new_minor) return wp - def _delete_from_table(self, group, where = None): + def _delete_from_table(self, group, where): + """ delete rows from a group where condition is True """ table = getattr(group, 'table') # create the selection - s = Selection(table, where, table._v_attrs.index_kind) + s = Selection(table,where,table._v_attrs.index_kind) s.select_coords() # delete the rows in reverse order - l = list(s.values) - l.reverse() - for c in l: - table.removeRows(c) - self.handle.flush() - return len(s.values) + l = list(s.values) + ln = len(l) + + if ln: + + # if we can do a consecutive removal - do it! + if l[0]+ln-1 == l[-1]: + table.removeRows(start = l[0], stop = l[-1]+1) + # one by one + else: + l.reverse() + for c in l: + table.removeRows(c) + + self.handle.flush() + + # return the number of rows removed + return ln def _convert_index(index): if isinstance(index, DatetimeIndex): @@ -1088,6 +1102,151 @@ def _alias_to_class(alias): return _reverse_index_map.get(alias, Index) +class Term(object): + """ create a term object that holds a field, op, and value + + Parameters + ---------- + field : dict, string term expression, or the field to operate (must be a valid index/column type of DataFrame/Panel) + op : a valid op (defaults to '=') (optional) + >, >=, <, <=, =, != (not equal) are allowed + value : a value or list of values (required) + + Returns + ------- + a Term object + + Examples + -------- + Term(dict(field = 'index', op = '>', value = '20121114')) + Term('index', '20121114') + Term('index', '>', '20121114') + Term('index', ['20121114','20121114']) + Term('index', datetime(2012,11,14)) + Term('index>20121114') + + """ + + _ops = ['<','<=','>','>=','=','!='] + _search = re.compile("^(?P\w+)(?P%s)(?P.+)$" % '|'.join(_ops)) + _index = ['index','major_axis'] + _column = ['column','minor_axis','items'] + + def __init__(self, field, op = None, value = None, index_kind = None): + self.field = None + self.op = None + self.value = None + self.typ = None + self.index_kind = index_kind + self.filter = None + self.condition = None + + # unpack lists/tuples in field + if isinstance(field,(tuple,list)): + f = field + field = f[0] + if len(f) > 1: + op = f[1] + if len(f) > 2: + value = f[2] + + # backwards compatible + if isinstance(field, dict): + self.field = field.get('field') + self.op = field.get('op') or '=' + self.value = field.get('value') + + # passed a term + elif isinstance(field,Term): + self.field = field.field + self.op = field.op + self.value = field.value + + # a string expression (or just the field) + elif isinstance(field,basestring): + + # is a term is passed + s = self._search.match(field) + if s is not None: + self.field = s.group('field') + self.op = s.group('op') + self.value = s.group('value') + + else: + self.field = field + + # is an op passed? + if isinstance(op, basestring) and op in self._ops: + self.op = op + self.value = value + else: + self.op = '=' + self.value = op + + else: + raise Exception("Term does not understand the supplied field [%s]" % field) + + # we have valid fields + if self.field is None or self.op is None or self.value is None: + raise Exception("Could not create this term [%s]" % str(self)) + + # valid field name + if self.field in self._index: + self.typ = 'index' + elif self.field in self._column: + self.typ = 'column' + else: + raise Exception("field is not a valid index/column for this term [%s]" % str(self)) + + # we have valid conditions + if self.op in ['>','>=','<','<=']: + if hasattr(self.value,'__iter__') and len(self.value) > 1: + raise Exception("an inequality condition cannot have multiple values [%s]" % str(self)) + + if not hasattr(self.value,'__iter__'): + self.value = [ self.value ] + + self.eval() + + def __str__(self): + return "typ->%s,field->%s,op->%s,value->%s" % (self.typ,self.field,self.op,self.value) + + __repr__ = __str__ + + def eval(self): + """ set the numexpr expression for this term """ + + # convert values + values = [ self.convert_value(v) for v in self.value ] + + # equality conditions + if self.op in ['=','!=']: + + # too many values to create the expression? + if len(values) <= 61: + self.condition = "(%s)" % ' | '.join([ "(%s == %s)" % (self.field,v[0]) for v in values]) + + # use a filter after reading + else: + self.filter = set([ v[1] for v in values ]) + + else: + + self.condition = '(%s %s %s)' % (self.field, self.op, values[0][0]) + + def convert_value(self, v): + + if self.typ == 'index': + if self.index_kind == 'datetime64' : + return [lib.Timestamp(v).value, None] + elif isinstance(v, datetime): + return [time.mktime(v.timetuple()), None] + elif not isinstance(v, basestring): + return [str(v), None] + + # string quoting + return ["'" + v + "'", v] + class Selection(object): """ Carries out a selection operation on a tables.Table object. @@ -1095,72 +1254,43 @@ class Selection(object): Parameters ---------- table : tables.Table - where : list of dicts of the following form - - Comparison op - {'field' : 'index', - 'op' : '>=', - 'value' : value} - - Match single value - {'field' : 'index', - 'value' : v1} + where : list of Terms (or convertable to) - Match a set of values - {'field' : 'index', - 'value' : [v1, v2, v3]} """ def __init__(self, table, where=None, index_kind=None): - self.table = table - self.where = where + self.table = table + self.where = where self.index_kind = index_kind - self.column_filter = None - self.the_condition = None - self.conditions = [] - self.values = None - if where: - self.generate(where) + self.values = None + self.condition = None + self.filter = None + self.terms = self.generate(where) + + # create the numexpr & the filter + if self.terms: + conds = [ t.condition for t in self.terms if t.condition is not None ] + if len(conds): + self.condition = "(%s)" % ' & '.join(conds) + self.filter = set() + for t in self.terms: + if t.filter is not None: + self.filter |= t.filter def generate(self, where): - # and condictions - for c in where: - op = c.get('op', None) - value = c['value'] - field = c['field'] - - if field == 'index' and self.index_kind == 'datetime64': - val = lib.Timestamp(value).value - self.conditions.append('(%s %s %s)' % (field, op, val)) - elif field == 'index' and isinstance(value, datetime): - value = time.mktime(value.timetuple()) - self.conditions.append('(%s %s %s)' % (field, op, value)) - else: - self.generate_multiple_conditions(op, value, field) + """ generate and return the terms """ + if where is None: return None - if len(self.conditions): - self.the_condition = '(' + ' & '.join(self.conditions) + ')' + if not isinstance(where, (list,tuple)): + where = [ where ] - def generate_multiple_conditions(self, op, value, field): - - if op and op == 'in' or isinstance(value, (list, np.ndarray)): - if len(value) <= 61: - l = '(' + ' | '.join([ "(%s == '%s')" % (field, v) - for v in value]) + ')' - self.conditions.append(l) - else: - self.column_filter = set(value) - else: - if op is None: - op = '==' - self.conditions.append('(%s %s "%s")' % (field, op, value)) + return [ Term(c, index_kind = self.index_kind) for c in where ] def select(self): """ generate the selection """ - if self.the_condition: - self.values = self.table.readWhere(self.the_condition) - + if self.condition is not None: + self.values = self.table.readWhere(self.condition) else: self.values = self.table.read() @@ -1168,7 +1298,7 @@ def select_coords(self): """ generate the selection """ - self.values = self.table.getWhereList(self.the_condition) + self.values = self.table.getWhereList(self.condition) def _get_index_factory(klass): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 9442f274a7810..3ccab8616127a 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -10,7 +10,7 @@ from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, date_range, Index) -from pandas.io.pytables import HDFStore, get_store +from pandas.io.pytables import HDFStore, get_store, Term import pandas.util.testing as tm from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal @@ -177,11 +177,7 @@ def test_remove(self): self.assertEquals(len(self.store), 0) def test_remove_where_not_exist(self): - crit1 = { - 'field' : 'index', - 'op' : '>', - 'value' : 'foo' - } + crit1 = Term('index','>','foo') self.store.remove('a', where=[crit1]) def test_remove_crit(self): @@ -189,21 +185,55 @@ def test_remove_crit(self): self.store.put('wp', wp, table=True) date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = { - 'field' : 'index', - 'op' : '>', - 'value' : date - } - crit2 = { - 'field' : 'column', - 'value' : ['A', 'D'] - } + crit1 = Term('index','>',date) + crit2 = Term('column',['A', 'D']) self.store.remove('wp', where=[crit1]) self.store.remove('wp', where=[crit2]) result = self.store['wp'] expected = wp.truncate(after=date).reindex(minor=['B', 'C']) tm.assert_panel_equal(result, expected) + # test non-consecutive row removal + wp = tm.makePanel() + self.store.put('wp2', wp, table=True) + + date1 = wp.major_axis[1:3] + date2 = wp.major_axis[5] + date3 = [wp.major_axis[7],wp.major_axis[9]] + + crit1 = Term('index',date1) + crit2 = Term('index',date2) + crit3 = Term('index',date3) + + self.store.remove('wp2', where=[crit1]) + self.store.remove('wp2', where=[crit2]) + self.store.remove('wp2', where=[crit3]) + result = self.store['wp2'] + + ma = list(wp.major_axis) + for d in date1: + ma.remove(d) + ma.remove(date2) + for d in date3: + ma.remove(d) + expected = wp.reindex(major = ma) + tm.assert_panel_equal(result, expected) + + def test_terms(self): + + Term(dict(field = 'index', op = '>', value = '20121114')) + Term('index', '20121114') + Term('index', '>', '20121114') + Term('index', ['20121114','20121114']) + Term('index', datetime(2012,11,14)) + Term('index>20121114') + + self.assertRaises(Exception, Term.__init__) + self.assertRaises(Exception, Term.__init__, 'blah') + self.assertRaises(Exception, Term.__init__, 'index') + self.assertRaises(Exception, Term.__init__, 'index', '==') + self.assertRaises(Exception, Term.__init__, 'index', '>', 5) + def test_series(self): s = tm.makeStringSeries() self._check_roundtrip(s, tm.assert_series_equal) @@ -528,15 +558,8 @@ def test_panel_select(self): self.store.put('wp', wp, table=True) date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = { - 'field' : 'index', - 'op' : '>=', - 'value' : date - } - crit2 = { - 'field' : 'column', - 'value' : ['A', 'D'] - } + crit1 = ('index','>=',date) + crit2 = ('column', ['A', 'D']) result = self.store.select('wp', [crit1, crit2]) expected = wp.truncate(before=date).reindex(minor=['A', 'D']) @@ -547,19 +570,9 @@ def test_frame_select(self): self.store.put('frame', df, table=True) date = df.index[len(df) // 2] - crit1 = { - 'field' : 'index', - 'op' : '>=', - 'value' : date - } - crit2 = { - 'field' : 'column', - 'value' : ['A', 'D'] - } - crit3 = { - 'field' : 'column', - 'value' : 'A' - } + crit1 = ('index','>=',date) + crit2 = ('column',['A', 'D']) + crit3 = ('column','A') result = self.store.select('frame', [crit1, crit2]) expected = df.ix[date:, ['A', 'D']] @@ -580,10 +593,7 @@ def test_select_filter_corner(self): df.columns = ['%.3d' % c for c in df.columns] self.store.put('frame', df, table=True) - crit = { - 'field' : 'column', - 'value' : df.columns[:75] - } + crit = Term('column', df.columns[:75]) result = self.store.select('frame', [crit]) tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) From 0fcae823d658b23e448c1b0c733ffff850a5d4ab Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 15 Nov 2012 15:13:32 -0500 Subject: [PATCH 2/7] update the HDF5 documentation to support table operations in io.html --- doc/source/io.rst | 100 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 4 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index f74120ad7ef57..3962176e10fc9 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1,3 +1,4 @@ + .. _io: .. currentmodule:: pandas @@ -812,8 +813,99 @@ In a current or later Python session, you can retrieve stored objects: os.remove('store.h5') -.. Storing in Table format -.. ~~~~~~~~~~~~~~~~~~~~~~~ +Storing in Table format +~~~~~~~~~~~~~~~~~~~~~~~ + +```HDFStore``` supports another *PyTables* format on disk, the *table* format. Conceptually a *table* is shaped +very much like a DataFrame, with rows and columns. A *table* may be appended to in the same or other sessions. +In addition, delete and query type operations are supported. + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('store.h5') + +.. ipython:: python + + store = HDFStore('store.h5') + df1 = df[0:4] + df2 = df[4:] + store.put('df', df1, table=True) + store.append('df', df2) + + store.select('df') + +.. ipython:: python + :suppress: + + store.close() + import os + os.remove('store.h5') + + +Querying objects stored in Table format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +`select` and `delete` operations have an optional criteria that can be specified to select/delete only +a subset of the data. This allows one to have very large on-table disks and retrieve only a portion of the data. + +A query is specified using the `Term` class under the hood. + + - 'index' refers to the index of a DataFrame (or major_axis of the Panel) + - 'column' refers to the minor_axis of the Panel (and is not needed for a DataFrame) + +The following are all valid terms. + +.. code-block:: python + + dict(field = 'index', op = '>', value = '20121114') + ('index', '20121114') + ('index', '>', '20121114') + ('index', ['20121114','20121114']) + ('index', datetime(2012,11,14)) + 'index>20121114' + ('column', ['A','B']) + +Queries are built up (currently only *and* is supported) using a list. An example query for a panel might be specified as follows: + +.. code-block:: python + + ['index>20121114', ('column', ['A','B']) ] + +This is roughly translated to: index must be greater than the date 20121114 and the column must be A or B + +.. ipython:: python + + store = HDFStore('store.h5') + store.put('wp',wp,table=True) + store.select('wp',[ 'index>20000102', ('column', ['A','B']) ]) + +Delete objects stored in Table format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + store.remove('wp', 'index>20000102' ) + store.select('wp') + +.. ipython:: python + :suppress: + + store.close() + import os + os.remove('store.h5') -.. Querying objects stored in Table format -.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Notes & Caveats +~~~~~~~~~~~~~~~ + + - Selection by items (the top level panel dimension) is not possible; you always get all of the items in the returned Panel + - Mixed-Type Panels/DataFrames are not currently supported - coming soon! + - Once a *table* is created its items (Panel) / columns (DataFrame) are fixed; only exactly the same columns can be appended + - To delete a lot of data, it is sometimes better to erase the table and rewrite it (after say an indexing operation) + *PyTables* tends to increase the file size with deletions + - In general it is best to store Panels with the most frequently selected dimension in the minor axis and a time/date like dimension in the major axis + - No dimensions are currently indexed (in the *PyTables* sense) - but coming soon! + - *Tables* offer better performance when compressed after writing them (as opposed to turning on compression at the very beginning) + use the pytables utilities ptrepack to rewrite the file (and also can change compression methods) + - Duplicate rows can be written, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) From 72f557e6352bcc9861824fcfb44dc10ab99347ab Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 15 Nov 2012 15:30:45 -0500 Subject: [PATCH 3/7] a store would fail if appending but the a put had not been done before (see test_append) this the result of incompatibility testing on the index_kind --- pandas/io/pytables.py | 4 ++-- pandas/io/tests/test_pytables.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 224ac82237dfd..a1693f547d03a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -778,8 +778,8 @@ def _write_table(self, group, items=None, index=None, columns=None, # check for backwards incompatibility if append: - existing_kind = table._v_attrs.index_kind - if existing_kind != index_kind: + existing_kind = getattr(table._v_attrs,'index_kind',None) + if existing_kind is not None and existing_kind != index_kind: raise TypeError("incompatible kind in index [%s - %s]" % (existing_kind, index_kind)) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 3ccab8616127a..10ad3277bb94a 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -142,10 +142,14 @@ def test_put_integer(self): def test_append(self): df = tm.makeTimeDataFrame() - self.store.put('c', df[:10], table=True) + self.store.append('c', df[:10]) self.store.append('c', df[10:]) tm.assert_frame_equal(self.store['c'], df) + self.store.put('d', df[:10], table=True) + self.store.append('d', df[10:]) + tm.assert_frame_equal(self.store['d'], df) + def test_append_diff_item_order(self): wp = tm.makePanel() wp1 = wp.ix[:, :10, :] From a1956cb8f0f5dc02674deee4a079e653892ad0c3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 15 Nov 2012 16:45:58 -0500 Subject: [PATCH 4/7] added create_table_index to index tables think about doing this automagically for tables --- doc/source/conf.py | 1 + doc/source/io.rst | 4 +-- pandas/io/pytables.py | 49 ++++++++++++++++++++++++++++++++ pandas/io/tests/test_pytables.py | 20 +++++++++++++ 4 files changed, 72 insertions(+), 2 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 692c7757ee17c..3992c6da172a1 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -16,6 +16,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.append(os.path.abspath('.')) +sys.path.insert(0,'/home/jreback/pandas') sys.path.insert(0, os.path.abspath('../sphinxext')) sys.path.extend([ diff --git a/doc/source/io.rst b/doc/source/io.rst index 3962176e10fc9..f0fc9f9db2655 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -831,7 +831,7 @@ In addition, delete and query type operations are supported. store = HDFStore('store.h5') df1 = df[0:4] df2 = df[4:] - store.put('df', df1, table=True) + store.append('df', df1) store.append('df', df2) store.select('df') @@ -878,7 +878,7 @@ This is roughly translated to: index must be greater than the date 20121114 and .. ipython:: python store = HDFStore('store.h5') - store.put('wp',wp,table=True) + store.append('wp',wp) store.select('wp',[ 'index>20000102', ('column', ['A','B']) ]) Delete objects stored in Table format diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a1693f547d03a..2e995722181f6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -388,6 +388,55 @@ def append(self, key, value): """ self._write_to_group(key, value, table=True, append=True) + def create_table_index(self, key, columns = None, optlevel = None, kind = None): + """ + Create a pytables index on the specified columns + note: cannot index Time64Col() currently; PyTables must be >= 2.3.1 + + + Paramaters + ---------- + key : object (the node to index) + columns : None or list_like (the columns to index - currently supports index/column) + optlevel: optimization level (defaults to 6) + kind : kind of index (defaults to 'medium') + + Exceptions + ---------- + raises if the node is not a table + + """ + + # version requirements + major, minor, subv = _tables().__version__.split('.') + if major < 2 and minor < 3 and subv < 1: + raise("PyTables >= 2.3.1 is required for table indexing") + + group = getattr(self.handle.root, key, None) + if group is None: return + + if not _is_table_type(group): + raise Exception("cannot create table index on a non-table") + + table = getattr(group, 'table', None) + if table is None: return + + if columns is None: + columns = ['index'] + if not isinstance(columns, (tuple,list)): + columns = [ columns ] + + kw = dict() + if optlevel is not None: + kw['optlevel'] = optlevel + if kind is not None: + kw['kind'] = kind + + for c in columns: + v = getattr(table.cols,c,None) + if v is not None and not v.is_indexed: + v.createIndex(**kw) + def _write_to_group(self, key, value, table=False, append=False, comp=None): root = self.handle.root diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 10ad3277bb94a..3bca6ececce3a 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -150,6 +150,26 @@ def test_append(self): self.store.append('d', df[10:]) tm.assert_frame_equal(self.store['d'], df) + def test_create_table_index(self): + wp = tm.makePanel() + self.store.append('p5', wp) + self.store.create_table_index('p5') + + assert(self.store.handle.root.p5.table.cols.index.is_indexed == True) + assert(self.store.handle.root.p5.table.cols.column.is_indexed == False) + + df = tm.makeTimeDataFrame() + self.store.append('f', df[:10]) + self.store.append('f', df[10:]) + self.store.create_table_index('f') + + # create twice + self.store.create_table_index('f') + + # try to index a non-table + self.store.put('f2', df) + self.assertRaises(Exception, self.store.create_table_index, 'f2') + def test_append_diff_item_order(self): wp = tm.makePanel() wp1 = wp.ix[:, :10, :] From f619462cad2928c33462d64b754461142d14ec2b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 15 Nov 2012 16:59:40 -0500 Subject: [PATCH 5/7] updated io.rst with some typos and docs for indicies --- doc/source/io.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index f0fc9f9db2655..25475c8a1f850 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -818,7 +818,8 @@ Storing in Table format ```HDFStore``` supports another *PyTables* format on disk, the *table* format. Conceptually a *table* is shaped very much like a DataFrame, with rows and columns. A *table* may be appended to in the same or other sessions. -In addition, delete and query type operations are supported. +In addition, delete, query type operations are supported. You can create an index with ```create_table_index``` +after data is already in the table (this may become automatic in the future or an option on appending/putting a *table*). .. ipython:: python :suppress: @@ -836,6 +837,9 @@ In addition, delete and query type operations are supported. store.select('df') + store.create_table_index('df') + store.handle.root.df.table + .. ipython:: python :suppress: @@ -848,7 +852,7 @@ Querying objects stored in Table format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ `select` and `delete` operations have an optional criteria that can be specified to select/delete only -a subset of the data. This allows one to have very large on-table disks and retrieve only a portion of the data. +a subset of the data. This allows one to have a very large on-disk table and retrieve only a portion of the data. A query is specified using the `Term` class under the hood. @@ -900,6 +904,8 @@ Notes & Caveats ~~~~~~~~~~~~~~~ - Selection by items (the top level panel dimension) is not possible; you always get all of the items in the returned Panel + - Currently the sizes of the *column* items are governed by the first table creation + (this should be specified at creation time or use the largest available) - otherwise subsequent appends can truncate the column names - Mixed-Type Panels/DataFrames are not currently supported - coming soon! - Once a *table* is created its items (Panel) / columns (DataFrame) are fixed; only exactly the same columns can be appended - To delete a lot of data, it is sometimes better to erase the table and rewrite it (after say an indexing operation) From fadcdd1bd340f3376c91990b2fa4e19ac7b1099a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 15 Nov 2012 23:39:40 -0500 Subject: [PATCH 6/7] added min_itemsize parameter and checks in pytables to allow setting of index columns minimum size changed pytables version test for indexing around a bit added Col class to manage the column conversions added alias to the Term class; you can specify the nomial indexers (e.g. index in DataFrame, major_axis/minor_axis or alias in Panel) updated docs for pytables to reflect these changes updated docs for indexing to incorporate whatsnew 0.9.1 for where and mask --- doc/source/conf.py | 1 - doc/source/indexing.rst | 43 +++++++-- doc/source/io.rst | 31 ++++--- pandas/io/pytables.py | 153 +++++++++++++++++++++---------- pandas/io/tests/test_pytables.py | 20 ++++ 5 files changed, 181 insertions(+), 67 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 3992c6da172a1..692c7757ee17c 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -16,7 +16,6 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.append(os.path.abspath('.')) -sys.path.insert(0,'/home/jreback/pandas') sys.path.insert(0, os.path.abspath('../sphinxext')) sys.path.extend([ diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index a77e2c928abfa..6eb141930f274 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -231,22 +231,49 @@ Note, with the :ref:`advanced indexing ` ``ix`` method, you may select along more than one axis using boolean vectors combined with other indexing expressions. -Indexing a DataFrame with a boolean DataFrame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Where and Masking +~~~~~~~~~~~~~~~~~ -You may wish to set values on a DataFrame based on some boolean criteria -derived from itself or another DataFrame or set of DataFrames. This can be done -intuitively like so: +Selecting values from a DataFrame is accomplished in a similar manner to a Series. +You index the Frame with a boolean DataFrame of the same size. This is accomplished +via the method `where` under the hood. The returned view of the DataFrame is the +same size as the original. + +.. ipython:: python + + df < 0 + df[df < 0] + +In addition, `where` takes an optional `other` argument for replacement in the +returned copy. + +.. ipython:: python + + df.where(df < 0, -df) + +You may wish to set values on a DataFrame based on some boolean criteria. +This can be done intuitively like so: .. ipython:: python df2 = df.copy() - df2 < 0 df2[df2 < 0] = 0 df2 -Note that such an operation requires that the boolean DataFrame is indexed -exactly the same. +Furthermore, `where` aligns the input boolean condition (ndarray or DataFrame), such that partial selection +with setting is possible. This is analagous to partial setting via `.ix` (but on the contents rather than the axis labels) + +.. ipython:: python + + df2 = df.copy() + df2[ df2[1:4] > 0 ] = 3 + df2 + +`DataFrame.mask` is the inverse boolean operation of `where`. + +.. ipython:: python + + df.mask(df >= 0) Take Methods diff --git a/doc/source/io.rst b/doc/source/io.rst index 25475c8a1f850..76bd123acf8aa 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -856,34 +856,35 @@ a subset of the data. This allows one to have a very large on-disk table and ret A query is specified using the `Term` class under the hood. - - 'index' refers to the index of a DataFrame (or major_axis of the Panel) - - 'column' refers to the minor_axis of the Panel (and is not needed for a DataFrame) + - 'index' refers to the index of a DataFrame + - 'major_axis' and 'minor_axis' are supported indexers of the Panel The following are all valid terms. .. code-block:: python dict(field = 'index', op = '>', value = '20121114') - ('index', '20121114') ('index', '>', '20121114') - ('index', ['20121114','20121114']) - ('index', datetime(2012,11,14)) 'index>20121114' - ('column', ['A','B']) + ('index', '>', datetime(2012,11,14)) + + ('index', ['20121114','20121115']) + ('major', Timestamp('2012/11/14')) + ('minor_axis', ['A','B']) Queries are built up (currently only *and* is supported) using a list. An example query for a panel might be specified as follows: .. code-block:: python - ['index>20121114', ('column', ['A','B']) ] + ['major_axis>20121114', ('minor_axis', ['A','B']) ] -This is roughly translated to: index must be greater than the date 20121114 and the column must be A or B +This is roughly translated to: major_axis must be greater than the date 20121114 and the minor_axis must be A or B .. ipython:: python store = HDFStore('store.h5') store.append('wp',wp) - store.select('wp',[ 'index>20000102', ('column', ['A','B']) ]) + store.select('wp',[ 'major_axis>20000102', ('minor_axis', ['A','B']) ]) Delete objects stored in Table format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -908,10 +909,18 @@ Notes & Caveats (this should be specified at creation time or use the largest available) - otherwise subsequent appends can truncate the column names - Mixed-Type Panels/DataFrames are not currently supported - coming soon! - Once a *table* is created its items (Panel) / columns (DataFrame) are fixed; only exactly the same columns can be appended + - Appending to an already existing table will raise an exception if any of the indexers (index,major_axis or minor_axis) are strings + and they would be truncated because the column size is too small (you can pass ```min_itemsize``` to append to provide a larger fixed size + to compensate) + +Performance +~~~~~~~~~~~ + - To delete a lot of data, it is sometimes better to erase the table and rewrite it (after say an indexing operation) *PyTables* tends to increase the file size with deletions - - In general it is best to store Panels with the most frequently selected dimension in the minor axis and a time/date like dimension in the major axis - - No dimensions are currently indexed (in the *PyTables* sense) - but coming soon! + - In general it is best to store Panels with the most frequently selected dimension in the minor axis and a time/date like dimension in the major axis + but this is not required, major_axis and minor_axis can be any valid Panel index + - No dimensions are currently indexed automagically (in the *PyTables* sense); these require an explict call to ```create_table_index``` - *Tables* offer better performance when compressed after writing them (as opposed to turning on compression at the very beginning) use the pytables utilities ptrepack to rewrite the file (and also can change compression methods) - Duplicate rows can be written, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2e995722181f6..bc8967973808e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -68,12 +68,20 @@ # oh the troubles to reduce import time _table_mod = None +_table_supports_index = True def _tables(): global _table_mod + global _table_supports_index if _table_mod is None: import tables _table_mod = tables + + # version requirements + major, minor, subv = tables.__version__.split('.') + if major >= 2 and minor >= 3: + _table_supports_index = True + return _table_mod @@ -322,7 +330,7 @@ def select(self, key, where=None): return self._read_group(group, where) def put(self, key, value, table=False, append=False, - compression=None): + compression=None, **kwargs): """ Store object in HDFStore @@ -343,7 +351,7 @@ def put(self, key, value, table=False, append=False, be used. """ self._write_to_group(key, value, table=table, append=append, - comp=compression) + comp=compression, **kwargs) def _get_handler(self, op, kind): return getattr(self, '_%s_%s' % (op, kind)) @@ -371,7 +379,7 @@ def remove(self, key, where=None): if group is not None: self._delete_from_table(group, where) - def append(self, key, value): + def append(self, key, value, **kwargs): """ Append to Table in file. Node must already exist and be Table format. @@ -386,7 +394,7 @@ def append(self, key, value): Does *not* check if data being appended overlaps with existing data in the table, so be careful """ - self._write_to_group(key, value, table=True, append=True) + self._write_to_group(key, value, table=True, append=True, **kwargs) def create_table_index(self, key, columns = None, optlevel = None, kind = None): """ @@ -408,9 +416,8 @@ def create_table_index(self, key, columns = None, optlevel = None, kind = None): """ # version requirements - major, minor, subv = _tables().__version__.split('.') - if major < 2 and minor < 3 and subv < 1: - raise("PyTables >= 2.3.1 is required for table indexing") + if not _table_supports_index: + raise("PyTables >= 2.3 is required for table indexing") group = getattr(self.handle.root, key, None) if group is None: return @@ -438,7 +445,7 @@ def create_table_index(self, key, columns = None, optlevel = None, kind = None): v.createIndex(**kw) def _write_to_group(self, key, value, table=False, append=False, - comp=None): + comp=None, **kwargs): root = self.handle.root if key not in root._v_children: group = self.handle.createGroup(root, key) @@ -450,7 +457,7 @@ def _write_to_group(self, key, value, table=False, append=False, kind = '%s_table' % kind handler = self._get_handler(op='write', kind=kind) wrapper = lambda value: handler(group, value, append=append, - comp=comp) + comp=comp, **kwargs) else: if append: raise ValueError('Can only append to Tables') @@ -580,7 +587,7 @@ def _read_block_manager(self, group): return BlockManager(blocks, axes) - def _write_frame_table(self, group, df, append=False, comp=None): + def _write_frame_table(self, group, df, append=False, comp=None, **kwargs): mat = df.values values = mat.reshape((1,) + mat.shape) @@ -590,7 +597,7 @@ def _write_frame_table(self, group, df, append=False, comp=None): self._write_table(group, items=['value'], index=df.index, columns=df.columns, - values=values, append=append, compression=comp) + values=values, append=append, compression=comp, **kwargs) def _write_wide(self, group, panel): panel._consolidate_inplace() @@ -599,10 +606,10 @@ def _write_wide(self, group, panel): def _read_wide(self, group, where=None): return Panel(self._read_block_manager(group)) - def _write_wide_table(self, group, panel, append=False, comp=None): + def _write_wide_table(self, group, panel, append=False, comp=None, **kwargs): self._write_table(group, items=panel.items, index=panel.major_axis, columns=panel.minor_axis, values=panel.values, - append=append, compression=comp) + append=append, compression=comp, **kwargs) def _read_wide_table(self, group, where=None): return self._read_panel_table(group, where) @@ -619,10 +626,10 @@ def _write_index(self, group, key, index): self._write_sparse_intindex(group, key, index) else: setattr(group._v_attrs, '%s_variety' % key, 'regular') - converted, kind, _ = _convert_index(index) - self._write_array(group, key, converted) + converted = _convert_index(index).set_name('index') + self._write_array(group, key, converted.values) node = getattr(group, key) - node._v_attrs.kind = kind + node._v_attrs.kind = converted.kind node._v_attrs.name = index.name if isinstance(index, (DatetimeIndex, PeriodIndex)): @@ -679,11 +686,11 @@ def _write_multi_index(self, group, key, index): index.labels, index.names)): # write the level - conv_level, kind, _ = _convert_index(lev) level_key = '%s_level%d' % (key, i) - self._write_array(group, level_key, conv_level) + conv_level = _convert_index(lev).set_name(level_key) + self._write_array(group, level_key, conv_level.values) node = getattr(group, level_key) - node._v_attrs.kind = kind + node._v_attrs.kind = conv_level.kind node._v_attrs.name = name # write the name @@ -788,22 +795,28 @@ def _write_array(self, group, key, value): getattr(group, key)._v_attrs.transposed = transposed def _write_table(self, group, items=None, index=None, columns=None, - values=None, append=False, compression=None): + values=None, append=False, compression=None, + min_itemsize = None, **kwargs): """ need to check for conform to the existing table: e.g. columns should match """ - # create dict of types - index_converted, index_kind, index_t = _convert_index(index) - columns_converted, cols_kind, col_t = _convert_index(columns) + + # create Col types + index_converted = _convert_index(index).set_name('index') + columns_converted = _convert_index(columns).set_name('column') # create the table if it doesn't exist (or get it if it does) if not append: if 'table' in group: self.handle.removeNode(group, 'table') + else: + # check that we are not truncating on our indicies + index_converted.maybe_set(min_itemsize = min_itemsize) + columns_converted.maybe_set(min_itemsize = min_itemsize) if 'table' not in group: # create the table - desc = {'index': index_t, - 'column': col_t, + desc = {'index' : index_converted.typ, + 'column': columns_converted.typ, 'values': _tables().FloatCol(shape=(len(values)))} options = {'name': 'table', @@ -825,16 +838,20 @@ def _write_table(self, group, items=None, index=None, columns=None, # the table must already exist table = getattr(group, 'table', None) + # check that we are not truncating on our indicies + index_converted.validate(table) + columns_converted.validate(table) + # check for backwards incompatibility if append: existing_kind = getattr(table._v_attrs,'index_kind',None) - if existing_kind is not None and existing_kind != index_kind: + if existing_kind is not None and existing_kind != index_converted.kind: raise TypeError("incompatible kind in index [%s - %s]" % - (existing_kind, index_kind)) + (existing_kind, index_converted.kind)) # add kinds - table._v_attrs.index_kind = index_kind - table._v_attrs.columns_kind = cols_kind + table._v_attrs.index_kind = index_converted.kind + table._v_attrs.columns_kind = columns_converted.kind if append: existing_fields = getattr(table._v_attrs, 'fields', None) if (existing_fields is not None and @@ -1001,13 +1018,55 @@ def _delete_from_table(self, group, where): # return the number of rows removed return ln +class Col(object): + """ a column description class + + Parameters + ---------- + + values : the ndarray like converted values + kind : a string description of this type + typ : the pytables type + + """ + + def __init__(self, values, kind, typ, itemsize = None, **kwargs): + self.values = values + self.kind = kind + self.typ = typ + self.itemsize = itemsize + self.name = None + + def set_name(self, n): + self.name = n + return self + + def __iter__(self): + return iter(self.values) + + def maybe_set(self, min_itemsize = None, **kwargs): + """ maybe set a string col itemsize """ + if self.kind == 'string' and min_itemsize is not None: + if self.typ.itemsize < min_itemsize: + self.typ = _tables().StringCol(itemsize = min_itemsize, pos = getattr(self.typ,'pos',None)) + + def validate(self, table, **kwargs): + """ validate this column for string truncation (or reset to the max size) """ + if self.kind == 'string': + + # the current column name + t = getattr(table.description,self.name,None) + if t is not None: + if t.itemsize < self.itemsize: + raise Exception("[%s] column has a min_itemsize of [%s] but itemsize [%s] is required!" % (self.name,self.itemsize,t.itemsize)) + def _convert_index(index): if isinstance(index, DatetimeIndex): converted = index.asi8 - return converted, 'datetime64', _tables().Int64Col() + return Col(converted, 'datetime64', _tables().Int64Col()) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() - return index.values, 'integer', atom + return Col(index.values, 'integer', atom) if isinstance(index, MultiIndex): raise Exception('MultiIndex not supported here!') @@ -1018,33 +1077,33 @@ def _convert_index(index): if inferred_type == 'datetime64': converted = values.view('i8') - return converted, 'datetime64', _tables().Int64Col() + return Col(converted, 'datetime64', _tables().Int64Col()) elif inferred_type == 'datetime': converted = np.array([(time.mktime(v.timetuple()) + v.microsecond / 1E6) for v in values], dtype=np.float64) - return converted, 'datetime', _tables().Time64Col() + return Col(converted, 'datetime', _tables().Time64Col()) elif inferred_type == 'date': converted = np.array([time.mktime(v.timetuple()) for v in values], dtype=np.int32) - return converted, 'date', _tables().Time32Col() + return Col(converted, 'date', _tables().Time32Col()) elif inferred_type == 'string': converted = np.array(list(values), dtype=np.str_) itemsize = converted.dtype.itemsize - return converted, 'string', _tables().StringCol(itemsize) + return Col(converted, 'string', _tables().StringCol(itemsize), itemsize = itemsize) elif inferred_type == 'unicode': atom = _tables().ObjectAtom() - return np.asarray(values, dtype='O'), 'object', atom + return Col(np.asarray(values, dtype='O'), 'object', atom) elif inferred_type == 'integer': # take a guess for now, hope the values fit atom = _tables().Int64Col() - return np.asarray(values, dtype=np.int64), 'integer', atom + return Col(np.asarray(values, dtype=np.int64), 'integer', atom) elif inferred_type == 'floating': atom = _tables().Float64Col() - return np.asarray(values, dtype=np.float64), 'float', atom + return Col(np.asarray(values, dtype=np.float64), 'float', atom) else: # pragma: no cover atom = _tables().ObjectAtom() - return np.asarray(values, dtype='O'), 'object', atom + return Col(np.asarray(values, dtype='O'), 'object', atom) def _read_array(group, key): @@ -1172,20 +1231,20 @@ class Term(object): Term('index', '>', '20121114') Term('index', ['20121114','20121114']) Term('index', datetime(2012,11,14)) - Term('index>20121114') + Term('major>20121114') + Term('minor', ['A','B']) """ _ops = ['<','<=','>','>=','=','!='] _search = re.compile("^(?P\w+)(?P%s)(?P.+)$" % '|'.join(_ops)) - _index = ['index','major_axis'] - _column = ['column','minor_axis','items'] + _index = ['index','major_axis','major'] + _column = ['column','minor_axis','minor'] def __init__(self, field, op = None, value = None, index_kind = None): self.field = None self.op = None self.value = None - self.typ = None self.index_kind = index_kind self.filter = None self.condition = None @@ -1241,9 +1300,9 @@ def __init__(self, field, op = None, value = None, index_kind = None): # valid field name if self.field in self._index: - self.typ = 'index' + self.field = 'index' elif self.field in self._column: - self.typ = 'column' + self.field = 'column' else: raise Exception("field is not a valid index/column for this term [%s]" % str(self)) @@ -1258,7 +1317,7 @@ def __init__(self, field, op = None, value = None, index_kind = None): self.eval() def __str__(self): - return "typ->%s,field->%s,op->%s,value->%s" % (self.typ,self.field,self.op,self.value) + return "field->%s,op->%s,value->%s" % (self.field,self.op,self.value) __repr__ = __str__ @@ -1285,7 +1344,7 @@ def eval(self): def convert_value(self, v): - if self.typ == 'index': + if self.field == 'index': if self.index_kind == 'datetime64' : return [lib.Timestamp(v).value, None] elif isinstance(v, datetime): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 3bca6ececce3a..30bc9d4ed8ba1 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -14,6 +14,7 @@ import pandas.util.testing as tm from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal +from pandas import concat try: import tables @@ -150,6 +151,20 @@ def test_append(self): self.store.append('d', df[10:]) tm.assert_frame_equal(self.store['d'], df) + def test_append_with_strings(self): + wp = tm.makePanel() + wp2 = wp.rename_axis(dict([ (x,"%s_extra" % x) for x in wp.minor_axis ]), axis = 2) + + self.store.append('s1', wp, min_itemsize = 20) + self.store.append('s1', wp2) + expected = concat([ wp, wp2], axis = 2) + expected = expected.reindex(minor_axis = sorted(expected.minor_axis)) + tm.assert_panel_equal(self.store['s1'], expected) + + # test truncation of bigger strings + self.store.append('s2', wp) + self.assertRaises(Exception, self.store.append, 's2', wp2) + def test_create_table_index(self): wp = tm.makePanel() self.store.append('p5', wp) @@ -251,6 +266,11 @@ def test_terms(self): Term('index', ['20121114','20121114']) Term('index', datetime(2012,11,14)) Term('index>20121114') + Term('major>20121114') + Term('major_axis>20121114') + Term('minor', ['A','B']) + Term('minor_axis', ['A','B']) + Term('column', ['A','B']) self.assertRaises(Exception, Term.__init__) self.assertRaises(Exception, Term.__init__, 'blah') From 0690ac80fad50a4f126bf4d0cd2eacfa5cfcad7b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 23 Nov 2012 16:28:29 -0500 Subject: [PATCH 7/7] add where and mask methods to Series. where returns a series evaluated for the cond with a shape like the original --- pandas/core/series.py | 38 +++++++++++++++++++++++++++++++++++++ pandas/tests/test_series.py | 31 ++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index a798915cb9681..d882a147f5395 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -562,6 +562,44 @@ def _get_values(self, indexer): except Exception: return self.values[indexer] + def where(self, cond, other=nan, inplace=False): + """ + Return a Series where cond is True; otherwise values are from other + + Parameters + ---------- + cond: boolean Series or array + other: scalar or Series + + Returns + ------- + wh: Series + """ + if not hasattr(cond, 'shape'): + raise ValueError('where requires an ndarray like object for its ' + 'condition') + + if inplace: + self._set_with(~cond, other) + return self + + return self._get_values(cond).reindex_like(self).fillna(other) + + def mask(self, cond): + """ + Returns copy of self whose values are replaced with nan if the + inverted condition is True + + Parameters + ---------- + cond: boolean Series or array + + Returns + ------- + wh: Series + """ + return self.where(~cond, nan) + def __setitem__(self, key, value): try: try: diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 7422c925fd657..a48e66d38b1c4 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -939,6 +939,37 @@ def test_ix_getitem_iterator(self): result = self.series.ix[idx] assert_series_equal(result, self.series[:10]) + def test_where(self): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(cond).dropna() + rs2 = s[cond] + assert_series_equal(rs, rs2) + + rs = s.where(cond,-s) + assert_series_equal(rs, s.abs()) + + rs = s.where(cond) + assert(s.shape == rs.shape) + + self.assertRaises(ValueError, s.where, 1) + + def test_where_inplace(self): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.copy() + rs.where(cond,inplace=True) + assert_series_equal(rs.dropna(), s[cond]) + + def test_mask(self): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(cond, np.nan) + assert_series_equal(rs, s.mask(~cond)) + def test_ix_setitem(self): inds = self.series.index[[3,4,7]]