From 9b0aac04c5885031eedb243f1e4b9808f3b26d73 Mon Sep 17 00:00:00 2001
From: jreback
Date: Tue, 18 Dec 2012 18:03:12 -0500
Subject: [PATCH 01/35] ENH/BUG/DOC: added support for data column queries (can
 construct searches on the actual columns of the data)

added nan_rep for supporting string columns with nan's in them
performance enhancements on string columns
more tests & docs for data columns
---
 doc/source/conf.py               |   1 +
 doc/source/io.rst                |  27 ++-
 pandas/io/pytables.py            | 304 ++++++++++++++++++++++---------
 pandas/io/tests/test_pytables.py |  99 +++++++++-
 pandas/lib.pyx                   |  53 ++++--
 vb_suite/hdfstore_bench.py       |  30 +++
 6 files changed, 407 insertions(+), 107 deletions(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 692c7757ee17c..6895f00414b0b 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -16,6 +16,7 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #sys.path.append(os.path.abspath('.'))
+sys.path.insert(0, '/home/jreback/pandas')
 sys.path.insert(0, os.path.abspath('../sphinxext'))
 
 sys.path.extend([
diff --git a/doc/source/io.rst b/doc/source/io.rst
index c73240725887f..a2e82dc05a562 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1095,7 +1095,7 @@ Storing Mixed Types in a Table
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Storing mixed-dtype data is supported. Strings are store as a fixed-width using the maximum size of the appended column. Subsequent appends will truncate strings at this length.
-Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools`` are currently supported.
+Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools`` is currently supported. For string columns, passing ``nan_rep = 'my_nan_rep'`` to append will change the default nan representation on disk (which converts to/from `np.nan`); this defaults to `nan`.
 
 .. ipython:: python
 
@@ -1115,7 +1115,6 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set
 
 Querying a Table
 ~~~~~~~~~~~~~~~~
-
 ``select`` and ``delete`` operations have an optional criteria that can be specified to select/delete only a subset of the data. This allows one to have a very large on-disk table and retrieve only a portion of the data.
 
@@ -1160,6 +1159,30 @@ You can create an index for a table with ``create_table_index`` after data is al
    i.optlevel, i.kind
 
 
+Query via Data Columns
+~~~~~~~~~~~~~~~~~~~~~~
+You can designate (and index) certain columns that you want to be able to perform queries on (other than the `indexable` columns, which you can always query). For instance, say you want to perform this common operation, on-disk, and return just the frame that matches this query.
+
+.. ipython:: python
+
+   df['string'] = 'foo'
+   df.ix[4:6,'string'] = np.nan
+   df.ix[7:9,'string'] = 'bar'
+   df
+
+   # on-disk operations
+   store.append('df_dc', df, columns = ['B','string'])
+   store.select('df_dc',[ Term('B>0') ])
+
+   # getting creative
+   store.select('df_dc',[ Term('B>0'), Term('string=foo') ])
+
+   # index the data_column
+   store.create_table_index('df_dc', columns = ['B'])
+   store.root.df_dc.table
+
+There is some performance degradation by making lots of columns into `data columns`, so it is up to the user to designate these.
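+
+For comparison, here is a minimal in-memory sketch of the same selection (assuming the ``df`` and ``df_dc`` objects built above); the boolean mask is what the on-disk ``Term`` query emulates:
+
+.. code-block:: python
+
+   # in-memory equivalent of the combined on-disk query (a sketch)
+   df[(df.B > 0) & (df.string == 'foo')]
+
+   # the on-disk version selects the same rows without loading the whole table
+   store.select('df_dc', [ Term('B>0'), Term('string=foo') ])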
+ Delete from a Table ~~~~~~~~~~~~~~~~~~~ You can delete from a table selectively by specifying a ``where``. In deleting rows, it is important to understand the ``PyTables`` deletes rows by erasing the rows, then **moving** the following data. Thus deleting can potentially be a very expensive operation depending on the orientation of your data. This is especially true in higher dimensional objects (``Panel`` and ``Panel4D``). To get optimal deletion speed, it pays to have the dimension you are deleting be the first of the ``indexables``. diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 175845fd38a2b..27adfbdd60e79 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -21,7 +21,7 @@ from pandas.tseries.api import PeriodIndex, DatetimeIndex from pandas.core.common import adjoin from pandas.core.algorithms import match, unique, factorize -from pandas.core.strings import str_len +from pandas.core.strings import str_len, _na_map from pandas.core.categorical import Categorical from pandas.core.common import _asarray_tuplesafe, _try_sort from pandas.core.internals import BlockManager, make_block, form_blocks @@ -36,7 +36,7 @@ from contextlib import contextmanager # versioning attribute -_version = '0.10' +_version = '0.11' class IncompatibilityWarning(Warning): pass @@ -948,6 +948,9 @@ def __eq__(self, other): """ compare 2 col items """ return all([ getattr(self,a,None) == getattr(other,a,None) for a in ['name','cname','axis','pos'] ]) + def __ne__(self, other): + return not self.__eq__(other) + def copy(self): new_self = copy.copy(self) return new_self @@ -959,7 +962,7 @@ def infer(self, table): new_self.get_attr() return new_self - def convert(self, values): + def convert(self, values, nan_rep): """ set the values from this selection """ self.values = Index(_maybe_convert(values[self.cname], self.kind)) @@ -1006,7 +1009,7 @@ def validate_col(self, itemsize = None): # validate this column for string truncation (or reset to the max size) dtype = getattr(self,'dtype',None) - if self.kind == 'string' or (dtype is not None and dtype.startswith('string')): + if self.kind == 'string': c = self.col if c is not None: @@ -1045,13 +1048,21 @@ class DataCol(IndexCol): cname : the column name in the table to hold the data (typeically values) """ is_indexable = False + is_searchable = False @classmethod - def create_for_block(cls, i, **kwargs): + def create_for_block(cls, i = None, name = None, cname = None, **kwargs): """ return a new datacol with the block i """ - return cls(name = 'values_%d' % i, cname = 'values_block_%d' % i, **kwargs) - - def __init__(self, values = None, kind = None, typ = None, cname = None, data = None, **kwargs): + if cname is None: + cname = name or 'values_block_%d' % i + if name is None: + name = cname + m = re.search("values_block_(\d+)",name) + if m: + name = "values_%s" % m.groups()[0] + return cls(name = name, cname = cname, **kwargs) + + def __init__(self, values = None, kind = None, typ = None, cname = None, data = None, block = None, **kwargs): super(DataCol, self).__init__(values = values, kind = kind, typ = typ, cname = cname, **kwargs) self.dtype = None self.dtype_attr = "%s_dtype" % self.name @@ -1069,12 +1080,69 @@ def set_data(self, data): if data is not None: if self.dtype is None: self.dtype = data.dtype.name + self.set_kind() def take_data(self): """ return the data & release the memory """ self.data, data = None, self.data return data + def set_kind(self): + # set my kind if we can + if self.dtype is not None: + if 
self.dtype.startswith('string'): + self.kind = 'string' + elif self.dtype.startswith('float'): + self.kind = 'float' + elif self.dtype.startswith('int'): + self.kind = 'integer' + + def set_atom(self, block, existing_col, min_itemsize, nan_rep, **kwargs): + """ create and setup my atom from the block b """ + + self.values = list(block.items) + if block.dtype.name == 'object': + self.set_atom_object(block, existing_col, min_itemsize, nan_rep) + else: + self.set_atom_data(block) + + return self + + def get_atom_object(self, block, itemsize): + return _tables().StringCol(itemsize = itemsize, shape = block.shape[0]) + + def set_atom_object(self, block, existing_col, min_itemsize, nan_rep): + # fill nan items with myself + data = block.fillna(nan_rep).values + + # itemsize is the maximum length of a string (along any dimension) + itemsize = lib.max_len_string_array(data) + + # specified min_itemsize? + if isinstance(min_itemsize, dict): + itemsize = max(int(min_itemsize.get('values')),itemsize) + + # check for column in the values conflicts + if existing_col is not None: + eci = existing_col.validate_col(itemsize) + if eci > itemsize: + itemsize = eci + + self.kind = 'string' + self.typ = self.get_atom_object(block, itemsize) + self.set_data(self.convert_object_data(data, itemsize)) + + def convert_object_data(self, data, itemsize): + return data.astype('S%s' % itemsize) + + def get_atom_data(self, block): + return getattr(_tables(),"%sCol" % self.kind.capitalize())(shape = block.shape[0]) + + def set_atom_data(self, block): + self.kind = block.dtype.name + self.typ = self.get_atom_data(block) + self.set_data(block.values.astype(self.typ._deftype)) + @property def shape(self): return getattr(self.data,'shape',None) @@ -1099,7 +1167,7 @@ def validate_attr(self, append): raise Exception("appended items dtype do not match existing items dtype" " in table!") - def convert(self, values): + def convert(self, values, nan_rep): """ set the data from this selection (and convert to the correct dtype if we can) """ self.set_data(values[self.cname]) @@ -1110,10 +1178,15 @@ def convert(self, values): except: self.data = self.data.astype('O') + # convert nans + if self.kind == 'string': + self.data = _na_map(lambda x: np.nan if x == nan_rep else x, self.data.flatten()).reshape(self.data.shape) + def get_attr(self): """ get the data for this colummn """ self.values = getattr(self.attrs,self.kind_attr,None) self.dtype = getattr(self.attrs,self.dtype_attr,None) + self.set_kind() def set_attr(self): """ set the data for this colummn """ @@ -1121,6 +1194,28 @@ def set_attr(self): if self.dtype is not None: setattr(self.attrs,self.dtype_attr,self.dtype) +class DataIndexableCol(DataCol): + """ represent a data column that can be indexed """ + + @property + def is_searchable(self): + return self.kind == 'string' + + def get_atom_object(self, block, itemsize): + return _tables().StringCol(itemsize = itemsize) + + # reshape the values if not shape (e.g. 
we are a scalar) + #if 'shape' not in kw: + # import pdb; pdb.set_trace() + # values = values.reshape(values.shape[1:]) + + + def convert_object_data(self, data, itemsize): + return data.astype('S%s' % itemsize) + + def get_atom_data(self, block): + return getattr(_tables(),"%sCol" % self.kind.capitalize())() + class Table(object): """ represent a table: facilitate read/write of various types of tables @@ -1153,6 +1248,8 @@ def __init__(self, parent, group, **kwargs): self.index_axes = [] self.non_index_axes = [] self.values_axes = [] + self.data_columns = [] + self.nan_rep = None self.selection = None @property @@ -1166,7 +1263,11 @@ def pandas_type(self): def __repr__(self): """ return a pretty representatgion of myself """ self.infer_axes() - return "%s (typ->%s,nrows->%s,indexers->[%s])" % (self.pandas_type,self.table_type_short,self.nrows,','.join([ a.name for a in self.index_axes ])) + return "%s (typ->%s,nrows->%s,indexers->[%s],data->[%s])" % (self.pandas_type, + self.table_type_short, + self.nrows, + ','.join([ a.name for a in self.index_axes ]), + ','.join(self.data_columns)) __str__ = __repr__ @@ -1242,7 +1343,12 @@ def data_orientation(self): def queryables(self): """ return a dict of the kinds allowable columns for this object """ - return dict([ (a.cname,a.kind) for a in self.index_axes ] + [ (self.obj_type._AXIS_NAMES[axis],None) for axis, values in self.non_index_axes ]) + + # compute the values_axes queryables + return dict([ (a.cname,a.kind) for a in self.index_axes ] + + [ (self.obj_type._AXIS_NAMES[axis],None) for axis, values in self.non_index_axes ] + + [ (v.cname,v.kind) for v in self.values_axes if v.name in set(self.data_columns) ] + ) def index_cols(self): """ return a list of my index cols """ @@ -1258,6 +1364,8 @@ def set_attrs(self): self.attrs.index_cols = self.index_cols() self.attrs.values_cols = self.values_cols() self.attrs.non_index_axes = self.non_index_axes + self.attrs.data_columns = self.data_columns + self.attrs.nan_rep = self.nan_rep def validate_version(self, where = None): """ are we trying to operate on an old version? 
""" @@ -1276,9 +1384,16 @@ def indexables(self): # index columns self._indexables.extend([ IndexCol(name = name, axis = axis, pos = i) for i, (axis, name) in enumerate(self.attrs.index_cols) ]) - # data columns + # values columns + dc = set(self.data_columns) base_pos = len(self._indexables) - self._indexables.extend([ DataCol.create_for_block(i = i, pos = base_pos + i ) for i, c in enumerate(self.attrs.values_cols) ]) + def f(i, c): + klass = DataCol + if c in dc: + klass = DataIndexableCol + return klass.create_for_block(i = i, name = c, pos = base_pos + i ) + + self._indexables.extend([ f(i,c) for i, c in enumerate(self.attrs.values_cols) ]) return self._indexables @@ -1353,7 +1468,7 @@ def read_axes(self, where): # convert the data for a in self.axes: - a.convert(values) + a.convert(values, nan_rep = self.nan_rep) return True @@ -1365,28 +1480,41 @@ def infer_axes(self): if table is None: return False - self.index_axes, self.values_axes = [ a.infer(self.table) for a in self.indexables if a.is_indexable ], [ a.infer(self.table) for a in self.indexables if not a.is_indexable ] self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] + self.data_columns = getattr(self.attrs,'data_columns',None) or [] + self.nan_rep = getattr(self.attrs,'nan_rep',None) + self.index_axes, self.values_axes = [ a.infer(self.table) for a in self.indexables if a.is_indexable ], [ a.infer(self.table) for a in self.indexables if not a.is_indexable ] return True - def get_data_blocks(self, obj): - """ return the data blocks for this obj """ - return obj._data.blocks + def get_object(self, obj): + """ return the data for this obj """ + return obj - def create_axes(self, axes, obj, validate = True, min_itemsize = None): + def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None, min_itemsize = None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields + + Parameters: + ----------- + axes: a list of the axes in order to create (names or numbers of the axes) + obj : the object to create axes on + validate: validate the obj against an existiing object already written + min_itemsize: a dict of the min size for a column in bytes + nan_rep : a values to use for string column nan_rep + columns : a list of columns that we want to create separate to allow indexing """ # map axes to numbers axes = [ obj._get_axis_number(a) for a in axes ] - # do we have an existing table (if so, use its axes)? 
+ # do we have an existing table (if so, use its axes & data_columns) if self.infer_axes(): existing_table = self.copy() - axes = [ a.axis for a in existing_table.index_axes] + axes = [ a.axis for a in existing_table.index_axes] + columns = existing_table.data_columns + nan_rep = existing_table.nan_rep else: existing_table = None @@ -1396,6 +1524,12 @@ def create_axes(self, axes, obj, validate = True, min_itemsize = None): # create according to the new data self.non_index_axes = [] + self.data_columns = [] + + # nan_representation + if nan_rep is None: + nan_rep = 'nan' + self.nan_rep = nan_rep # create axes to index and non_index index_axes_map = dict() @@ -1428,54 +1562,55 @@ def create_axes(self, axes, obj, validate = True, min_itemsize = None): for a in self.axes: a.maybe_set_size(min_itemsize = min_itemsize) - # reindex by our non_index_axes + # reindex by our non_index_axes & compute data_columns for a in self.non_index_axes: - obj = obj.reindex_axis(a[1], axis = a[0], copy = False) + obj = obj.reindex_axis(a[1], axis = a[0], copy = False) + + # get out blocks + block_obj = self.get_object(obj) - blocks = self.get_data_blocks(obj) + data_obj = None + if columns is not None and len(self.non_index_axes): + axis = self.non_index_axes[0][0] + axis_labels = self.non_index_axes[0][1] + columns = [ c for c in columns if c in axis_labels ] + if len(columns): + data_obj = block_obj.reindex_axis(Index(columns), axis = axis, copy = False) + block_obj = block_obj.reindex_axis(Index(axis_labels)-Index(columns), axis = axis, copy = False) + + blocks = list(block_obj._data.blocks) + if data_obj is not None: + blocks.extend(data_obj._data.blocks) # add my values self.values_axes = [] for i, b in enumerate(blocks): # shape of the data column are the indexable axes - shape = b.shape[0] - values = b.values - - # a string column - if b.dtype.name == 'object': - - # itemsize is the maximum length of a string (along any dimension) - itemsize = _itemsize_string_array(values) + klass = DataCol + name = None - # specified min_itemsize? 
- if isinstance(min_itemsize, dict): - itemsize = max(int(min_itemsize.get('values')),itemsize) + # we have a data_column + if columns and len(b.items) == 1 and b.items[0] in columns: + klass = DataIndexableCol + name = b.items[0] + self.data_columns.append(name) - # check for column in the values conflicts - if existing_table is not None and validate: - eci = existing_table.values_axes[i].validate_col(itemsize) - if eci > itemsize: - itemsize = eci - - atom = _tables().StringCol(itemsize = itemsize, shape = shape) - utype = 'S%s' % itemsize - kind = 'string' + try: + existing_col = existing_table.values_axes[i] if existing_table is not None and validate else None - else: - atom = getattr(_tables(),"%sCol" % b.dtype.name.capitalize())(shape = shape) - utype = atom._deftype - kind = b.dtype.name + col = klass.create_for_block(i = i, name = name) + col.set_atom(block = b, + existing_col = existing_col, + min_itemsize = min_itemsize, + nan_rep = nan_rep, + **kwargs) + col.set_pos(j) - # coerce data to this type - try: - values = values.astype(utype) + self.values_axes.append(col) except (Exception), detail: - raise Exception("cannot coerce data type -> [dtype->%s]" % b.dtype.name) - - dc = DataCol.create_for_block(i = i, values = list(b.items), kind = kind, typ = atom, data = values, pos = j) + raise Exception("cannot find the correct atom type -> [dtype->%s] %s" % (b.dtype.name,str(detail))) j += 1 - self.values_axes.append(dc) # validate the axes if we have an existing table if validate: @@ -1674,7 +1809,7 @@ def write(self, axes, obj, append=False, compression=None, self.handle.removeNode(self.group, 'table') # create the axes - self.create_axes(axes = axes, obj = obj, validate = append, min_itemsize = min_itemsize) + self.create_axes(axes = axes, obj = obj, validate = append, min_itemsize = min_itemsize, **kwargs) if 'table' not in self.group: @@ -1706,13 +1841,8 @@ def write_data(self): for a in self.values_axes: # figure the mask: only do if we can successfully process this column, otherwise ignore the mask - try: - mask = np.isnan(a.data).all(axis=0) - masks.append(mask.astype('u1')) - except: - - # need to check for Nan in a non-numeric type column!!! 
- masks.append(np.zeros((a.data.shape[1:]), dtype = 'u1')) + mask = com.isnull(a.data).all(axis=0) + masks.append(mask.astype('u1')) # consolidate masks mask = masks[0] @@ -1721,14 +1851,20 @@ def write_data(self): # the arguments args = [ a.cvalues for a in self.index_axes ] + search = np.array([ a.is_searchable for a in self.values_axes ]).astype('u1') values = [ a.data for a in self.values_axes ] # get our function try: func = getattr(lib,"create_hdf_rows_%sd" % self.ndim) args.append(mask) + args.append(search) args.append(values) rows = func(*args) + except (Exception), detail: + raise Exception("cannot create row-data -> %s" % str(detail)) + + try: if len(rows): self.table.append(rows) except (Exception), detail: @@ -1794,11 +1930,11 @@ class AppendableFrameTable(AppendableTable): def is_transposed(self): return self.index_axes[0].axis == 1 - def get_data_blocks(self, obj): + def get_object(self, obj): """ these are written transposed """ if self.is_transposed: obj = obj.T - return obj._data.blocks + return obj def read(self, where=None): @@ -1812,12 +1948,17 @@ def read(self, where=None): if self.is_transposed: values = a.cvalues index_ = columns - columns_ = index + columns_ = Index(index) else: values = a.cvalues.T - index_ = index + index_ = Index(index) columns_ = columns + + # if we have a DataIndexableCol, its shape will only be 1 dim + if values.ndim == 1: + values = values.reshape(1,values.shape[0]) + block = make_block(values, columns_, columns_) mgr = BlockManager([ block ], [ columns_, index_ ]) frames.append(DataFrame(mgr)) @@ -1838,11 +1979,11 @@ class AppendablePanelTable(AppendableTable): ndim = 3 obj_type = Panel - def get_data_blocks(self, obj): + def get_object(self, obj): """ these are written transposed """ if self.is_transposed: obj = obj.transpose(*self.data_orientation) - return obj._data.blocks + return obj @property def is_transposed(self): @@ -1891,10 +2032,6 @@ def create_table(parent, group, typ = None, **kwargs): return _TABLE_MAP.get(tt)(parent, group, **kwargs) -def _itemsize_string_array(arr): - """ return the maximum size of elements in a strnig array """ - return max([ str_len(arr[v]).max() for v in range(arr.shape[0]) ]) - def _convert_index(index): if isinstance(index, DatetimeIndex): converted = index.asi8 @@ -2208,17 +2345,16 @@ def eval(self): def convert_value(self, v): #### a little hacky here, need to really figure out what we should convert ####x - if self.field == 'index' or self.field == 'major_axis': - if self.kind == 'datetime64' : - return [lib.Timestamp(v).value, None] - elif isinstance(v, datetime) or hasattr(v,'timetuple') or self.kind == 'date': - return [time.mktime(v.timetuple()), None] - elif self.kind == 'integer': - v = int(float(v)) - return [v, v] - elif self.kind == 'float': - v = float(v) - return [v, v] + if self.kind == 'datetime64' : + return [lib.Timestamp(v).value, None] + elif isinstance(v, datetime) or hasattr(v,'timetuple') or self.kind == 'date': + return [time.mktime(v.timetuple()), None] + elif self.kind == 'integer': + v = int(float(v)) + return [v, v] + elif self.kind == 'float': + v = float(v) + return [v, v] elif not isinstance(v, basestring): return [str(v), None] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 2b0d1cda89392..eec6b5c5ec4bf 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -93,9 +93,9 @@ def test_versioning(self): self.store.remove('df1') self.store.append('df1', df[:10]) self.store.append('df1', df[10:]) - 
self.assert_(self.store.root.a._v_attrs.pandas_version == '0.10') - self.assert_(self.store.root.b._v_attrs.pandas_version == '0.10') - self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10') + self.assert_(self.store.root.a._v_attrs.pandas_version == '0.11') + self.assert_(self.store.root.b._v_attrs.pandas_version == '0.11') + self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.11') # write a file and wipe its versioning self.store.remove('df2') @@ -287,6 +287,13 @@ def test_append(self): self.store.append('wp1', wp_append2) tm.assert_panel_equal(self.store['wp1'], wp) + # dtype issues - mizxed type in a single object column + df = DataFrame(data=[[1,2],[0,1],[1,2],[0,0]]) + df['mixed_column'] = 'testing' + df.ix[2,'mixed_column'] = np.nan + self.store.remove('df') + self.store.append('df', df) + tm.assert_frame_equal(self.store['df'],df) def test_append_frame_column_oriented(self): @@ -415,6 +422,44 @@ def test_append_with_strings(self): df_new = DataFrame([[124,'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) self.assertRaises(Exception, self.store.append, 'df_new',df_new) + def test_append_with_data_columns(self): + + df = tm.makeTimeDataFrame() + self.store.remove('df') + self.store.append('df', df[:2], columns = ['B']) + self.store.append('df', df[2:]) + tm.assert_frame_equal(self.store['df'], df) + + # data column searching + result = self.store.select('df', [ Term('B>0') ]) + expected = df[df.B>0] + tm.assert_frame_equal(result, expected) + + # data column searching (with an indexable and a data_columns) + result = self.store.select('df', [ Term('B>0'), Term('index','>',df.index[3]) ]) + df_new = df.reindex(index=df.index[4:]) + expected = df_new[df_new.B>0] + tm.assert_frame_equal(result, expected) + + # index the columns + self.store.create_table_index('df', columns = ['B']) + result = self.store.select('df', [ Term('B>0'), Term('index','>',df.index[3]) ]) + tm.assert_frame_equal(result, expected) + + # check the index + assert(self.store.handle.root.df.table.cols.B.is_indexed == True) + + # data column selection with a string data_column + df_new = df.copy() + df_new['string'] = 'foo' + df_new['string'][1:4] = np.nan + df_new['string'][5:6] = 'bar' + self.store.remove('df') + self.store.append('df', df_new, columns = ['string']) + result = self.store.select('df', [ Term('string', '=', 'foo') ]) + expected = df_new[df_new.string == 'foo'] + tm.assert_frame_equal(result, expected) + def test_create_table_index(self): wp = tm.makePanel() self.store.append('p5', wp) @@ -474,24 +519,52 @@ def test_create_table_index(self): tables.__version__ = original - def test_big_table(self): - raise nose.SkipTest('no big table') + def test_big_table_frame(self): + raise nose.SkipTest('no big table frame') # create and write a big table - wp = Panel(np.random.randn(20, 1000, 1000), items= [ 'Item%s' % i for i in xrange(20) ], - major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%s' % i for i in xrange(1000) ]) + df = DataFrame(np.random.randn(2000*100, 100), index = range(2000*100), columns = [ 'E%03d' % i for i in xrange(100) ]) + for x in range(20): + df['String%03d' % x] = 'string%03d' % x + + import time + x = time.time() + try: + store = HDFStore(self.scratchpath) + store.append('df',df) + rows = store.root.df.table.nrows + recons = store.select('df') + finally: + store.close() + os.remove(self.scratchpath) + + print "\nbig_table frame [%s] -> %5.2f" % (rows,time.time()-x) + + def test_big_table_panel(self): + raise nose.SkipTest('no big table 
panel') + + # create and write a big table + wp = Panel(np.random.randn(20, 1000, 1000), items= [ 'Item%03d' % i for i in xrange(20) ], + major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%03d' % i for i in xrange(1000) ]) wp.ix[:,100:200,300:400] = np.nan + for x in range(100): + wp['String%03d'] = 'string%03d' % x + + import time + x = time.time() try: store = HDFStore(self.scratchpath) - store._debug_memory = True - store.append('wp',wp) + store.prof_append('wp',wp) + rows = store.root.wp.table.nrows recons = store.select('wp') finally: store.close() os.remove(self.scratchpath) + print "\nbig_table panel [%s] -> %5.2f" % (rows,time.time()-x) + def test_append_diff_item_order(self): raise nose.SkipTest('append diff item order') @@ -1238,6 +1311,14 @@ def test_legacy_table_read(self): store.close() + def test_legacy_0_10_read(self): + # legacy from 0.10 + pth = curpath() + store = HDFStore(os.path.join(pth, 'legacy_0.10.h5'), 'r') + for k in store.keys(): + store.select(k) + store.close() + def test_legacy_table_write(self): # legacy table types pth = curpath() diff --git a/pandas/lib.pyx b/pandas/lib.pyx index d904d86f183c3..014d69706e89c 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -749,24 +749,45 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, @cython.boundscheck(False) @cython.wraparound(False) -def create_hdf_rows_2d(ndarray indexer0, ndarray[np.uint8_t, ndim=1] mask, - list values): +def max_len_string_array(ndarray arr): + """ return the maximum size of elements in a strnig array """ + cdef: + int i, n_i, n_j, m, l + + n_i = arr.shape[0] + m = 0 + for i from 0 <= i < n_i: + n_j = len(arr[i]) + + for j from 0 <= j < n_j: + + l = len(arr[i,j]) + if l > m: + m = l + return m + +@cython.boundscheck(False) +@cython.wraparound(False) +def create_hdf_rows_2d(ndarray indexer0, + ndarray[np.uint8_t, ndim=1] mask, + ndarray[np.uint8_t, ndim=1] searchable, + list values): """ return a list of objects ready to be converted to rec-array format """ cdef: int i, b, n_indexer0, n_blocks, tup_size - ndarray v list l - object tup, val + object tup, val, v n_indexer0 = indexer0.shape[0] n_blocks = len(values) tup_size = n_blocks+1 l = [] + for i from 0 <= i < n_indexer0: if not mask[i]: - + tup = PyTuple_New(tup_size) val = indexer0[i] PyTuple_SET_ITEM(tup, 0, val) @@ -774,7 +795,9 @@ def create_hdf_rows_2d(ndarray indexer0, ndarray[np.uint8_t, ndim=1] mask, for b from 0 <= b < n_blocks: - v = values[b][:, i] + v = values[b][:, i] + if searchable[b]: + v = v[0] PyTuple_SET_ITEM(tup, b+1, v) Py_INCREF(v) @@ -785,14 +808,15 @@ def create_hdf_rows_2d(ndarray indexer0, ndarray[np.uint8_t, ndim=1] mask, @cython.boundscheck(False) @cython.wraparound(False) def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1, - ndarray[np.uint8_t, ndim=2] mask, list values): + ndarray[np.uint8_t, ndim=2] mask, + ndarray[np.uint8_t, ndim=1] searchable, + list values): """ return a list of objects ready to be converted to rec-array format """ cdef: int i, j, b, n_indexer0, n_indexer1, n_blocks, tup_size - ndarray v list l - object tup, val + object tup, val, v n_indexer0 = indexer0.shape[0] n_indexer1 = indexer1.shape[0] @@ -818,6 +842,8 @@ def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1, for b from 0 <= b < n_blocks: v = values[b][:, i, j] + if searchable[b]: + v = v[0] PyTuple_SET_ITEM(tup, b+2, v) Py_INCREF(v) @@ -828,14 +854,15 @@ def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1, @cython.boundscheck(False) @cython.wraparound(False) def 
create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2, - ndarray[np.uint8_t, ndim=3] mask, list values): + ndarray[np.uint8_t, ndim=3] mask, + ndarray[np.uint8_t, ndim=1] searchable, + list values): """ return a list of objects ready to be converted to rec-array format """ cdef: int i, j, k, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size - ndarray v list l - object tup, val + object tup, val, v n_indexer0 = indexer0.shape[0] n_indexer1 = indexer1.shape[0] @@ -868,6 +895,8 @@ def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2, for b from 0 <= b < n_blocks: v = values[b][:, i, j, k] + if searchable[b]: + v = v[0] PyTuple_SET_ITEM(tup, b+3, v) Py_INCREF(v) diff --git a/vb_suite/hdfstore_bench.py b/vb_suite/hdfstore_bench.py index d43d8b60a9cf0..23303f335af7e 100644 --- a/vb_suite/hdfstore_bench.py +++ b/vb_suite/hdfstore_bench.py @@ -220,3 +220,33 @@ def remove(f): query_store_table = Benchmark("store.select('df12', [ ('index', '>', df.index[10000]), ('index', '<', df.index[15000]) ])", setup12, cleanup = "store.close()", start_date=start_date) +#---------------------------------------------------------------------- +# select from a panel table + +setup13 = common_setup + """ +p = Panel(randn(20, 1000, 1000), items= [ 'Item%03d' % i for i in xrange(20) ], + major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%03d' % i for i in xrange(1000) ]) + +remove(f) +store = HDFStore(f) +store.append('p1',p) +""" + +read_store_table_panel = Benchmark("store.select('p1')", setup13, cleanup = "store.close()", + start_date=start_date) + + +#---------------------------------------------------------------------- +# write to a panel table + +setup14 = common_setup + """ +p = Panel(randn(20, 1000, 1000), items= [ 'Item%03d' % i for i in xrange(20) ], + major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%03d' % i for i in xrange(1000) ]) + +remove(f) +store = HDFStore(f) +""" + +write_store_table_panel = Benchmark("store.append('p2',p)", setup14, cleanup = "store.close()", + start_date=start_date) + From 9408d59685ede2df39378ae9471c34e470ca7c06 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 18 Dec 2012 19:32:46 -0500 Subject: [PATCH 02/35] removed conf.py paths --- doc/source/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 6895f00414b0b..692c7757ee17c 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -16,7 +16,6 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.append(os.path.abspath('.')) -sys.path.insert(0, '/home/jreback/pandas') sys.path.insert(0, os.path.abspath('../sphinxext')) sys.path.extend([ From 5c7e849b8b56e1a7a4bc1efe4c62940ed0576101 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 18 Dec 2012 20:00:10 -0500 Subject: [PATCH 03/35] BUG: support multiple data columns that are in the same block (e.g. the same type) e.g. 
self.store.select('df', [ Term('string', '=', 'foo'), Term('string2=foo'), Term('A>0'), Term('B<0') ]) --- pandas/io/pytables.py | 12 ++++++------ pandas/io/tests/test_pytables.py | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 27adfbdd60e79..1ee8805bba8fc 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1568,19 +1568,19 @@ def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None # get out blocks block_obj = self.get_object(obj) + blocks = None - data_obj = None if columns is not None and len(self.non_index_axes): axis = self.non_index_axes[0][0] axis_labels = self.non_index_axes[0][1] columns = [ c for c in columns if c in axis_labels ] if len(columns): - data_obj = block_obj.reindex_axis(Index(columns), axis = axis, copy = False) - block_obj = block_obj.reindex_axis(Index(axis_labels)-Index(columns), axis = axis, copy = False) + blocks = block_obj.reindex_axis(Index(axis_labels)-Index(columns), axis = axis, copy = False)._data.blocks + for c in columns: + blocks.extend(block_obj.reindex_axis([ c ], axis = axis, copy = False)._data.blocks) - blocks = list(block_obj._data.blocks) - if data_obj is not None: - blocks.extend(data_obj._data.blocks) + if blocks is None: + blocks = block_obj._data.blocks # add my values self.values_axes = [] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index eec6b5c5ec4bf..0d25f759fa942 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -460,6 +460,25 @@ def test_append_with_data_columns(self): expected = df_new[df_new.string == 'foo'] tm.assert_frame_equal(result, expected) + # multiple data columns + df_new = df.copy() + df_new['string'] = 'foo' + df_new['string'][1:4] = np.nan + df_new['string'][5:6] = 'bar' + df_new['string2'] = 'foo' + df_new['string2'][2:5] = np.nan + df_new['string2'][7:8] = 'bar' + self.store.remove('df') + self.store.append('df', df_new, columns = ['A','B','string','string2']) + result = self.store.select('df', [ Term('string', '=', 'foo'), Term('string2=foo'), Term('A>0'), Term('B<0') ]) + expected = df_new[(df_new.string == 'foo') & (df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] + tm.assert_frame_equal(result, expected) + + # yield an empty frame + result = self.store.select('df', [ Term('string', '=', 'foo'), Term('string2=bar'), Term('A>0'), Term('B<0') ]) + expected = df_new[(df_new.string == 'foo') & (df_new.string2 == 'bar') & (df_new.A > 0) & (df_new.B < 0)] + tm.assert_frame_equal(result, expected) + def test_create_table_index(self): wp = tm.makePanel() self.store.append('p5', wp) From c749c18ba3ac08debf3a1c18c1975d80d7a5c220 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 19 Dec 2012 08:03:39 -0500 Subject: [PATCH 04/35] ENH: correctly interpret data column dtypes and raise NotImplementedError (in cases of unicode/datetime64/date) --- pandas/io/pytables.py | 22 ++++++++++-- pandas/io/tests/test_pytables.py | 60 +++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1ee8805bba8fc..30214ff6c0edc 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1100,9 +1100,19 @@ def set_kind(self): def set_atom(self, block, existing_col, min_itemsize, nan_rep, **kwargs): """ create and setup my atom from the block b """ - self.values = list(block.items) - if block.dtype.name == 'object': + self.values = 
list(block.items) + dtype = block.dtype.name + + if dtype == 'object': + inferred_type = lib.infer_dtype(block.values.flatten()) + if inferred_type == 'unicode': + raise NotImplementedError("unicode is not implemented as a table column") + elif inferred_type == 'date': + raise NotImplementedError("date is not implemented as a table column") + self.set_atom_object(block, existing_col, min_itemsize, nan_rep) + elif dtype == 'datetime64[ns]': + raise NotImplementedError("datetime64[ns] is not implemented as a table column") else: self.set_atom_data(block) @@ -1531,6 +1541,12 @@ def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None nan_rep = 'nan' self.nan_rep = nan_rep + # convert the objects if we can to better divine dtypes + try: + obj = obj.convert_objects() + except: + pass + # create axes to index and non_index index_axes_map = dict() for i, a in enumerate(obj.axes): @@ -1608,6 +1624,8 @@ def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None col.set_pos(j) self.values_axes.append(col) + except (NotImplementedError): + raise except (Exception), detail: raise Exception("cannot find the correct atom type -> [dtype->%s] %s" % (b.dtype.name,str(detail))) j += 1 diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 0d25f759fa942..7029088f87b8c 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4,7 +4,7 @@ import sys import warnings -from datetime import datetime +import datetime import numpy as np from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, @@ -13,7 +13,7 @@ import pandas.util.testing as tm from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal -from pandas import concat, Timestamp +from pandas import concat try: import tables @@ -559,6 +559,42 @@ def test_big_table_frame(self): print "\nbig_table frame [%s] -> %5.2f" % (rows,time.time()-x) + + def test_big_table2_frame(self): + # this is a really big table: 2.5m rows x 300 float columns, 20 string columns + raise nose.SkipTest('no big table2 frame') + + # create and write a big table + print "\nbig_table2 start" + import time + start_time = time.time() + df = DataFrame(np.random.randn(2.5*1000*1000, 300), index = range(int(2.5*1000*1000)), columns = [ 'E%03d' % i for i in xrange(300) ]) + for x in range(20): + df['String%03d' % x] = 'string%03d' % x + + print "\nbig_table2 frame (creation of df) -> %5.2f" % (time.time()-start_time) + start_time = time.time() + + from arb.common import profile + fn = 'big_table2.h5' + + try: + + @profile.profile_func() + def f(): + store = HDFStore(fn,mode = 'w') + store.append('df',df) + store.close() + + f() + rows = store.root.df.table.nrows + #recons = store.select('df') + finally: + pass + #os.remove(fn) + + print "\nbig_table2 frame [%s] -> %5.2f" % (rows,time.time()-start_time) + def test_big_table_panel(self): raise nose.SkipTest('no big table panel') @@ -665,6 +701,15 @@ def _make_one_p4d(): self.store.append('p4d_mixed', p4d) tm.assert_panel4d_equal(self.store.select('p4d_mixed'), p4d) + def test_unimplemented_dtypes_table_columns(self): + #### currently not supported dtypes #### + from pandas import Timestamp + + for n,f in [ ('timestamp',Timestamp('20010102')), ('unicode',u'\u03c3'), ('datetime',datetime.datetime(2001,1,2)), ('date',datetime.date(2001,1,2)) ]: + df = tm.makeDataFrame() + df[n] = f + self.assertRaises(NotImplementedError, self.store.append, 'df1_%s' % n, df) + def 
test_remove(self): ts = tm.makeTimeSeries() df = tm.makeDataFrame() @@ -829,7 +874,7 @@ def test_terms(self): ('major_axis', '20121114'), ('major_axis', '>', '20121114'), (('major_axis', ['20121114','20121114']),), - ('major_axis', datetime(2012,11,14)), + ('major_axis', datetime.datetime(2012,11,14)), 'major_axis>20121114', 'major_axis>20121114', 'major_axis>20121114', @@ -936,14 +981,13 @@ def test_index_types(self): ser = Series(values, [0, 'y']) self._check_roundtrip(ser, func) - ser = Series(values, [datetime.today(), 0]) + ser = Series(values, [datetime.datetime.today(), 0]) self._check_roundtrip(ser, func) ser = Series(values, ['y', 0]) self._check_roundtrip(ser, func) - from datetime import date - ser = Series(values, [date.today(), 'a']) + ser = Series(values, [datetime.date.today(), 'a']) self._check_roundtrip(ser, func) ser = Series(values, [1.23, 'b']) @@ -955,7 +999,7 @@ def test_index_types(self): ser = Series(values, [1, 5]) self._check_roundtrip(ser, func) - ser = Series(values, [datetime(2012, 1, 1), datetime(2012, 1, 2)]) + ser = Series(values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)]) self._check_roundtrip(ser, func) def test_timeseries_preepoch(self): @@ -1352,7 +1396,7 @@ def test_legacy_table_write(self): store.close() def test_store_datetime_fractional_secs(self): - dt = datetime(2012, 1, 2, 3, 4, 5, 123456) + dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) series = Series([0], [dt]) self.store['a'] = series self.assertEquals(self.store['a'].index[0], dt) From 29277684547bd9915c549c2458d8dacfbe1c6f4f Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 19 Dec 2012 08:18:23 -0500 Subject: [PATCH 05/35] ENH: automagically created indicies (controlled by kw index=True/False passed to append/put) --- doc/source/io.rst | 2 +- pandas/io/pytables.py | 23 ++++++++++++----------- pandas/io/tests/test_pytables.py | 24 ++++++++++++++---------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index a2e82dc05a562..20d91df5e21a9 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1144,7 +1144,7 @@ Queries are built up using a list of ``Terms`` (currently only **anding** of ter Indexing ~~~~~~~~ -You can create an index for a table with ``create_table_index`` after data is already in the table (after and ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the indexed dimension as the ``where``. It is not automagically done now because you may want to index different axes than the default (except in the case of a DataFrame, where it almost always makes sense to index the ``index``. +You can create/modify an index for a table with ``create_table_index`` after data is already in the table (after and ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the indexed dimension as the ``where``. **Indexes are automagically created** on the indexables and any data columns you specify. This behavior can be turned off by passing ``index=False`` to ``append``. .. 
ipython:: python diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 30214ff6c0edc..50299d874343c 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -507,7 +507,6 @@ def _write_to_group(self, key, value, table=False, append=False, wrapper(value) group._v_attrs.pandas_type = kind group._v_attrs.pandas_version = _version - #group._v_attrs.meta = getattr(value,'meta',None) def _write_series(self, group, series): self._write_index(group, 'index', series.index) @@ -634,31 +633,37 @@ def _write_wide(self, group, panel): def _read_wide(self, group, where=None): return Panel(self._read_block_manager(group)) - def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, **kwargs): + def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, index=True, **kwargs): if axes is None: axes = [1,2,3] t = create_table(self, group, typ = 'appendable_ndim') t.write(axes=axes, obj=obj, append=append, compression=comp, **kwargs) + if index: + t.create_index() def _read_ndim_table(self, group, where=None, **kwargs): t = create_table(self, group, **kwargs) return t.read(where) - def _write_frame_table(self, group, df, append=False, comp=None, axes=None, **kwargs): + def _write_frame_table(self, group, df, append=False, comp=None, axes=None, index=True, **kwargs): if axes is None: axes = [0] t = create_table(self, group, typ = 'appendable_frame') t.write(axes=axes, obj=df, append=append, compression=comp, **kwargs) + if index: + t.create_index() _read_frame_table = _read_ndim_table - def _write_wide_table(self, group, panel, append=False, comp=None, axes=None, **kwargs): + def _write_wide_table(self, group, panel, append=False, comp=None, axes=None, index=True, **kwargs): if axes is None: axes = [1,2] t = create_table(self, group, typ = 'appendable_panel') t.write(axes=axes, obj=panel, append=append, compression=comp, **kwargs) + if index: + t.create_index() _read_wide_table = _read_ndim_table @@ -847,12 +852,7 @@ def _read_group(self, group, where=None, **kwargs): kind = group._v_attrs.pandas_type kind = _LEGACY_MAP.get(kind, kind) handler = self._get_handler(op='read', kind=kind) - v = handler(group, where, **kwargs) - #if v is not None: - # meta = getattr(group._v_attrs,'meta',None) - # if meta is not None: - # v.meta = meta - return v + return handler(group, where, **kwargs) def _read_series(self, group, where=None): index = self._read_index(group, 'index') @@ -1427,8 +1427,9 @@ def create_index(self, columns = None, optlevel = None, kind = None): if not self.infer_axes(): return + # index all indexables and data_columns if columns is None: - columns = [ self.index_axes[0].name ] + columns = [ a.cname for a in self.index_axes ] + [ v.cname for v in self.values_axes if v.name in set(self.data_columns) ] if not isinstance(columns, (tuple,list)): columns = [ columns ] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 7029088f87b8c..2a7db27aee742 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -430,6 +430,10 @@ def test_append_with_data_columns(self): self.store.append('df', df[2:]) tm.assert_frame_equal(self.store['df'], df) + # check that we have indicies created + assert(self.store.handle.root.df.table.cols.index.is_indexed == True) + assert(self.store.handle.root.df.table.cols.B.is_indexed == True) + # data column searching result = self.store.select('df', [ Term('B>0') ]) expected = df[df.B>0] @@ -441,14 +445,6 @@ def test_append_with_data_columns(self): expected = 
df_new[df_new.B>0]
         tm.assert_frame_equal(result, expected)
 
-        # index the columns
-        self.store.create_table_index('df', columns = ['B'])
-        result = self.store.select('df', [ Term('B>0'), Term('index','>',df.index[3]) ])
-        tm.assert_frame_equal(result, expected)
-
-        # check the index
-        assert(self.store.handle.root.df.table.cols.B.is_indexed == True)
-
         # data column selection with a string data_column
         df_new = df.copy()
         df_new['string'] = 'foo'
         df_new['string'][1:4] = np.nan
         df_new['string'][5:6] = 'bar'
         self.store.remove('df')
         self.store.append('df', df_new, columns = ['string'])
         result = self.store.select('df', [ Term('string', '=', 'foo') ])
         expected = df_new[df_new.string == 'foo']
         tm.assert_frame_equal(result, expected)
@@ -480,13 +476,21 @@ def test_append_with_data_columns(self):
         tm.assert_frame_equal(result, expected)
 
     def test_create_table_index(self):
+
+        # index=False
         wp = tm.makePanel()
-        self.store.append('p5', wp)
-        self.store.create_table_index('p5')
+        self.store.append('p5', wp, index=False)
+        self.store.create_table_index('p5', columns = ['major_axis'])
 
         assert(self.store.handle.root.p5.table.cols.major_axis.is_indexed == True)
         assert(self.store.handle.root.p5.table.cols.minor_axis.is_indexed == False)
 
+        # index=True
+        self.store.append('p5i', wp, index=True)
+
+        assert(self.store.handle.root.p5i.table.cols.major_axis.is_indexed == True)
+        assert(self.store.handle.root.p5i.table.cols.minor_axis.is_indexed == True)
+
         # default optlevels
         assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 6)
         assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium')

From 97bdb5cc433d65fd0e720e3e1a4cfcfc84e5367f Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 19 Dec 2012 13:09:07 -0500
Subject: [PATCH 06/35] DOC: minor doc updates and use cases

---
 doc/source/io.rst     | 28 ++++++++++++++++++------
 pandas/io/pytables.py | 50 +++++++++++++++++++++----------------------
 2 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 20d91df5e21a9..4e5bf79280a6d 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1148,8 +1148,7 @@ You can create/modify an index for a table with ``create_table_index`` after dat
 
 .. ipython:: python
 
-   # create an index
-   store.create_table_index('df')
+   # we have automagically already created an index (in the first section)
    i = store.root.df.table.cols.index.index
    i.optlevel, i.kind
 
@@ -1168,20 +1167,35 @@ You can designate (and index) certain columns that you want to be able to perfor
    df['string'] = 'foo'
    df.ix[4:6,'string'] = np.nan
    df.ix[7:9,'string'] = 'bar'
+   df['string2'] = 'cool'
    df
 
    # on-disk operations
-   store.append('df_dc', df, columns = ['B','string'])
+   store.append('df_dc', df, columns = ['B','C','string','string2'])
    store.select('df_dc',[ Term('B>0') ])
 
    # getting creative
-   store.select('df_dc',[ Term('B>0'), Term('string=foo') ])
+   store.select('df_dc',[ Term('B>0'), Term('C>0'), Term('string=foo') ])
 
-   # index the data_column
-   store.create_table_index('df_dc', columns = ['B'])
+   # this is the in-memory version of this type of selection
+   df[(df.B > 0) & (df.C > 0) & (df.string == 'foo')]
+
+   # we have automagically created this index, and the B/string columns are stored separately as ``PyTables`` columns
    store.root.df_dc.table
 
-There is some performance degradation by making lots of columns into `data columns`, so it is up to the user to designate these.
+There is some performance degradation by making lots of columns into `data columns`, so it is up to the user to designate these. In addition, you cannot change data columns (nor indexables) after the first append/put operation (of course, you can simply read in the data and create a new table!).
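+
+A minimal sketch of that round-trip (the ``df_dc2`` key and the column list below are hypothetical, chosen only for illustration):
+
+.. code-block:: python
+
+   # read the entire table back into memory
+   tmp = store.select('df_dc')
+
+   # re-create it under a new key, with a different set of data columns
+   store.append('df_dc2', tmp, columns = ['A','string'])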
+
+Advanced Queries
+~~~~~~~~~~~~~~~~
+
+``not`` and ``or`` conditions are unsupported at this time; however, ``or`` operations are easy to replicate. Repeatedly apply the criteria to the table and ``concat`` the results.
+
+.. ipython:: python
+
+   crit1 = [ Term('B>0'), Term('C>0'), Term('string=foo') ]
+   crit2 = [ Term('B<0'), Term('C>0'), Term('string=foo') ]
+
+   concat([ store.select('df_dc',c) for c in [ crit1, crit2 ] ])
 
 Delete from a Table
 ~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 50299d874343c..390fe9be9d69d 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -894,7 +894,9 @@ class IndexCol(object):
     pos  : the position in the pytables
 
     """
-    is_indexable = True
+    is_an_indexable = True
+    is_data_indexable = True
+    is_searchable = False
 
     def __init__(self, values = None, kind = None, typ = None, cname = None, itemsize = None,
                  name = None, axis = None, kind_attr = None, pos = None, **kwargs):
         self.values = values
@@ -1047,12 +1049,16 @@ class DataCol(IndexCol):
     data   : the actual data
     cname  : the column name in the table to hold the data (typeically values)
     """
-    is_indexable = False
-    is_searchable = False
+    is_an_indexable = False
+    is_data_indexable = False
+    is_searchable = False
 
     @classmethod
     def create_for_block(cls, i = None, name = None, cname = None, **kwargs):
         """ return a new datacol with the block i """
+
+        # a little hacky here, to avoid a backwards compatibility issue
+        # columns in the table are named like: values_block_0...., but their name is values_0 (for kind attributes)
         if cname is None:
             cname = name or 'values_block_%d' % i
         if name is None:
@@ -1110,7 +1116,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, **kwargs):
             elif inferred_type == 'date':
                 raise NotImplementedError("date is not implemented as a table column")
 
-            self.set_atom_object(block, existing_col, min_itemsize, nan_rep)
+            self.set_atom_string(block, existing_col, min_itemsize, nan_rep)
         elif dtype == 'datetime64[ns]':
             raise NotImplementedError("datetime64[ns] is not implemented as a table column")
         else:
@@ -1118,10 +1124,10 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, **kwargs):
 
         return self
 
-    def get_atom_object(self, block, itemsize):
+    def get_atom_string(self, block, itemsize):
         return _tables().StringCol(itemsize = itemsize, shape = block.shape[0])
 
-    def set_atom_object(self, block, existing_col, min_itemsize, nan_rep):
+    def set_atom_string(self, block, existing_col, min_itemsize, nan_rep):
         # fill nan items with myself
         data = block.fillna(nan_rep).values
 
@@ -1139,10 +1145,10 @@ def set_atom_object(self, block, existing_col, min_itemsize, nan_rep):
                 itemsize = eci
 
         self.kind = 'string'
-        self.typ = self.get_atom_object(block, itemsize)
-        self.set_data(self.convert_object_data(data, itemsize))
+        self.typ = self.get_atom_string(block, itemsize)
+        self.set_data(self.convert_string_data(data, itemsize))
 
-    def convert_object_data(self, data, itemsize):
+    def convert_string_data(self, data, itemsize):
         return data.astype('S%s' % itemsize)
 
     def get_atom_data(self, block):
@@ -1206,23 +1212,15 @@ def set_attr(self):
 
 class DataIndexableCol(DataCol):
     """ represent a data column that can be indexed """
+    is_data_indexable = True
 
     @property
     def is_searchable(self):
         return self.kind == 'string'
 
-    def get_atom_object(self, block, itemsize):
+    def get_atom_string(self, block, itemsize):
         return _tables().StringCol(itemsize = itemsize)
 
-    # reshape the values if not shape (e.g. 
we are a scalar) - #if 'shape' not in kw: - # import pdb; pdb.set_trace() - # values = values.reshape(values.shape[1:]) - - - def convert_object_data(self, data, itemsize): - return data.astype('S%s' % itemsize) - def get_atom_data(self, block): return getattr(_tables(),"%sCol" % self.kind.capitalize())() @@ -1242,9 +1240,11 @@ class Table(object): These are attributes that are store in the main table node, they are necessary to recreate these tables when read back in. - index_axes: a list of tuples of the (original indexing axis and index column) + index_axes : a list of tuples of the (original indexing axis and index column) non_index_axes: a list of tuples of the (original index axis and columns on a non-indexing axis) - values_axes : a list of the columns which comprise the data of this table + values_axes : a list of the columns which comprise the data of this table + data_columns : a list of columns that we are allowing indexing (these become single columns in values_axes) + nan_rep : the string to use for nan representations for string objects """ table_type = None @@ -1429,7 +1429,7 @@ def create_index(self, columns = None, optlevel = None, kind = None): # index all indexables and data_columns if columns is None: - columns = [ a.cname for a in self.index_axes ] + [ v.cname for v in self.values_axes if v.name in set(self.data_columns) ] + columns = [ a.cname for a in self.axes if a.is_data_indexable ] if not isinstance(columns, (tuple,list)): columns = [ columns ] @@ -1494,8 +1494,8 @@ def infer_axes(self): self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] self.data_columns = getattr(self.attrs,'data_columns',None) or [] self.nan_rep = getattr(self.attrs,'nan_rep',None) - self.index_axes, self.values_axes = [ a.infer(self.table) for a in self.indexables if a.is_indexable ], [ a.infer(self.table) for a in self.indexables if not a.is_indexable ] - + self.index_axes = [ a.infer(self.table) for a in self.indexables if a.is_an_indexable ] + self.values_axes = [ a.infer(self.table) for a in self.indexables if not a.is_an_indexable ] return True def get_object(self, obj): @@ -2362,8 +2362,8 @@ def eval(self): raise Exception("passing a filterable condition to a non-table indexer [%s]" % str(self)) def convert_value(self, v): + """ convert the expression that is in the term to something that is accepted by pytables """ - #### a little hacky here, need to really figure out what we should convert ####x if self.kind == 'datetime64' : return [lib.Timestamp(v).value, None] elif isinstance(v, datetime) or hasattr(v,'timetuple') or self.kind == 'date': From af43f710dba6b59387bbd29ed1fe0e13da679d22 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 19 Dec 2012 15:03:23 -0500 Subject: [PATCH 07/35] ENH/DOC: updated docs for compression added parameter chunksize to append, now writing occurs in chunks, significatnly reducing memory usage --- doc/source/io.rst | 16 ++++++++++-- pandas/io/pytables.py | 44 ++++++++++++++++++++++++-------- pandas/io/tests/test_pytables.py | 28 ++++++++++---------- 3 files changed, 60 insertions(+), 28 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 4e5bf79280a6d..d5497848d382a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1221,6 +1221,20 @@ It should be clear that a delete operation on the ``major_axis`` will be fairly store.remove('wp', 'major_axis>20000102' ) store.select('wp') +Compression +~~~~~~~~~~~ +``PyTables`` allows the stored data to be compressed (this applies to all kinds of stores, not just tables). 
You can pass ``complevel=int`` for a compression level (1-9, with 0 being no compression, and the default), ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for whichever compression library you prefer. ``blosc`` offers very fast compression (its level defaults to 9), and is my most used. + +``PyTables`` offer better write performance when compressed after writing them, as opposed to turning on compression at the very beginning. You can use the supplied ``PyTables`` utility ``ptrepack``. ``ptrepack`` also can change compression levels after the fact. + + - ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5`` + +Or on-the-fly compression + + - ``store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc')`` + + + Notes & Caveats ~~~~~~~~~~~~~~~ @@ -1258,8 +1272,6 @@ Performance - ``AppendableTable`` which is a similiar table to past versions (this is the default). - ``WORMTable`` (pending implementation) - is available to faciliate very fast writing of tables that are also queryable (but CANNOT support appends) - - ``Tables`` offer better performance when compressed after writing them (as opposed to turning on compression at the very beginning) - use the pytables utilities ``ptrepack`` to rewrite the file (and also can change compression methods) - Duplicate rows can be written, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) Experimental diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 390fe9be9d69d..cd816fb25b958 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -420,6 +420,14 @@ def append(self, key, value, **kwargs): key : object value : {Series, DataFrame, Panel} + Optional Parameters + ------------------- + columns : list of columns to create as data columns + min_itemsize : dict of columns that specify minimum string sizes + nan_rep : string to use as string nan represenation + chunksize : size to chunk the writing + + Notes ----- Does *not* check if data being appended overlaps with existing @@ -1820,7 +1828,7 @@ class AppendableTable(LegacyTable): table_type = 'appendable' def write(self, axes, obj, append=False, compression=None, - complevel=None, min_itemsize = None, **kwargs): + complevel=None, min_itemsize = None, chunksize = 50000, **kwargs): # create the table if it doesn't exist (or get it if it does) if not append: @@ -1849,10 +1857,9 @@ def write(self, axes, obj, append=False, compression=None, a.validate_and_set(table, append) # add the rows - self.write_data() - self.handle.flush() + self.write_data(chunksize) - def write_data(self): + def write_data(self, chunksize): """ fast writing of data: requires specific cython routines each axis shape """ # create the masks & values @@ -1869,16 +1876,29 @@ def write_data(self): m = mask & m # the arguments - args = [ a.cvalues for a in self.index_axes ] - search = np.array([ a.is_searchable for a in self.values_axes ]).astype('u1') - values = [ a.data for a in self.values_axes ] + indexes = [ a.cvalues for a in self.index_axes ] + search = np.array([ a.is_searchable for a in self.values_axes ]).astype('u1') + values = [ a.take_data() for a in self.values_axes ] + + # write the chunks + rows = np.prod([ i.shape[0] for i in indexes ]) + chunks = int(rows / chunksize) + 1 + for i in xrange(chunks): + start_i = i*chunksize + end_i = min((i+1)*chunksize,rows) + + self.write_data_chunk(indexes = [ a[start_i:end_i] for a in indexes ], + mask = mask[start_i:end_i], + 
search = search, + values = [ v[:,start_i:end_i] for v in values ]) + + def write_data_chunk(self, indexes, mask, search, values): # get our function try: func = getattr(lib,"create_hdf_rows_%sd" % self.ndim) - args.append(mask) - args.append(search) - args.append(values) + args = list(indexes) + args.extend([ mask, search, values ]) rows = func(*args) except (Exception), detail: raise Exception("cannot create row-data -> %s" % str(detail)) @@ -1886,7 +1906,9 @@ def write_data(self): try: if len(rows): self.table.append(rows) + self.table.flush() except (Exception), detail: + import pdb; pdb.set_trace() raise Exception("tables cannot write this data -> %s" % str(detail)) def delete(self, where = None): @@ -1934,7 +1956,7 @@ def delete(self, where = None): table.removeRows(start = rows[rows.index[0]], stop = rows[rows.index[-1]]+1) pg = g - self.handle.flush() + self.table.flush() # return the number of rows removed return ln diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 2a7db27aee742..eb8b1cae3bf32 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -576,28 +576,26 @@ def test_big_table2_frame(self): for x in range(20): df['String%03d' % x] = 'string%03d' % x - print "\nbig_table2 frame (creation of df) -> %5.2f" % (time.time()-start_time) - start_time = time.time() - - from arb.common import profile + print "\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index),time.time()-start_time) fn = 'big_table2.h5' try: - @profile.profile_func() - def f(): + def f(chunksize): store = HDFStore(fn,mode = 'w') - store.append('df',df) + store.append('df',df,chunksize=chunksize) + r = store.root.df.table.nrows store.close() - - f() - rows = store.root.df.table.nrows - #recons = store.select('df') - finally: - pass - #os.remove(fn) + return r - print "\nbig_table2 frame [%s] -> %5.2f" % (rows,time.time()-start_time) + for c in [ 10000, 50000, 100000, 250000 ]: + start_time = time.time() + print "big_table2 frame [chunk->%s]" % c + rows = f(c) + print "big_table2 frame [rows->%s,chunk->%s] -> %5.2f" % (rows,c,time.time()-start_time) + + finally: + os.remove(fn) def test_big_table_panel(self): raise nose.SkipTest('no big table panel') From ce6a7a992c61706ee3b0bd29c695db91552cfae9 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 20 Dec 2012 08:38:35 -0500 Subject: [PATCH 08/35] ENH: export of get_store context manager in __init__ for pandas add expectedrows keyword to append to give pytables an estimate of the total rows in a new table add start/stop keywords as selection criteria to limit searches to these rows added multi-index support for dataframes docs/tests for the above --- doc/source/io.rst | 15 ++++- pandas/__init__.py | 2 +- pandas/io/pytables.py | 104 ++++++++++++++++++++++--------- pandas/io/tests/test_pytables.py | 45 +++++++++++-- 4 files changed, 128 insertions(+), 38 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index d5497848d382a..1a056f6ae8682 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1030,6 +1030,17 @@ Deletion of the object specified by the key del store['wp'] store +Closing a Store + +.. ipython:: python + + + # closing a store + store.close() + + # Working with, and automatically closing the store with the context manager. + with get_store('store.h5') as store: + store.keys() .. ipython:: python :suppress: @@ -1267,7 +1278,9 @@ Performance - ``Tables`` come with a writing performance penalty as compared to regular stores. 
The benefit is the ability to append/delete and query (potentially very large amounts of data). Write times are generally longer as compared with regular stores. Query times can be quite fast, especially on an indexed axis.
   - - ``Tables`` can (as of 0.10.0) be expressed as different types.
   + - You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will significantly lower your memory usage on writing.
   + - You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of rows that ``PyTables`` will expect. This will optimize read/write performance.
   + - ``Tables`` can be expressed as different types.
     - ``AppendableTable`` which is a similar table to past versions (this is the default).
     - ``WORMTable`` (pending implementation) - is available to facilitate very fast writing of tables that are also queryable (but CANNOT support appends)
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 1d45727257eeb..6c58c708b8306 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -32,7 +32,7 @@
 from pandas.io.parsers import (read_csv, read_table, read_clipboard,
                                read_fwf, to_clipboard, ExcelFile,
                                ExcelWriter)
-from pandas.io.pytables import HDFStore, Term
+from pandas.io.pytables import HDFStore, Term, get_store
 from pandas.util.testing import debug
 from pandas.tools.describe import value_range
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index cd816fb25b958..10a7227b53734 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -336,7 +336,7 @@ def get(self, key):
             raise KeyError('No object named %s in the file' % key)
         return self._read_group(group)
-    def select(self, key, where=None, **kwargs):
+    def select(self, key, where=None, start=None, stop=None, **kwargs):
         """
         Retrieve pandas object stored in file, optionally based on where
         criteria
@@ -350,7 +350,7 @@ def select(self, key, where=None, **kwargs):
         group = self.get_node(key)
         if group is None:
             raise KeyError('No object named %s in the file' % key)
-        return self._read_group(group, where, **kwargs)
+        return self._read_group(group, where=where, start=start, stop=stop, **kwargs)
     def put(self, key, value, table=False, append=False,
             compression=None, **kwargs):
@@ -376,7 +376,7 @@ def put(self, key, value, table=False, append=False,
         self._write_to_group(key, value, table=table, append=append,
                              comp=compression, **kwargs)
-    def remove(self, key, where=None):
+    def remove(self, key, where=None, start=None, stop=None):
         """
         Remove pandas object partially by specifying the where condition
@@ -406,7 +406,7 @@ def remove(self, key, where=None):
             if not _is_table_type(group):
                 raise Exception('can only remove with where on objects written as tables')
             t = create_table(self, group)
-            return t.delete(where)
+            return t.delete(where = where, start=start, stop=stop)
         return None
@@ -426,7 +426,7 @@ def append(self, key, value, **kwargs):
         min_itemsize : dict of columns that specify minimum string sizes
         nan_rep : string to use as string nan representation
         chunksize : size to chunk the writing
-
+        expectedrows : expected TOTAL row size of this table
         Notes
         -----
@@ -472,6 +472,15 @@ def get_node(self, key):
         except:
             return None
+    def get_table(self, key):
+        """ return the table object for a key, raise if not in the file or a non-table """
+        group = self.get_node(key)
+        if group is None:
+            raise KeyError('No object named %s in the file' % key)
+        if not _is_table_type(group):
+            raise Exception("cannot return a table object for a non-table")
+        return 
create_table(self, group) + ###### private methods ###### def _get_handler(self, op, kind): @@ -596,7 +605,7 @@ def _read_sparse_panel(self, group, where=None): def _write_frame(self, group, df): self._write_block_manager(group, df._data) - def _read_frame(self, group, where=None): + def _read_frame(self, group, where=None, **kwargs): return DataFrame(self._read_block_manager(group)) def _write_block_manager(self, group, data): @@ -638,7 +647,7 @@ def _write_wide(self, group, panel): panel._consolidate_inplace() self._write_block_manager(group, panel._data) - def _read_wide(self, group, where=None): + def _read_wide(self, group, where=None, **kwargs): return Panel(self._read_block_manager(group)) def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, index=True, **kwargs): @@ -652,12 +661,13 @@ def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, inde def _read_ndim_table(self, group, where=None, **kwargs): t = create_table(self, group, **kwargs) - return t.read(where) + return t.read(where, **kwargs) def _write_frame_table(self, group, df, append=False, comp=None, axes=None, index=True, **kwargs): if axes is None: axes = [0] - t = create_table(self, group, typ = 'appendable_frame') + + t = create_table(self, group, typ = 'appendable_frame' if df.index.nlevels == 1 else 'appendable_multiframe') t.write(axes=axes, obj=df, append=append, compression=comp, **kwargs) if index: t.create_index() @@ -860,9 +870,9 @@ def _read_group(self, group, where=None, **kwargs): kind = group._v_attrs.pandas_type kind = _LEGACY_MAP.get(kind, kind) handler = self._get_handler(op='read', kind=kind) - return handler(group, where, **kwargs) + return handler(group, where=where, **kwargs) - def _read_series(self, group, where=None): + def _read_series(self, group, where=None, **kwargs): index = self._read_index(group, 'index') if len(index) > 0: values = _read_array(group, 'values') @@ -872,12 +882,12 @@ def _read_series(self, group, where=None): name = getattr(group._v_attrs, 'name', None) return Series(values, index=index, name=name) - def _read_legacy_series(self, group, where=None): + def _read_legacy_series(self, group, where=None, **kwargs): index = self._read_index_legacy(group, 'index') values = _read_array(group, 'values') return Series(values, index=index) - def _read_legacy_frame(self, group, where=None): + def _read_legacy_frame(self, group, where=None, **kwargs): index = self._read_index_legacy(group, 'index') columns = self._read_index_legacy(group, 'columns') values = _read_array(group, 'values') @@ -1253,11 +1263,13 @@ class Table(object): values_axes : a list of the columns which comprise the data of this table data_columns : a list of columns that we are allowing indexing (these become single columns in values_axes) nan_rep : the string to use for nan representations for string objects + levels : the names of levels """ table_type = None obj_type = None ndim = None + levels = 1 def __init__(self, parent, group, **kwargs): self.parent = parent @@ -1384,6 +1396,7 @@ def set_attrs(self): self.attrs.non_index_axes = self.non_index_axes self.attrs.data_columns = self.data_columns self.attrs.nan_rep = self.nan_rep + self.attrs.levels = self.levels def validate_version(self, where = None): """ are we trying to operate on an old version? 
""" @@ -1472,7 +1485,7 @@ def create_index(self, columns = None, optlevel = None, kind = None): if not v.is_indexed: v.createIndex(**kw) - def read_axes(self, where): + def read_axes(self, where, **kwargs): """ create and return the axes sniffed from the table: return boolean for success """ # validate the version @@ -1482,7 +1495,7 @@ def read_axes(self, where): if not self.infer_axes(): return False # create the selection - self.selection = Selection(self, where) + self.selection = Selection(self, where = where, **kwargs) values = self.selection.select() # convert the data @@ -1502,6 +1515,7 @@ def infer_axes(self): self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] self.data_columns = getattr(self.attrs,'data_columns',None) or [] self.nan_rep = getattr(self.attrs,'nan_rep',None) + self.levels = getattr(self.attrs,'levels',None) or [] self.index_axes = [ a.infer(self.table) for a in self.indexables if a.is_an_indexable ] self.values_axes = [ a.infer(self.table) for a in self.indexables if not a.is_an_indexable ] return True @@ -1659,10 +1673,11 @@ def reindex(obj, axis, filt, ordered): return obj - def create_description(self, compression = None, complevel = None): + def create_description(self, compression = None, complevel = None, expectedrows = None): """ create the description of the table from the axes & values """ - d = { 'name' : 'table' } + d = dict( name = 'table', + expectedrows = expectedrows ) # description from the axes & values d['description'] = dict([ (a.cname,a.typ) for a in self.axes ]) @@ -1728,11 +1743,11 @@ class LegacyTable(Table): def write(self, **kwargs): raise Exception("write operations are not allowed on legacy tables!") - def read(self, where=None): + def read(self, where=None, **kwargs): """ we have n indexable columns, with an arbitrary number of data axes """ - if not self.read_axes(where): return None + if not self.read_axes(where=where, **kwargs): return None factors = [ Categorical.from_array(a.values) for a in self.index_axes ] levels = [ f.levels for f in factors ] @@ -1828,7 +1843,8 @@ class AppendableTable(LegacyTable): table_type = 'appendable' def write(self, axes, obj, append=False, compression=None, - complevel=None, min_itemsize = None, chunksize = 50000, **kwargs): + complevel=None, min_itemsize = None, chunksize = 50000, + expectedrows = None, **kwargs): # create the table if it doesn't exist (or get it if it does) if not append: @@ -1841,7 +1857,7 @@ def write(self, axes, obj, append=False, compression=None, if 'table' not in self.group: # create the table - options = self.create_description(compression = compression, complevel = complevel) + options = self.create_description(compression = compression, complevel = complevel, expectedrows = expectedrows) # set the table attributes self.set_attrs() @@ -1911,7 +1927,7 @@ def write_data_chunk(self, indexes, mask, search, values): import pdb; pdb.set_trace() raise Exception("tables cannot write this data -> %s" % str(detail)) - def delete(self, where = None): + def delete(self, where = None, **kwargs): # delete all rows (and return the nrows) if where is None or not len(where): @@ -1924,7 +1940,7 @@ def delete(self, where = None): # create the selection table = self.table - self.selection = Selection(self, where) + self.selection = Selection(self, where, **kwargs) values = self.selection.select_coords() # delete the rows in reverse order @@ -1977,9 +1993,9 @@ def get_object(self, obj): obj = obj.T return obj - def read(self, where=None): + def read(self, where=None, **kwargs): 
- if not self.read_axes(where): return None + if not self.read_axes(where=where, **kwargs): return None index = self.index_axes[0].values frames = [] @@ -2014,6 +2030,30 @@ def read(self, where=None): return df +class AppendableMultiFrameTable(AppendableFrameTable): + """ a frame with a multi-index """ + table_type = 'appendable_multiframe' + obj_type = DataFrame + ndim = 2 + + @property + def table_type_short(self): + return 'appendable_multi' + + def write(self, obj, columns = None, **kwargs): + if columns is None: + columns = [] + for n in obj.index.names: + if n not in columns: + columns.insert(0,n) + self.levels = obj.index.names + return super(AppendableMultiFrameTable, self).write(obj = obj.reset_index(), columns = columns, **kwargs) + + def read(self, where=None, **kwargs): + df = super(AppendableMultiFrameTable, self).read(where = where, **kwargs) + df.set_index(self.levels, inplace=True) + return df + class AppendablePanelTable(AppendableTable): """ suppor the new appendable table formats """ table_type = 'appendable_panel' @@ -2038,7 +2078,8 @@ class AppendableNDimTable(AppendablePanelTable): # table maps _TABLE_MAP = { - 'appendable_frame' : AppendableFrameTable, + 'appendable_frame' : AppendableFrameTable, + 'appendable_multiframe' : AppendableMultiFrameTable, 'appendable_panel' : AppendablePanelTable, 'appendable_ndim' : AppendableNDimTable, 'worm' : WORMTable, @@ -2410,11 +2451,14 @@ class Selection(object): ---------- table : a Table object where : list of Terms (or convertable to) + start, stop: indicies to start and/or stop selection """ - def __init__(self, table, where=None): + def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.table = table self.where = where + self.start = start + self.stop = stop self.condition = None self.filter = None self.terms = self.generate(where) @@ -2448,15 +2492,15 @@ def select(self): generate the selection """ if self.condition is not None: - return self.table.table.readWhere(self.condition) + return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop) else: - return self.table.table.read() + return self.table.table.read(start=self.start,stop=self.stop) def select_coords(self): """ generate the selection """ - return self.table.table.getWhereList(self.condition, sort = True) + return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort = True) def _get_index_factory(klass): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index eb8b1cae3bf32..51c5680c98c07 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -633,6 +633,30 @@ def test_append_diff_item_order(self): self.assertRaises(Exception, self.store.put, 'panel', wp2, append=True) + def test_append_hierarchical(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.store.append('mi',df) + result = self.store.select('mi') + tm.assert_frame_equal(result, df) + + def test_append_misc(self): + + df = tm.makeDataFrame() + self.store.append('df',df,chunksize=1) + result = self.store.select('df') + tm.assert_frame_equal(result, df) + + self.store.append('df1',df,expectedrows=10) + result = self.store.select('df1') + tm.assert_frame_equal(result, df) + def test_table_index_incompatible_dtypes(self): df1 = DataFrame({'a': 
[1, 2, 3]}) df2 = DataFrame({'a': [4, 5, 6]}, @@ -1291,6 +1315,15 @@ def test_frame_select(self): #self.assertRaises(Exception, self.store.select, # 'frame', [crit1, crit2]) + def test_start_stop(self): + + df = DataFrame(dict(A = np.random.rand(20), B = np.random.rand(20))) + self.store.append('df', df) + + result = self.store.select('df', [ Term("columns", "=", ["A"]) ], start=0, stop=5) + expected = df.ix[0:4,['A']] + tm.assert_frame_equal(result, expected) + def test_select_filter_corner(self): df = DataFrame(np.random.randn(50, 100)) df.index = ['%.3d' % c for c in df.index] @@ -1453,13 +1486,13 @@ def test_store_datetime_mixed(self): df['d'] = ts.index[:3] self._check_roundtrip(df, tm.assert_frame_equal) - def test_cant_write_multiindex_table(self): - # for now, #1848 - df = DataFrame(np.random.randn(10, 4), - index=[np.arange(5).repeat(2), - np.tile(np.arange(2), 5)]) + #def test_cant_write_multiindex_table(self): + # # for now, #1848 + # df = DataFrame(np.random.randn(10, 4), + # index=[np.arange(5).repeat(2), + # np.tile(np.arange(2), 5)]) - self.assertRaises(Exception, self.store.put, 'foo', df, table=True) + # self.assertRaises(Exception, self.store.put, 'foo', df, table=True) def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) From 0180e798e2c9819207f5b1bff0cfee53b357d401 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 20 Dec 2012 08:50:15 -0500 Subject: [PATCH 09/35] DOC: doc updates for multi-index & start/stop --- doc/source/io.rst | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index 1a056f6ae8682..0366000248eda 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1123,6 +1123,28 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set # we have provided a minimum string column size store.root.df_mixed.table +Storing Multi-Index DataFrames +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Storing multi-index dataframes is very similar to storing/selecting from homogenous index DataFrames. + +.. ipython:: python + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + df + + store.append('mi',df) + store.select('mi') + + # the levels are automatically included as data columns + store.select('mi', Term('foo=bar')) + Querying a Table ~~~~~~~~~~~~~~~~ @@ -1153,6 +1175,17 @@ Queries are built up using a list of ``Terms`` (currently only **anding** of ter store store.select('wp',[ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ]) +Start and Stop parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. + +.. ipython:: python + + # this is effectively what the storage of a Panel looks like + wp.to_frame() + + # limiting the search + store.select('wp',[ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ], start=0, stop=10) + + Indexing ~~~~~~~~ You can create/modify an index for a table with ``create_table_index`` after data is already in the table (after and ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the indexed dimension as the ``where``. **Indexes are automagically created** on the indexables and any data columns you specify. 
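Whether a given column actually carries an index can be checked through the underlying ``PyTables`` object (an illustrative sketch, reusing the ``wp`` table stored above):

.. code-block:: python

   # the indexables of an appended Panel are major_axis and minor_axis;
   # after an indexed append each underlying column reports is_indexed
   store.root.wp.table.cols.major_axis.is_indexed
   store.root.wp.table.cols.minor_axis.is_indexed
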
This behavior can be turned off by passing ``index=False`` to ``append``. From c3e580e683b8f6ea218df355094c0683fecc30bb Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 20 Dec 2012 10:10:50 -0500 Subject: [PATCH 10/35] DOC: added whatsnew 0.10.1 --- RELEASE.rst | 25 ++++++++++++ doc/source/v0.10.1.txt | 88 +++++++++++++++++++++++++++++++++++++++++ doc/source/whatsnew.rst | 2 + 3 files changed, 115 insertions(+) create mode 100644 doc/source/v0.10.1.txt diff --git a/RELEASE.rst b/RELEASE.rst index 63accf42c470d..299fa0b2e81d8 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -22,6 +22,31 @@ Where to get it * Binary installers on PyPI: http://pypi.python.org/pypi/pandas * Documentation: http://pandas.pydata.org +pandas 0.10.1 +============= + +**Release date:** 2013-??-?? + +**New features** + +**Improvements to existing features** + + - ``HDFStore`` + - enables storing of multi-index dataframes + - support data column indexing and selection + - support write chunking to reduce memory footprint, via ``chunksize`` keywork to append + - support automagic indexing via ``index`` keyworkd to append + - support ``expectedrows`` keyword in append to inform ``PyTables`` about the expected tablesize + - support ``start`` and ``stop`` keywords in select to limit the row selection space + +**Bug fixes** + + - ``HDFStore`` + - correctly handle ``nan`` elements in string columns; serialize via the ``nan_rep`` keyword to append + - raise correctly on non-implemented column types (unicode/datetime64/date) + - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index is ``Int64``) + + pandas 0.10.0 ============= diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt new file mode 100644 index 0000000000000..6e6b2f236cd61 --- /dev/null +++ b/doc/source/v0.10.1.txt @@ -0,0 +1,88 @@ +.. _whatsnew_0101: + +v0.10.1 (January ??, 2013) +--------------------------- + +This is a minor release from 0.10.0 and includes many new features and +enhancements along with a large number of bug fixes. There are also a number of +important API changes that long-time pandas users should pay close attention +to. + +API changes +~~~~~~~~~~~ + +New features +~~~~~~~~~~~~ + +HDFStore +~~~~~~~~ + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('store.h5') + +You can designate (and index) certain columns that you want to be able to perform queries on a table. + +.. ipython:: python + + store = HDFStore('store.h5') + df = DataFrame(randn(8, 3), index=date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C']) + df['string'] = 'foo' + df.ix[4:6,'string'] = np.nan + df.ix[7:9,'string'] = 'bar' + df['string2'] = 'cool' + df + + # on-disk operations + store.append('df', df, columns = ['B','C','string','string2']) + store.select('df',[ Term('B>0') ]) + + # getting creative + store.select('df',[ Term('B>0'), Term('C>0'), Term('string=foo') ]) + + # this is in-memory version of this type of selection + df[(df.B > 0) & (df.C > 0) & (df.string == 'foo')] + + +``HDFStore`` now serializes multi-index dataframes. + +.. ipython:: python + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + df + + store.append('mi',df) + store.select('mi') + + # the levels are automatically included as data columns + store.select('mi', Term('foo=bar')) + +.. 
ipython:: python
+   :suppress:
+
+   store.close()
+   import os
+   os.remove('store.h5')
+
+**Enhancements**
+
+- You can pass ``nan_rep = 'my_nan_rep'`` to append, to change the default nan representation on disk (which converts to/from `np.nan`); this defaults to `nan`.
+- You can pass ``index`` to ``append``. This defaults to ``True``. This will automagically create indices on the *indexables* and *data columns* of the table.
+- You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will significantly lower your memory usage on writing.
+- You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of rows that ``PyTables`` will expect. This will optimize read/write performance.
+- ``Select`` now supports passing ``start`` and ``stop`` to provide selection space liminting in selection.
+
+
+See the `full release notes
+`__ or issue tracker
+on GitHub for a complete list.
+
diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst
index 82ed64680f1eb..6c125c45a2599 100644
--- a/doc/source/whatsnew.rst
+++ b/doc/source/whatsnew.rst
@@ -16,6 +16,8 @@ What's New
 These are new features and improvements of note in each release.
+.. include:: v0.10.1.txt
+
 .. include:: v0.10.0.txt
 .. include:: v0.9.1.txt
From 3d75a3ef1a2a8b9a3d8242d815e94908da2a8b9c Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 20 Dec 2012 10:12:26 -0500
Subject: [PATCH 11/35] DOC: minor RELEASE.rst addition
---
 RELEASE.rst | 1 +
 1 file changed, 1 insertion(+)
diff --git a/RELEASE.rst b/RELEASE.rst
index 299fa0b2e81d8..6a9fd92a4c531 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -38,6 +38,7 @@ pandas 0.10.1
     - support automagic indexing via ``index`` keyworkd to append
     - support ``expectedrows`` keyword in append to inform ``PyTables`` about the expected tablesize
     - support ``start`` and ``stop`` keywords in select to limit the row selection space
+    - added ``get_store`` context manager to automatically import with pandas
 **Bug fixes**
From 88a06e2d4a04719bddc8a84b96444789b31dbe3e Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 20 Dec 2012 10:25:49 -0500
Subject: [PATCH 12/35] DOC: docstring updates
---
 RELEASE.rst | 2 +-
 doc/source/io.rst | 7 +++++++
 doc/source/v0.10.1.txt | 2 +-
 pandas/io/pytables.py | 22 +++++++++++++++-------
 4 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/RELEASE.rst b/RELEASE.rst
index 6a9fd92a4c531..d3c70e73650ce 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -33,7 +33,7 @@ pandas 0.10.1
   - ``HDFStore``
     - enables storing of multi-index dataframes
-    - support data column indexing and selection
+    - support data column indexing and selection, via ``columns`` keyword in append
     - support write chunking to reduce memory footprint, via ``chunksize`` keywork to append
     - support automagic indexing via ``index`` keyworkd to append
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 0366000248eda..ef611780d54e6 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1241,6 +1241,13 @@ Advanced Queries
    concat([ store.select('df_dc',c) for c in [ crit1, crit2 ] ])
+If you want to inspect the table object, retrieve it via ``get_table``. You could use this programmatically to, say, get the number of rows in the table.
+
+.. ipython:: python
+
+   store.get_table('df_dc').nrows
+
+
 Delete from a Table
 ~~~~~~~~~~~~~~~~~~~
 You can delete from a table selectively by specifying a ``where``. 
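For instance (an illustrative sketch, reusing the ``wp`` table from above; for a table, ``remove`` with a ``where`` returns the number of rows deleted):

.. code-block:: python

   # delete everything before a given date on the major_axis
   n = store.remove('wp', Term('major_axis', '<', wp.major_axis[3]))
   n

   # the surviving rows
   store.select('wp')
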
In deleting rows, it is important to understand that ``PyTables`` deletes rows by erasing the rows, then **moving** the following data. Thus deleting can potentially be a very expensive operation depending on the orientation of your data. This is especially true in higher dimensional objects (``Panel`` and ``Panel4D``). To get optimal deletion speed, it pays to have the dimension you are deleting be the first of the ``indexables``.
diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt
index 6e6b2f236cd61..b60228323e713 100644
--- a/doc/source/v0.10.1.txt
+++ b/doc/source/v0.10.1.txt
@@ -23,7 +23,7 @@ HDFStore
    os.remove('store.h5')
-You can designate (and index) certain columns that you want to be able to perform queries on a table.
+You can designate (and index) certain columns that you want to be able to perform queries on a table, by passing a list to ``columns``.
 .. ipython:: python
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 10a7227b53734..8147c9d5e0f06 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -344,7 +344,12 @@ def select(self, key, where=None, start=None, stop=None, **kwargs):
         Parameters
         ----------
         key : object
+
+        Optional Parameters
+        -------------------
         where : list of Term (or convertable) objects, optional
+        start : integer (defaults to None), row number to start selection
+        stop : integer (defaults to None), row number to stop selection
         """
         group = self.get_node(key)
@@ -384,9 +389,12 @@ def remove(self, key, where=None, start=None, stop=None):
         ----------
         key : string
             Node to remove or delete rows from
-        where : list
-            For Table node, delete specified rows. See HDFStore.select for more
-            information
+
+        Optional Parameters
+        -------------------
+        where : list of Term (or convertable) objects, optional
+        start : integer (defaults to None), row number to start selection
+        stop : integer (defaults to None), row number to stop selection
         Returns
         -------
@@ -418,14 +426,14 @@ def append(self, key, value, **kwargs):
         Parameters
         ----------
         key : object
-        value : {Series, DataFrame, Panel}
+        value : {Series, DataFrame, Panel, Panel4D}
         Optional Parameters
         -------------------
-        columns : list of columns to create as data columns
+        columns : list of columns to create as data columns
         min_itemsize : dict of columns that specify minimum string sizes
-        nan_rep : string to use as string nan representation
-        chunksize : size to chunk the writing
+        nan_rep : string to use as string nan representation
+        chunksize : size to chunk the writing
         expectedrows : expected TOTAL row size of this table
         Notes
From 91526a381ba262fe804e49ed4922852f7e412ffb Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 20 Dec 2012 10:36:01 -0500
Subject: [PATCH 13/35] DOC: RELEASE notes updates
---
 RELEASE.rst | 6 ++++--
 doc/source/v0.10.1.txt | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/RELEASE.rst b/RELEASE.rst
index d3c70e73650ce..cb3f840fe31cc 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -32,7 +32,7 @@ pandas 0.10.1
 **Improvements to existing features**
   - ``HDFStore``
-    - enables storing of multi-index dataframes
+    - enables storing of multi-index dataframes (closes GH1277_)
     - support data column indexing and selection, via ``columns`` keyword in append
     - support write chunking to reduce memory footprint, via ``chunksize`` keywork to append
     - support automagic indexing via ``index`` keyworkd to append
@@ -45,8 +45,10 @@ pandas 0.10.1
   - ``HDFStore``
     - correctly handle ``nan`` elements in string columns; serialize via the ``nan_rep`` keyword to append 
- raise correctly on non-implemented column types (unicode/datetime64/date)
-    - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index is ``Int64``)
+    - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index is ``Int64``), closes (GH512_)
+
+.. _GH512: https://github.com/pydata/pandas/issues/512
+.. _GH1277: https://github.com/pydata/pandas/issues/1277
 pandas 0.10.0
 =============
diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt
index b60228323e713..5b8db89635589 100644
--- a/doc/source/v0.10.1.txt
+++ b/doc/source/v0.10.1.txt
@@ -79,7 +79,7 @@ You can designate (and index) certain columns that you want to be able to perfor
 - You can pass ``index`` to ``append``. This defaults to ``True``. This will automagically create indices on the *indexables* and *data columns* of the table.
 - You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will significantly lower your memory usage on writing.
 - You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of rows that ``PyTables`` will expect. This will optimize read/write performance.
-- ``Select`` now supports passing ``start`` and ``stop`` to provide selection space liminting in selection.
+- ``Select`` now supports passing ``start`` and ``stop`` to provide selection space limiting in selection.
 See the `full release notes
From 2570a3bfc5633da74258084ad489999e05c5aebe Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 20 Dec 2012 11:12:48 -0500
Subject: [PATCH 14/35] DOC: io.rst example for multi-index frame was propagating, making next
 examples confusing
---
 RELEASE.rst | 2 +-
 doc/source/io.rst | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/RELEASE.rst b/RELEASE.rst
index cb3f840fe31cc..1df9d71153462 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -35,7 +35,7 @@ pandas 0.10.1
     - enables storing of multi-index dataframes (closes GH1277_)
     - support data column indexing and selection, via ``columns`` keyword in append
     - support write chunking to reduce memory footprint, via ``chunksize`` keywork to append
-    - support automagic indexing via ``index`` keyworkd to append
+    - support automagic indexing via ``index`` keywork to append
     - support ``expectedrows`` keyword in append to inform ``PyTables`` about the expected tablesize
     - support ``start`` and ``stop`` keywords in select to limit the row selection space
     - added ``get_store`` context manager to automatically import with pandas
diff --git a/doc/source/io.rst b/doc/source/io.rst
index ef611780d54e6..00e6519d50821 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1135,15 +1135,15 @@ Storing multi-index dataframes is very similar to storing/selecting from homogen
                        labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                        names=['foo', 'bar'])
-   df = DataFrame(np.random.randn(10, 3), index=index,
+   df_mi = DataFrame(np.random.randn(10, 3), index=index,
                   columns=['A', 'B', 'C'])
-   df
+   df_mi
-   store.append('mi',df)
-   store.select('mi')
+   store.append('df_mi',df_mi)
+   store.select('df_mi')
    # the levels are automatically included as data columns
-   store.select('mi', Term('foo=bar'))
+   store.select('df_mi', Term('foo=bar'))
 Querying a Table
From a780c4c8ba8c42677b050d04ad0ce15ff51cffa7 Mon Sep 17 00:00:00 2001
From: jreback
Date: Fri, 21 Dec 2012 09:03:33 -0500
Subject: [PATCH 15/35] BUG: reworked versioning to only act on specific version
---
 RELEASE.rst | 2 +-
 doc/source/io.rst | 17 
++++++------- pandas/io/pytables.py | 40 +++++++++++++++++++++---------- pandas/io/tests/legacy_table.h5 | Bin 211111 -> 211391 bytes pandas/io/tests/test_pytables.py | 23 ++++++++++++------ 5 files changed, 53 insertions(+), 29 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 1df9d71153462..a182f5a7fae2b 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -45,7 +45,7 @@ pandas 0.10.1 - ``HDFStore`` - correctly handle ``nan`` elements in string columns; serialize via the ``nan_rep`` keyword to append - raise correctly on non-implemented column types (unicode/datetime64/date) - - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index is ``Int64``), closes (GH512_) + - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index is ``Int64``), (closes GH512_) .. _GH512: https://github.com/pydata/pandas/issues/512 .. _GH1277: https://github.com/pydata/pandas/issues/1277 diff --git a/doc/source/io.rst b/doc/source/io.rst index 00e6519d50821..bc710f9337d9a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1208,21 +1208,22 @@ You can designate (and index) certain columns that you want to be able to perfor .. ipython:: python - df['string'] = 'foo' - df.ix[4:6,'string'] = np.nan - df.ix[7:9,'string'] = 'bar' - df['string2'] = 'cool' - df + df_dc = df.copy() + df_dc['string'] = 'foo' + df_dc.ix[4:6,'string'] = np.nan + df_dc.ix[7:9,'string'] = 'bar' + df_dc['string2'] = 'cool' + df_dc # on-disk operations - store.append('df_dc', df, columns = ['B','C','string','string2']) + store.append('df_dc', df_dc, columns = ['B','C','string','string2']) store.select('df_dc',[ Term('B>0') ]) # getting creative store.select('df_dc',[ Term('B>0'), Term('C>0'), Term('string=foo') ]) # this is in-memory version of this type of selection - df[(df.B > 0) & (df.C > 0) & (df.string == 'foo')] + df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] # we have automagically created this index and that the B/string columns are stored separately as ``PyTables`` columns store.root.df_dc.table @@ -1232,7 +1233,7 @@ There is some performance degredation by making lots of columns into `data colum Advanced Queries ~~~~~~~~~~~~~~~~ -``not`` and ``or`` conditions are unsupported at this time; however, ``or`` operations are easy to replicate. Repately apply the criteria to the table and concat. +``not`` and ``or`` conditions are unsupported at this time; however, ``or`` operations are easy to replicate, by repeately applying the criteria to the table, and then ``concat`` the results. .. 
ipython:: python diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8147c9d5e0f06..38eb223108df6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -36,7 +36,7 @@ from contextlib import contextmanager # versioning attribute -_version = '0.11' +_version = '0.10.1' class IncompatibilityWarning(Warning): pass @@ -529,9 +529,9 @@ def _write_to_group(self, key, value, table=False, append=False, handler = self._get_handler(op='write', kind=kind) wrapper = lambda value: handler(group, value) - wrapper(value) group._v_attrs.pandas_type = kind group._v_attrs.pandas_version = _version + wrapper(value) def _write_series(self, group, series): self._write_index(group, 'index', series.index) @@ -1080,18 +1080,23 @@ class DataCol(IndexCol): is_searchable = False @classmethod - def create_for_block(cls, i = None, name = None, cname = None, **kwargs): + def create_for_block(cls, i = None, name = None, cname = None, version = None, **kwargs): """ return a new datacol with the block i """ - # a little hacky here, to avoid a backwards compability issue - # columns in the table are named like: values_block_0...., but there name is values_0 (for kind attributes) if cname is None: cname = name or 'values_block_%d' % i if name is None: name = cname - m = re.search("values_block_(\d+)",name) - if m: - name = "values_%s" % m.groups()[0] + + # prior to 0.10.1, we named values blocks like: values_block_0 an the name values_0 + try: + if version[0] == 0 and version[1] <= 10 and version[2] == 0: + m = re.search("values_block_(\d+)",name) + if m: + name = "values_%s" % m.groups()[0] + except: + pass + return cls(name = name, cname = cname, **kwargs) def __init__(self, values = None, kind = None, typ = None, cname = None, data = None, block = None, **kwargs): @@ -1282,7 +1287,16 @@ class Table(object): def __init__(self, parent, group, **kwargs): self.parent = parent self.group = group - self.version = getattr(group._v_attrs,'pandas_version',None) + + # compute our version + version = getattr(group._v_attrs,'pandas_version',None) + try: + self.version = tuple([ int(x) for x in version.split('.') ]) + if len(self.version) == 2: + self.version = tuple(self.version + [0]) + except: + self.version = (0,0,0) + self.index_axes = [] self.non_index_axes = [] self.values_axes = [] @@ -1409,8 +1423,8 @@ def set_attrs(self): def validate_version(self, where = None): """ are we trying to operate on an old version? 
""" if where is not None: - if self.version is None or float(self.version) < 0.1: - warnings.warn("where criteria is being ignored as we this version is too old (or not-defined) [%s]" % self.version, IncompatibilityWarning) + if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: + warnings.warn("where criteria is being ignored as we this version is too old (or not-defined) [%s]" % '.'.join([ str(x) for x in self.version ]), IncompatibilityWarning) @property def indexables(self): @@ -1430,7 +1444,7 @@ def f(i, c): klass = DataCol if c in dc: klass = DataIndexableCol - return klass.create_for_block(i = i, name = c, pos = base_pos + i ) + return klass.create_for_block(i = i, name = c, pos = base_pos + i, version = self.version) self._indexables.extend([ f(i,c) for i, c in enumerate(self.attrs.values_cols) ]) @@ -1646,7 +1660,7 @@ def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None try: existing_col = existing_table.values_axes[i] if existing_table is not None and validate else None - col = klass.create_for_block(i = i, name = name) + col = klass.create_for_block(i = i, name = name, version = self.version) col.set_atom(block = b, existing_col = existing_col, min_itemsize = min_itemsize, diff --git a/pandas/io/tests/legacy_table.h5 b/pandas/io/tests/legacy_table.h5 index 1c90382d9125c039e40b821f41451b989d1c2b36..5f4089efc15c325a3d1149475e00259647a6cce4 100644 GIT binary patch delta 335 zcmZ4fk!SyBo(URE`z<$WJz!?!nm$pTNwiskWxE0kBfpTf0D}Mn2)J7^gXsz|2_h80 zEJiTNz+j+f2t-g0glf+cW89u4#-t|5$gv%0>RBen36m8$4!0|5Gi_JWX0GQAf#~63 zFo2rH&%gnM1&MhniN*0{sYS(^`FRWs!a!LzAV#+fyZCfp3uYM;a!j1AE5>MR3-$)N GP67b=2{D=g delta 76 zcmdo0nP>S&o(URE%Plr)Jz$<*D8?w(tiZBefrXJ@Xu6*|lT$lTXnUa;le{2Ua{DrM ZrX5U^6*!KxD`_)rSJGy#=VfC60|2jZ6vY4l diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 51c5680c98c07..d76800e5a8b82 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -93,16 +93,17 @@ def test_versioning(self): self.store.remove('df1') self.store.append('df1', df[:10]) self.store.append('df1', df[10:]) - self.assert_(self.store.root.a._v_attrs.pandas_version == '0.11') - self.assert_(self.store.root.b._v_attrs.pandas_version == '0.11') - self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.11') + self.assert_(self.store.root.a._v_attrs.pandas_version == '0.10.1') + self.assert_(self.store.root.b._v_attrs.pandas_version == '0.10.1') + self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10.1') # write a file and wipe its versioning self.store.remove('df2') self.store.append('df2', df) + + # this is an error because its table_type is appendable, but no version info self.store.get_node('df2')._v_attrs.pandas_version = None - self.store.select('df2') - self.store.select('df2', [ Term('index','>',df.index[2]) ]) + self.assertRaises(Exception, self.store.select,'df2') def test_meta(self): raise nose.SkipTest('no meta') @@ -1310,6 +1311,11 @@ def test_frame_select(self): self.store.append('df_float', df) self.store.select('df_float', [ Term("index<10.0"), Term("columns", "=", ["A"]) ]) + # invalid terms + df = tm.makeTimeDataFrame() + self.store.append('df_time', df) + self.assertRaises(Exception, self.store.select, 'df_time', [ Term("index>0") ]) + # can't select if not written as table #self.store['frame'] = df #self.assertRaises(Exception, self.store.select, @@ -1401,10 +1407,13 @@ def test_legacy_table_read(self): # force the frame store.select('df2', typ = 'legacy_frame') - # old version 
(this still throws an exception though) + self.assertRaises(Exception, store.select, 'wp1', Term('minor_axis','=','B')) + + # old version warning import warnings warnings.filterwarnings('ignore', category=IncompatibilityWarning) - self.assertRaises(Exception, store.select, 'wp1', Term('minor_axis','=','B')) + df2 = store.select('df2') + store.select('df2', Term('index', '>', df2.index[2])) warnings.filterwarnings('always', category=IncompatibilityWarning) store.close() From 73d755468964a10cb882e34ccbfb3772880813fc Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 21 Dec 2012 09:45:34 -0500 Subject: [PATCH 16/35] BUG: more robust to whitespace in Terms --- doc/source/io.rst | 4 ++-- pandas/io/pytables.py | 2 +- pandas/io/tests/test_pytables.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index bc710f9337d9a..802256174ebb4 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1160,7 +1160,7 @@ Valid terms can be created from ``dict, list, tuple, or string``. Objects can be - ``dict(field = 'index', op = '>', value = '20121114')`` - ``('index', '>', '20121114')`` - - ``'index>20121114'`` + - ``'index > 20121114'`` - ``('index', '>', datetime(2012,11,14))`` - ``('index', ['20121114','20121115'])`` - ``('major_axis', '=', Timestamp('2012/11/14'))`` @@ -1220,7 +1220,7 @@ You can designate (and index) certain columns that you want to be able to perfor store.select('df_dc',[ Term('B>0') ]) # getting creative - store.select('df_dc',[ Term('B>0'), Term('C>0'), Term('string=foo') ]) + store.select('df_dc',[ 'B > 0', 'C > 0', 'string == foo' ]) # this is in-memory version of this type of selection df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 38eb223108df6..5766db4f71ee3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2317,7 +2317,7 @@ class Term(object): """ _ops = ['<=','<','>=','>','!=','='] - _search = re.compile("^(?P\w+)(?P%s)(?P.+)$" % '|'.join(_ops)) + _search = re.compile("^(?P\w+)\s*(?P%s)\s*(?P.+)$" % '|'.join(_ops)) def __init__(self, field, op = None, value = None, queryables = None): self.field = None diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index d76800e5a8b82..4a51676696a57 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -902,9 +902,9 @@ def test_terms(self): ('major_axis', '>', '20121114'), (('major_axis', ['20121114','20121114']),), ('major_axis', datetime.datetime(2012,11,14)), - 'major_axis>20121114', - 'major_axis>20121114', - 'major_axis>20121114', + 'major_axis> 20121114', + 'major_axis >20121114', + 'major_axis > 20121114', (('minor_axis', ['A','B']),), (('minor_axis', ['A','B']),), ((('minor_axis', ['A','B']),),), From dcbc020e6a53633ecd3f1bdd7240c01b5de6b02c Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 21 Dec 2012 10:14:31 -0500 Subject: [PATCH 17/35] BUG: make Term more robust to whitespace and syntax --- pandas/io/pytables.py | 13 ++++++++++--- pandas/io/tests/test_pytables.py | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5766db4f71ee3..02476dfcfef03 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2316,7 +2316,7 @@ class Term(object): """ - _ops = ['<=','<','>=','>','!=','='] + _ops = ['<=','<','>=','>','!=','==','='] _search = re.compile("^(?P\w+)\s*(?P%s)\s*(?P.+)$" % '|'.join(_ops)) def __init__(self, field, 
op = None, value = None, queryables = None): @@ -2376,6 +2376,10 @@ def __init__(self, field, op = None, value = None, queryables = None): if self.field is None or self.op is None or self.value is None: raise Exception("Could not create this term [%s]" % str(self)) + # = vs == + if self.op == '==': + self.op = '=' + # we have valid conditions if self.op in ['>','>=','<','<=']: if hasattr(self.value,'__iter__') and len(self.value) > 1: @@ -2502,9 +2506,12 @@ def generate(self, where): if not isinstance(where, (list,tuple)): where = [ where ] else: - # do we have all list/tuple + + # make this a list of we think that we only have a sigle term & no operands inside any terms if not any([ isinstance(w, (list,tuple,Term)) for w in where ]): - where = [ where ] + + if not any([ isinstance(w,basestring) and Term._search.match(w) for w in where ]): + where = [ where ] queryables = self.table.queryables() return [ Term(c, queryables = queryables) for c in where ] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 4a51676696a57..0dbd9000e48d9 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -476,6 +476,23 @@ def test_append_with_data_columns(self): expected = df_new[(df_new.string == 'foo') & (df_new.string2 == 'bar') & (df_new.A > 0) & (df_new.B < 0)] tm.assert_frame_equal(result, expected) + # doc example + df_dc = df.copy() + df_dc['string'] = 'foo' + df_dc.ix[4:6,'string'] = np.nan + df_dc.ix[7:9,'string'] = 'bar' + df_dc['string2'] = 'cool' + df_dc + self.store.remove('df_dc') + self.store.append('df_dc', df_dc, columns = ['B','C','string','string2']) + result = self.store.select('df_dc',[ Term('B>0') ]) + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected) + + result = self.store.select('df_dc',[ 'B > 0', 'C > 0', 'string == foo' ]) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] + tm.assert_frame_equal(result, expected) + def test_create_table_index(self): # index=False From 81aaa7c62b99606cdc29159b375eabbdf903d3aa Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 21 Dec 2012 15:41:10 -0500 Subject: [PATCH 18/35] BUG: versioning issue bug! 
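The underlying problem: when a stored two-part version string such as '0.10' is parsed, it must be padded to a three-part tuple, but ``tuple(self.version + [0])`` concatenates a tuple with a list, which always raises and silently fell back to (0, 0, 0). A sketch of the intended behavior (illustrative only, not part of the patch):

    # '0.10'   -> (0, 10, 0)
    # '0.10.1' -> (0, 10, 1)
    version = tuple(int(x) for x in '0.10'.split('.'))
    if len(version) == 2:
        version = version + (0,)   # tuple + tuple, unlike the tuple + list it replaces
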
--- pandas/io/pytables.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 02476dfcfef03..eb5e248382c3f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -487,7 +487,9 @@ def get_table(self, key): raise KeyError('No object named %s in the file' % key) if not _is_table_type(group): raise Exception("cannot return a table object for a non-table") - return create_table(self, group) + t = create_table(self, group) + t.infer_axes() + return t ###### private methods ###### @@ -1293,7 +1295,7 @@ def __init__(self, parent, group, **kwargs): try: self.version = tuple([ int(x) for x in version.split('.') ]) if len(self.version) == 2: - self.version = tuple(self.version + [0]) + self.version = self.version + (0,) except: self.version = (0,0,0) From 04a1aa9404dc5274bc4187004c7e249769cbcb71 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 22 Dec 2012 09:14:05 -0500 Subject: [PATCH 19/35] ENH: added column filtering via keyword 'columns' passed to select --- RELEASE.rst | 1 + doc/source/io.rst | 6 ++++ doc/source/v0.10.1.txt | 6 ++++ pandas/io/pytables.py | 54 +++++++++++++++++--------------- pandas/io/tests/test_pytables.py | 27 ++++++++++++++++ 5 files changed, 69 insertions(+), 25 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index a182f5a7fae2b..f07a7b67bb153 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -39,6 +39,7 @@ pandas 0.10.1 - support ``expectedrows`` keyword in append to inform ``PyTables`` about the expected tablesize - support ``start`` and ``stop`` keywords in select to limit the row selection space - added ``get_store`` context manager to automatically import with pandas + - added column filtering via ``columns`` keyword in select **Bug fixes** diff --git a/doc/source/io.rst b/doc/source/io.rst index 802256174ebb4..7d8cbfa575d44 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1175,6 +1175,12 @@ Queries are built up using a list of ``Terms`` (currently only **anding** of ter store store.select('wp',[ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ]) +The ``columns`` keyword can be supplied to select to filter a list of the return columns, this is equivalent to passing a ``Term('columns',list_of_columns_to_filter)`` + +.. ipython:: python + + store.select('df', columns = ['A','B']) + Start and Stop parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. .. ipython:: python diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index 5b8db89635589..e43cd5a21f2cf 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -47,6 +47,12 @@ You can designate (and index) certain columns that you want to be able to perfor df[(df.B > 0) & (df.C > 0) & (df.string == 'foo')] +You can pass ``columns`` keyword to select to filter a list of the return columns, this is equivalent to passing a ``Term('columns',list_of_columns_to_filter)`` + +.. ipython:: python + + store.select('df',columns = ['A','B']) + ``HDFStore`` now serializes multi-index dataframes. .. 
ipython:: python diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index eb5e248382c3f..5b31d46e22b10 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -336,7 +336,7 @@ def get(self, key): raise KeyError('No object named %s in the file' % key) return self._read_group(group) - def select(self, key, where=None, start=None, stop=None, **kwargs): + def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -350,12 +350,13 @@ def select(self, key, where=None, start=None, stop=None, **kwargs): where : list of Term (or convertable) objects, optional start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection + columns : a list of columns that if not None, will limit the return columns """ group = self.get_node(key) if group is None: raise KeyError('No object named %s in the file' % key) - return self._read_group(group, where=where, start=start, stop=stop, **kwargs) + return self._read_group(group, where=where, start=start, stop=stop, columns=columns, **kwargs) def put(self, key, value, table=False, append=False, compression=None, **kwargs): @@ -1317,11 +1318,12 @@ def pandas_type(self): def __repr__(self): """ return a pretty representatgion of myself """ self.infer_axes() - return "%s (typ->%s,nrows->%s,indexers->[%s],data->[%s])" % (self.pandas_type, + dc = ",dc->%s" % ','.join(self.data_columns) if len(self.data_columns) else '' + return "%s (typ->%s,nrows->%s,indexers->[%s]%s)" % (self.pandas_type, self.table_type_short, self.nrows, ','.join([ a.name for a in self.index_axes ]), - ','.join(self.data_columns)) + dc) __str__ = __repr__ @@ -1681,9 +1683,15 @@ def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None if validate: self.validate(existing_table) - def process_axes(self, obj): + def process_axes(self, obj, columns=None): """ process axes filters """ + # reorder by any non_index_axes & limit to the select columns + for axis,labels in self.non_index_axes: + if columns is not None: + labels = Index(labels) & Index(columns) + obj = obj.reindex_axis(labels,axis=axis,copy=False) + def reindex(obj, axis, filt, ordered): axis_name = obj._get_axis_name(axis) ordd = ordered & filt @@ -1767,7 +1775,7 @@ class LegacyTable(Table): def write(self, **kwargs): raise Exception("write operations are not allowed on legacy tables!") - def read(self, where=None, **kwargs): + def read(self, where=None, columns=None, **kwargs): """ we have n indexable columns, with an arbitrary number of data axes """ @@ -1840,12 +1848,8 @@ def read(self, where=None, **kwargs): else: wp = concat(objs, axis = 0, verify_integrity = True) - # reorder by any non_index_axes - for axis,labels in self.non_index_axes: - wp = wp.reindex_axis(labels,axis=axis,copy=False) - # apply the selection filters & axis orderings - wp = self.process_axes(wp) + wp = self.process_axes(wp, columns=columns) return wp @@ -2017,40 +2021,40 @@ def get_object(self, obj): obj = obj.T return obj - def read(self, where=None, **kwargs): + def read(self, where=None, columns=None, **kwargs): if not self.read_axes(where=where, **kwargs): return None index = self.index_axes[0].values frames = [] for a in self.values_axes: - columns = Index(a.values) + cols = Index(a.values) if self.is_transposed: values = a.cvalues - index_ = columns - columns_ = Index(index) + index_ = cols + cols_ = Index(index) else: values = a.cvalues.T index_ = 
Index(index) - columns_ = columns + cols_ = cols # if we have a DataIndexableCol, its shape will only be 1 dim if values.ndim == 1: values = values.reshape(1,values.shape[0]) - block = make_block(values, columns_, columns_) - mgr = BlockManager([ block ], [ columns_, index_ ]) + block = make_block(values, cols_, cols_) + mgr = BlockManager([ block ], [ cols_, index_ ]) frames.append(DataFrame(mgr)) - df = concat(frames, axis = 1, verify_integrity = True) - # sort the indicies & reorder the columns - for axis,labels in self.non_index_axes: - df = df.reindex_axis(labels,axis=axis,copy=False) + if len(frames) == 1: + df = frames[0] + else: + df = concat(frames, axis = 1, verify_integrity = True) # apply the selection filters & axis orderings - df = self.process_axes(df) + df = self.process_axes(df, columns=columns) return df @@ -2073,8 +2077,8 @@ def write(self, obj, columns = None, **kwargs): self.levels = obj.index.names return super(AppendableMultiFrameTable, self).write(obj = obj.reset_index(), columns = columns, **kwargs) - def read(self, where=None, **kwargs): - df = super(AppendableMultiFrameTable, self).read(where = where, **kwargs) + def read(self, *args, **kwargs): + df = super(AppendableMultiFrameTable, self).read(*args, **kwargs) df.set_index(self.levels, inplace=True) return df diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 0dbd9000e48d9..fbcbe7e1766b6 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1284,6 +1284,33 @@ def test_select(self): #self.assertRaises(Exception, self.store.select, # 'wp2', ('column', ['A', 'D'])) + # select with columns= + df = tm.makeTimeDataFrame() + self.store.remove('df') + self.store.append('df',df) + result = self.store.select('df', columns = ['A','B']) + expected = df.reindex(columns = ['A','B']) + tm.assert_frame_equal(expected, result) + + # equivalentsly + result = self.store.select('df', [ ('columns', ['A','B']) ]) + expected = df.reindex(columns = ['A','B']) + tm.assert_frame_equal(expected, result) + + # with a data column + self.store.remove('df') + self.store.append('df',df, columns = ['A']) + result = self.store.select('df', [ 'A > 0' ], columns = ['A','B']) + expected = df[df.A > 0].reindex(columns = ['A','B']) + tm.assert_frame_equal(expected, result) + + # with a data column, but different columns + self.store.remove('df') + self.store.append('df',df, columns = ['A']) + result = self.store.select('df', [ 'A > 0' ], columns = ['C','D']) + expected = df[df.A > 0].reindex(columns = ['C','D']) + tm.assert_frame_equal(expected, result) + def test_panel_select(self): wp = tm.makePanel() self.store.put('wp', wp, table=True) From 1c32ebffca4a883720956bb852b43ce7a0868326 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 22 Dec 2012 10:38:57 -0500 Subject: [PATCH 20/35] ENH: allow multiple table selection. retrieve multiple tables based on the results from a selector table. 
--- RELEASE.rst | 1 + doc/source/io.rst | 18 +++ doc/source/v0.10.1.txt | 39 +++++++--- pandas/io/pytables.py | 122 +++++++++++++++++++++++++++---- pandas/io/tests/test_pytables.py | 81 ++++++++++++++++++++ 5 files changed, 235 insertions(+), 26 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index f07a7b67bb153..6e2a6a70e0721 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -40,6 +40,7 @@ pandas 0.10.1 - support ``start`` and ``stop`` keywords in select to limit the row selection space - added ``get_store`` context manager to automatically import with pandas - added column filtering via ``columns`` keyword in select + - added methods select_multiple/select_as_coordinates to do multiple-table selection **Bug fixes** diff --git a/doc/source/io.rst b/doc/source/io.rst index 7d8cbfa575d44..c66801af38e39 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1254,6 +1254,24 @@ If you want to inspect the table object, retrieve via ``get_table``. You could u store.get_table('df_dc').nrows +Multiple Table Queries +~~~~~~~~~~~~~~~~~~~~~~ + +New in 0.10.1 is the method ``select_multiple``, that can perform selections from multiple tables and return a combined result, by using ``where`` on a selector table. The purpose is to allow fast selection from really wide tables. Construct 2 (or more) tables, where your indexing criteria is contained in a relatively small table. Then put your data in another table. Queries will be quite fast, yet you can allow your tables to grow (in column space). **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means, append to the tables in the same order. You can pass the ``axis`` parameter to control concatenation. Default is on the ``columns`` axis. + +.. ipython:: python + + index = date_range('1/1/2000', periods=8) + df1_mt = DataFrame(randn(8, 3), index=index, columns=['A', 'B', 'C']) + df2_mt = DataFrame(randn(8, 3), index=index, columns=['D', 'E', 'F']) + df2_mt['foo'] = 'bar' + + # you can use data columns as well + store.append('df1_mt',df1_mt, columns = ['A','B']) + store.append('df2_mt',df2_mt) + + store.select_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], axis = 1, selector = 'df1_mt') + Delete from a Table ~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index e43cd5a21f2cf..f57844e0dc8db 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -57,21 +57,36 @@ You can pass ``columns`` keyword to select to filter a list of the return column .. ipython:: python - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo', 'bar']) - df = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) - df + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + df + + store.append('mi',df) + store.select('mi') + + # the levels are automatically included as data columns + store.select('mi', Term('foo=bar')) + +Multi-table Selection via ``select_multiple`` can perform selections from multiple tables and return a combined result, by using ``where`` on a selector table. + +.. 
ipython:: python - store.append('mi',df) - store.select('mi') + index = date_range('1/1/2000', periods=8) + df1_mt = DataFrame(randn(8, 3), index=index, columns=['A', 'B', 'C']) + df2_mt = DataFrame(randn(8, 3), index=index, columns=['D', 'E', 'F']) + df2_mt['foo'] = 'bar' - # the levels are automatically included as data columns - store.select('mi', Term('foo=bar')) + # you can use data columns as well + store.append('df1_mt',df1_mt, columns = ['A','B']) + store.append('df2_mt',df2_mt) + store.select_multiple(['df1_mt','df2_mt'], where = [ 'A>0' ], axis = 1, selector = 'df1_mt') + .. ipython:: python :suppress: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5b31d46e22b10..894518992a57a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -358,6 +358,65 @@ def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs) raise KeyError('No object named %s in the file' % key) return self._read_group(group, where=where, start=start, stop=stop, columns=columns, **kwargs) + def select_as_coordinates(self, key, where=None, **kwargs): + """ + return the selection as a Coordinates object. Note that the start/stop/columns parameters are inapplicable here. + + Parameters + ---------- + key : object + + Optional Parameters + ------------------- + where : list of Term (or convertible) objects, optional + """ + return self.get_table(key).read_coordinates(where = where, **kwargs) + + def select_multiple(self, keys, where=None, selector=None, columns=None, axis=1, **kwargs): + """ Retrieve pandas objects from multiple tables + + Parameters + ---------- + keys : a list of the tables + selector : the table to apply the where criteria (defaults to keys[0] if not supplied) + columns : the columns I want back + axis : the concatenation axis (defaults to 1) + + Exceptions + ---------- + raise if any of the keys don't refer to tables or if they are not ALL THE SAME DIMENSIONS + """ + + if not isinstance(keys, (list,tuple)): + raise Exception("keys must be a list/tuple") + + if len(keys) == 0: + raise Exception("keys must have a non-zero length") + + if len(keys) == 1: + return self.select(key = keys[0], where=where, columns = columns, **kwargs) + + if selector is None: + selector = keys[0] + + # collect the tables + tbls = [ self.get_table(k) for k in keys ] + + # validate rows + nrows = tbls[0].nrows + for t in tbls: + if t.nrows != nrows: + raise Exception("all tables must have exactly the same nrows!") + + # select coordinates from the selector table + c = self.select_as_coordinates(selector, where) + + # collect the returned objects + objs = [ t.read(where = c, columns = columns) for t in tbls ] + + # concat and return + return concat(objs, axis = axis, verify_integrity = True) + def put(self, key, value, table=False, append=False, compression=None, **kwargs): """ @@ -1318,7 +1377,7 @@ def pandas_type(self): def __repr__(self): """ return a pretty representation of myself """ self.infer_axes() - dc = ",dc->%s" % ','.join(self.data_columns) if len(self.data_columns) else '' + dc = ",dc->[%s]" % ','.join(self.data_columns) if len(self.data_columns) else '' return "%s (typ->%s,nrows->%s,indexers->[%s]%s)" % (self.pandas_type, self.table_type_short, self.nrows, @@ -1730,6 +1789,18 @@ def create_description(self, compression = None, complevel = None, expectedrows def read(self, **kwargs): raise NotImplementedError("cannot read on an abstract table: subclasses should implement") + def read_coordinates(self, where=None, **kwargs): + + # validate the version + 
self.validate_version(where) + + # infer the data kind + if not self.infer_axes(): return False + + # create the selection + self.selection = Selection(self, where = where, **kwargs) + return Coordinates(self.selection.select_coords(), group = self.group, where = where) + def write(self, **kwargs): raise NotImplementedError("cannot write on an abstract table") @@ -2475,6 +2546,19 @@ def convert_value(self, v): # string quoting return ["'" + v + "'", v] +class Coordinates(object): + """ holds a returned coordinates list, useful to select the same rows from different tables + + coordinates : holds the array of coordinates + group : the source group + where : the source where + """ + + def __init__(self, values, group, where, **kwargs): + self.values = values + self.group = group + self.where = where + class Selection(object): """ Carries out a selection operation on a tables.Table object. @@ -2493,17 +2577,23 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.stop = stop self.condition = None self.filter = None - self.terms = self.generate(where) - - # create the numexpr & the filter - if self.terms: - conds = [ t.condition for t in self.terms if t.condition is not None ] - if len(conds): - self.condition = "(%s)" % ' & '.join(conds) - self.filter = [] - for t in self.terms: - if t.filter is not None: - self.filter.append(t.filter) + self.terms = None + self.coordinates = None + + if isinstance(where, Coordinates): + self.coordinates = where.values + else: + self.terms = self.generate(where) + + # create the numexpr & the filter + if self.terms: + conds = [ t.condition for t in self.terms if t.condition is not None ] + if len(conds): + self.condition = "(%s)" % ' & '.join(conds) + self.filter = [] + for t in self.terms: + if t.filter is not None: + self.filter.append(t.filter) def generate(self, where): """ where can be a : dict,list,tuple,string """ @@ -2528,13 +2618,17 @@ def select(self): """ if self.condition is not None: return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop) - else: - return self.table.table.read(start=self.start,stop=self.stop) + elif self.coordinates is not None: + return self.table.table.readCoordinates(self.coordinates) + return self.table.table.read(start=self.start,stop=self.stop) def select_coords(self): """ generate the selection """ + if self.condition is None: + return np.arange(self.table.nrows) + return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort = True) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index fbcbe7e1766b6..878aedb837086 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1365,6 +1365,82 @@ def test_frame_select(self): #self.assertRaises(Exception, self.store.select, # 'frame', [crit1, crit2]) + def test_coordinates(self): + df = tm.makeTimeDataFrame() + + self.store.remove('df') + self.store.append('df', df) + + # all + c = self.store.select_as_coordinates('df') + assert((c.values == np.arange(len(df.index))).all() == True) + + # get coordinates back & test vs frame + self.store.remove('df') + + df = DataFrame(dict(A = range(5), B = range(5))) + self.store.append('df', df) + c = self.store.select_as_coordinates('df',[ 'index<3' ]) + assert((c.values == np.arange(3)).all() == True) + result = self.store.select('df', where = c) + expected = df.ix[0:2,:] + tm.assert_frame_equal(result,expected) + + c = self.store.select_as_coordinates('df', [ 'index>=3', 'index<=4' ]) + 
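# note: the Coordinates values are plain row numbers, so the same selection can be re-applied to any table with the same number of rows + 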
assert((c.values == np.arange(2)+3).all() == True) + result = self.store.select('df', where = c) + expected = df.ix[3:4,:] + tm.assert_frame_equal(result,expected) + + # multiple tables + self.store.remove('df1') + self.store.remove('df2') + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x) + self.store.append('df1',df1, columns = ['A','B']) + self.store.append('df2',df2) + + c = self.store.select_as_coordinates('df1', [ 'A>0','B>0' ]) + df1_result = self.store.select('df1',c) + df2_result = self.store.select('df2',c) + result = concat([ df1_result, df2_result ], axis=1) + + expected = concat([ df1, df2 ], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + def test_select_multiple(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x) + df2['foo'] = 'bar' + self.store.append('df1',df1, columns = ['A','B']) + self.store.append('df2',df2) + + # exceptions + self.assertRaises(Exception, self.store.select_multiple, None, where = [ 'A>0','B>0' ], selector = 'df1') + self.assertRaises(Exception, self.store.select_multiple, [ None ], where = [ 'A>0','B>0' ], selector = 'df1') + + # default select + result = self.store.select('df1', ['A>0','B>0']) + expected = self.store.select_multiple([ 'df1' ], where = [ 'A>0','B>0' ], selector = 'df1') + tm.assert_frame_equal(result, expected) + + # multiple + result = self.store.select_multiple(['df1','df2'], where = [ 'A>0','B>0' ], selector = 'df1') + expected = concat([ df1, df2 ], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + # multiple (diff selector) + result = self.store.select_multiple(['df1','df2'], where = [ Term('index', '>', df2.index[4]) ], selector = 'df2') + expected = concat([ df1, df2 ], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) + + # test exception for differing row counts + self.store.append('df3',tm.makeTimeDataFrame(nper=50)) + self.assertRaises(Exception, self.store.select_multiple, ['df1','df3'], where = [ 'A>0','B>0' ], selector = 'df1') + def test_start_stop(self): df = DataFrame(dict(A = np.random.rand(20), B = np.random.rand(20))) @@ -1374,6 +1450,11 @@ def test_start_stop(self): expected = df.ix[0:4,['A']] tm.assert_frame_equal(result, expected) + # out of range + result = self.store.select('df', [ Term("columns", "=", ["A"]) ], start=30, stop=40) + assert(len(result) == 0) + assert(type(result) == DataFrame) + def test_select_filter_corner(self): df = DataFrame(np.random.randn(50, 100)) df.index = ['%.3d' % c for c in df.index] From c314534a7d190d763fd5a4efb47b30a15f237473 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 23 Dec 2012 10:40:50 -0500 Subject: [PATCH 21/35] BUG: renamed method select_multiple -> select_as_multiple. Renamed keyword 'columns' to 'data_columns' when passed to 'append' (to avoid confusion with 'columns' keyword in select) --- RELEASE.rst | 4 +-- doc/source/conf.py | 1 + doc/source/io.rst | 8 +++--- doc/source/v0.10.1.txt | 6 ++-- pandas/io/pytables.py | 49 +++++++++++++++++--------------- pandas/io/tests/test_pytables.py | 30 +++++++++---------- 6 files changed, 51 insertions(+), 47 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 6e2a6a70e0721..72345a33603c5 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -33,14 +33,14 @@ pandas 0.10.1 - ``HDFStore`` - enables storing of multi-index dataframes (closes GH1277_) - - support data column indexing and 
selection, via ``columns`` keyword in append + - support data column indexing and selection, via ``data_columns`` keyword in append - support write chunking to reduce memory footprint, via ``chunksize`` keywork to append - support automagic indexing via ``index`` keywork to append - support ``expectedrows`` keyword in append to inform ``PyTables`` about the expected tablesize - support ``start`` and ``stop`` keywords in select to limit the row selection space - added ``get_store`` context manager to automatically import with pandas - added column filtering via ``columns`` keyword in select - - added methods select_multiple/select_as_coordinates to do multiple-table selection + - added methods select_as_multiple/select_as_coordinates to do multiple-table selection **Bug fixes** diff --git a/doc/source/conf.py b/doc/source/conf.py index 692c7757ee17c..6895f00414b0b 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -16,6 +16,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.append(os.path.abspath('.')) +sys.path.insert(0, '/home/jreback/pandas') sys.path.insert(0, os.path.abspath('../sphinxext')) sys.path.extend([ diff --git a/doc/source/io.rst b/doc/source/io.rst index c66801af38e39..70b65f2e46f74 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1222,7 +1222,7 @@ You can designate (and index) certain columns that you want to be able to perfor df_dc # on-disk operations - store.append('df_dc', df_dc, columns = ['B','C','string','string2']) + store.append('df_dc', df_dc, data_columns = ['B','C','string','string2']) store.select('df_dc',[ Term('B>0') ]) # getting creative @@ -1257,7 +1257,7 @@ If you want to inspect the table object, retrieve via ``get_table``. You could u Multiple Table Queries ~~~~~~~~~~~~~~~~~~~~~~ -New in 0.10.1 is the method ``select_multiple``, that can perform selections from multiple tables and return a combined result, by using ``where`` on a selector table. The purpose is to allow fast selection from really wide tables. Construct 2 (or more) tables, where your indexing criteria is contained in a relatively small table. Then put your data in another table. Queries will be quite fast, yet you can allow your tables to grow (in column space). **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means, append to the tables in the same order. You can pass the ``axis`` parameter to control concatenation. Default is on the ``columns`` axis. +New in 0.10.1 is the method ``select_as_multiple``, that can perform selections from multiple tables and return a combined result, by using ``where`` on a selector table. The purpose is to allow fast selection from really wide tables. Construct 2 (or more) tables, where your indexing criteria is contained in a relatively small table. Then put your data in another table. Queries will be quite fast, yet you can allow your tables to grow (in column space). **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means, append to the tables in the same order. You can pass the ``axis`` parameter to control concatenation. Default is on the ``columns`` axis. .. 
ipython:: python @@ -1267,10 +1267,10 @@ New in 0.10.1 is the method ``select_multiple``, that can perform selections fro df2_mt['foo'] = 'bar' # you can use data columns as well - store.append('df1_mt',df1_mt, columns = ['A','B']) + store.append('df1_mt',df1_mt, data_columns = ['A','B']) store.append('df2_mt',df2_mt) - store.select_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], axis = 1, selector = 'df1_mt') + store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], axis = 1, selector = 'df1_mt') Delete from a Table diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index f57844e0dc8db..6ba326bb46c63 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -23,7 +23,7 @@ HDFStore os.remove('store.h5') -You can designate (and index) certain columns that you want to be able to perform queries on a table, by passing a list to ``columns`` +You can designate (and index) certain columns that you want to be able to perform queries on a table, by passing a list to ``data_columns`` .. ipython:: python @@ -37,7 +37,7 @@ You can designate (and index) certain columns that you want to be able to perfor df # on-disk operations - store.append('df', df, columns = ['B','C','string','string2']) + store.append('df', df, data_columns = ['B','C','string','string2']) store.select('df',[ Term('B>0') ]) # getting creative @@ -82,7 +82,7 @@ Multi-table Selection via ``select_multiple`` can perform selections from multip df2_mt['foo'] = 'bar' # you can use data columns as well - store.append('df1_mt',df1_mt, columns = ['A','B']) + store.append('df1_mt',df1_mt, data_columns = ['A','B']) store.append('df2_mt',df2_mt) store.select_multiple(['df1_mt','df2_mt'], where = [ 'A>0' ], axis = 1, selector = 'df1_mt') diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 894518992a57a..f99bc245235dd 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -372,7 +372,7 @@ def select_as_coordinates(self, key, where=None, **kwargs): """ return self.get_table(key).read_coordinates(where = where, **kwargs) - def select_multiple(self, keys, where=None, selector=None, columns=None, axis=1, **kwargs): + def select_as_multiple(self, keys, where=None, selector=None, columns=None, axis=1, **kwargs): """ Retrieve pandas objects from multiple tables Parameters @@ -478,7 +478,7 @@ def remove(self, key, where=None, start=None, stop=None): return None - def append(self, key, value, **kwargs): + def append(self, key, value, columns = None, **kwargs): """ Append to Table in file. Node must already exist and be Table format. 
@@ -490,7 +490,7 @@ def append(self, key, value, **kwargs): Optional Parameters ------------------- - columns : list of columns to create as data columns + data_columns : list of columns to create as data columns min_itemsize : dict of columns that specify minimum string sizes nan_rep : string to use as string nan represenation chunksize : size to chunk the writing @@ -501,6 +501,9 @@ def append(self, key, value, **kwargs): Does *not* check if data being appended overlaps with existing data in the table, so be careful """ + if columns is not None: + raise Exception("columns is not a supported keyword in append, try data_columns") + self._write_to_group(key, value, table=True, append=True, **kwargs) def create_table_index(self, key, **kwargs): @@ -1336,7 +1339,7 @@ class Table(object): index_axes : a list of tuples of the (original indexing axis and index column) non_index_axes: a list of tuples of the (original index axis and columns on a non-indexing axis) values_axes : a list of the columns which comprise the data of this table - data_columns : a list of columns that we are allowing indexing (these become single columns in values_axes) + data_columns : a list of the columns that we are allowing indexing (these become single columns in values_axes) nan_rep : the string to use for nan representations for string objects levels : the names of levels @@ -1609,7 +1612,7 @@ def get_object(self, obj): """ return the data for this obj """ return obj - def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None, min_itemsize = None, **kwargs): + def create_axes(self, axes, obj, validate = True, nan_rep = None, data_columns = None, min_itemsize = None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields @@ -1620,7 +1623,7 @@ def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None validate: validate the obj against an existiing object already written min_itemsize: a dict of the min size for a column in bytes nan_rep : a values to use for string column nan_rep - columns : a list of columns that we want to create separate to allow indexing + data_columns : a list of columns that we want to create separate to allow indexing """ @@ -1630,9 +1633,9 @@ def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None # do we have an existing table (if so, use its axes & data_columns) if self.infer_axes(): existing_table = self.copy() - axes = [ a.axis for a in existing_table.index_axes] - columns = existing_table.data_columns - nan_rep = existing_table.nan_rep + axes = [ a.axis for a in existing_table.index_axes] + data_columns = existing_table.data_columns + nan_rep = existing_table.nan_rep else: existing_table = None @@ -1694,13 +1697,13 @@ def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None block_obj = self.get_object(obj) blocks = None - if columns is not None and len(self.non_index_axes): - axis = self.non_index_axes[0][0] - axis_labels = self.non_index_axes[0][1] - columns = [ c for c in columns if c in axis_labels ] - if len(columns): - blocks = block_obj.reindex_axis(Index(axis_labels)-Index(columns), axis = axis, copy = False)._data.blocks - for c in columns: + if data_columns is not None and len(self.non_index_axes): + axis = self.non_index_axes[0][0] + axis_labels = self.non_index_axes[0][1] + data_columns = [ c for c in data_columns if c in axis_labels ] + if len(data_columns): + blocks = 
block_obj.reindex_axis(Index(axis_labels)-Index(data_columns), axis = axis, copy = False)._data.blocks + for c in data_columns: blocks.extend(block_obj.reindex_axis([ c ], axis = axis, copy = False)._data.blocks) if blocks is None: @@ -1715,7 +1718,7 @@ def create_axes(self, axes, obj, validate = True, nan_rep = None, columns = None name = None # we have a data_column - if columns and len(b.items) == 1 and b.items[0] in columns: + if data_columns and len(b.items) == 1 and b.items[0] in data_columns: klass = DataIndexableCol name = b.items[0] self.data_columns.append(name) @@ -2139,14 +2142,14 @@ class AppendableMultiFrameTable(AppendableFrameTable): def table_type_short(self): return 'appendable_multi' - def write(self, obj, columns = None, **kwargs): - if columns is None: - columns = [] + def write(self, obj, data_columns = None, **kwargs): + if data_columns is None: + data_columns = [] for n in obj.index.names: - if n not in columns: - columns.insert(0,n) + if n not in data_columns: + data_columns.insert(0,n) self.levels = obj.index.names - return super(AppendableMultiFrameTable, self).write(obj = obj.reset_index(), columns = columns, **kwargs) + return super(AppendableMultiFrameTable, self).write(obj = obj.reset_index(), data_columns = data_columns, **kwargs) def read(self, *args, **kwargs): df = super(AppendableMultiFrameTable, self).read(*args, **kwargs) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 878aedb837086..f8c71a3f260a3 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -427,7 +427,7 @@ def test_append_with_data_columns(self): df = tm.makeTimeDataFrame() self.store.remove('df') - self.store.append('df', df[:2], columns = ['B']) + self.store.append('df', df[:2], data_columns = ['B']) self.store.append('df', df[2:]) tm.assert_frame_equal(self.store['df'], df) @@ -452,7 +452,7 @@ def test_append_with_data_columns(self): df_new['string'][1:4] = np.nan df_new['string'][5:6] = 'bar' self.store.remove('df') - self.store.append('df', df_new, columns = ['string']) + self.store.append('df', df_new, data_columns = ['string']) result = self.store.select('df', [ Term('string', '=', 'foo') ]) expected = df_new[df_new.string == 'foo'] tm.assert_frame_equal(result, expected) @@ -466,7 +466,7 @@ def test_append_with_data_columns(self): df_new['string2'][2:5] = np.nan df_new['string2'][7:8] = 'bar' self.store.remove('df') - self.store.append('df', df_new, columns = ['A','B','string','string2']) + self.store.append('df', df_new, data_columns = ['A','B','string','string2']) result = self.store.select('df', [ Term('string', '=', 'foo'), Term('string2=foo'), Term('A>0'), Term('B<0') ]) expected = df_new[(df_new.string == 'foo') & (df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] tm.assert_frame_equal(result, expected) @@ -484,7 +484,7 @@ def test_append_with_data_columns(self): df_dc['string2'] = 'cool' df_dc self.store.remove('df_dc') - self.store.append('df_dc', df_dc, columns = ['B','C','string','string2']) + self.store.append('df_dc', df_dc, data_columns = ['B','C','string','string2']) result = self.store.select('df_dc',[ Term('B>0') ]) expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected) @@ -1299,14 +1299,14 @@ def test_select(self): # with a data column self.store.remove('df') - self.store.append('df',df, columns = ['A']) + self.store.append('df',df, data_columns = ['A']) result = self.store.select('df', [ 'A > 0' ], columns = ['A','B']) expected = df[df.A > 0].reindex(columns = 
['A','B']) tm.assert_frame_equal(expected, result) # with a data column, but different columns self.store.remove('df') - self.store.append('df',df, columns = ['A']) + self.store.append('df',df, data_columns = ['A']) result = self.store.select('df', [ 'A > 0' ], columns = ['C','D']) expected = df[df.A > 0].reindex(columns = ['C','D']) tm.assert_frame_equal(expected, result) @@ -1397,7 +1397,7 @@ def test_coordinates(self): self.store.remove('df2') df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x) - self.store.append('df1',df1, columns = ['A','B']) + self.store.append('df1',df1, data_columns = ['A','B']) self.store.append('df2',df2) c = self.store.select_as_coordinates('df1', [ 'A>0','B>0' ]) @@ -1409,37 +1409,37 @@ def test_coordinates(self): expected = expected[(expected.A > 0) & (expected.B > 0)] tm.assert_frame_equal(result, expected) - def test_select_multiple(self): + def test_select_as_multiple(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x) df2['foo'] = 'bar' - self.store.append('df1',df1, columns = ['A','B']) + self.store.append('df1',df1, data_columns = ['A','B']) self.store.append('df2',df2) # exceptions - self.assertRaises(Exception, self.store.select_multiple, None, where = [ 'A>0','B>0' ], selector = 'df1') - self.assertRaises(Exception, self.store.select_multiple, [ None ], where = [ 'A>0','B>0' ], selector = 'df1') + self.assertRaises(Exception, self.store.select_as_multiple, None, where = [ 'A>0','B>0' ], selector = 'df1') + self.assertRaises(Exception, self.store.select_as_multiple, [ None ], where = [ 'A>0','B>0' ], selector = 'df1') # default select result = self.store.select('df1', ['A>0','B>0']) - expected = self.store.select_multiple([ 'df1' ], where = [ 'A>0','B>0' ], selector = 'df1') + expected = self.store.select_as_multiple([ 'df1' ], where = [ 'A>0','B>0' ], selector = 'df1') tm.assert_frame_equal(result, expected) # multiple - result = self.store.select_multiple(['df1','df2'], where = [ 'A>0','B>0' ], selector = 'df1') + result = self.store.select_as_multiple(['df1','df2'], where = [ 'A>0','B>0' ], selector = 'df1') expected = concat([ df1, df2 ], axis=1) expected = expected[(expected.A > 0) & (expected.B > 0)] tm.assert_frame_equal(result, expected) # multiple (diff selector) - result = self.store.select_multiple(['df1','df2'], where = [ Term('index', '>', df2.index[4]) ], selector = 'df2') + result = self.store.select_as_multiple(['df1','df2'], where = [ Term('index', '>', df2.index[4]) ], selector = 'df2') expected = concat([ df1, df2 ], axis=1) expected = expected[5:] tm.assert_frame_equal(result, expected) # test exception for differing row counts self.store.append('df3',tm.makeTimeDataFrame(nper=50)) - self.assertRaises(Exception, self.store.select_multiple, ['df1','df3'], where = [ 'A>0','B>0' ], selector = 'df1') + self.assertRaises(Exception, self.store.select_as_multiple, ['df1','df3'], where = [ 'A>0','B>0' ], selector = 'df1') def test_start_stop(self): From cbbae3dc24ef93369b9c7dbc81f032ba57db4936 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 23 Dec 2012 11:34:52 -0500 Subject: [PATCH 22/35] ENH: added append_to_multiple, to support multiple table creation --- RELEASE.rst | 2 +- doc/source/io.rst | 20 +++++--- doc/source/v0.10.1.txt | 22 +++++---- pandas/io/pytables.py | 82 ++++++++++++++++++++++++++++---- pandas/io/tests/test_pytables.py | 20 ++++++++ 5 files changed, 121 insertions(+), 25 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 
72345a33603c5..c43d242ec48e8 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -40,7 +40,7 @@ pandas 0.10.1 - support ``start`` and ``stop`` keywords in select to limit the row selection space - added ``get_store`` context manager to automatically import with pandas - added column filtering via ``columns`` keyword in select - - added methods select_as_multiple/select_as_coordinates to do multiple-table selection + - added methods append_to_multiple/select_as_multiple/select_as_coordinates to do multiple-table append/selection **Bug fixes** diff --git a/doc/source/io.rst b/doc/source/io.rst index 70b65f2e46f74..e27e4af8277d5 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1257,19 +1257,25 @@ If you want to inspect the table object, retrieve via ``get_table``. You could u Multiple Table Queries ~~~~~~~~~~~~~~~~~~~~~~ -New in 0.10.1 is the method ``select_as_multiple``, that can perform selections from multiple tables and return a combined result, by using ``where`` on a selector table. The purpose is to allow fast selection from really wide tables. Construct 2 (or more) tables, where your indexing criteria is contained in a relatively small table. Then put your data in another table. Queries will be quite fast, yet you can allow your tables to grow (in column space). **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means, append to the tables in the same order. You can pass the ``axis`` parameter to control concatenation. Default is on the ``columns`` axis. +New in 0.10.1 are the methods ``append_to_multple`` and ``select_as_multiple``, that can perform appending/selecting from multiple tables at once. The idea is to have one table (call it the selector table) that you index most/all of the columns, and perform your queries. The other table(s) are data tables that you are indexed the same the selector table. You can then perform a very fast query on the selector table, yet get lots of data back. This method works similar to having a very wide-table, but is more efficient in terms of queries. + +Note, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means, append to the tables in the same order; ``append_to_multiple`` splits a single object to multiple tables, given a specification (as a dictionary). This dictionary is a mapping of the table names to the 'columns' you want included in that table. Pass a `None` for a single table (optional) to let it have the remaining columns. The argument ``selector`` defines which table is the selector table. .. 
ipython:: python index = date_range('1/1/2000', periods=8) - df1_mt = DataFrame(randn(8, 3), index=index, columns=['A', 'B', 'C']) - df2_mt = DataFrame(randn(8, 3), index=index, columns=['D', 'E', 'F']) - df2_mt['foo'] = 'bar' + df_mt = DataFrame(randn(8, 6), index=index, columns=['A', 'B', 'C', 'D', 'E', 'F']) + df_mt['foo'] = 'bar' - # you can use data columns as well - store.append('df1_mt',df1_mt, columns = ['A','B']) - store.append('df2_mt',df2_mt) + # you can also create the tables individually + store.append_to_multiple({ 'df1_mt' : ['A','B'], 'df2_mt' : None }, df_mt, selector = 'df1_mt') + store + # individual tables were created + store.select('df1_mt') + store.select('df2_mt') + + # as a multiple store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], axis = 1, selector = 'df1_mt') Delete from a Table ~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index 6ba326bb46c63..56d38ee9c847a 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -72,21 +72,25 @@ You can pass ``columns`` keyword to select to filter a list of the return column # the levels are automatically included as data columns store.select('mi', Term('foo=bar')) -Multi-table Selection via ``select_multiple`` can perform selections from multiple tables and return a combined result, by using ``where`` on a selector table. +Multi-table creation via ``append_to_multiple`` and selection via ``select_multiple`` can create/select from multiple tables and return a combined result, by using ``where`` on a selector table. .. ipython:: python index = date_range('1/1/2000', periods=8) - df1_mt = DataFrame(randn(8, 3), index=index, columns=['A', 'B', 'C']) - df2_mt = DataFrame(randn(8, 3), index=index, columns=['D', 'E', 'F']) - df2_mt['foo'] = 'bar' + df_mt = DataFrame(randn(8, 6), index=index, columns=['A', 'B', 'C', 'D', 'E', 'F']) + df_mt['foo'] = 'bar' + # you can also create the tables individually + store.append_to_multiple({ 'df1_mt' : ['A','B'], 'df2_mt' : None }, df_mt, selector = 'df1_mt') + store + + # individual tables were created + store.select('df1_mt') + store.select('df2_mt') + + # as a multiple + store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], axis = 1, selector = 'df1_mt') - store.select_multiple(['df1_mt','df2_mt'], where = [ 'A>0' ], axis = 1, selector = 'df1_mt') - 
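+
+Passing a single table (or just its name) falls back to a regular ``select``; a small sketch:
+
+.. ipython:: python
+
+   store.select_as_multiple('df1_mt', where = [ 'A>0' ], selector = 'df1_mt')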
.. ipython:: python :suppress: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f99bc245235dd..ad31f0f5347cd 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -79,6 +79,13 @@ class IncompatibilityWarning(Warning): pass 'WidePanel': 'wide_table', } +# axes map +_AXES_MAP = { + DataFrame : [0], + Panel : [1,2], + Panel4D : [1,2,3], +} + # oh the troubles to reduce import time _table_mod = None _table_supports_index = False @@ -387,15 +394,18 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, axis raise if any of the keys don't refer to tables or if they are not ALL THE SAME DIMENSIONS """ + # default to single select + if isinstance(keys, (list,tuple)) and len(keys) == 1: + keys = keys[0] + if isinstance(keys,basestring): + return self.select(key = keys, where=where, columns = columns, **kwargs) + if not isinstance(keys, (list,tuple)): raise Exception("keys must be a list/tuple") if len(keys) == 0: raise Exception("keys must have a non-zero length") - if len(keys) == 1: - return self.select(key = keys[0], where=where, columns = columns, **kwargs) - if selector is None: selector = keys[0] @@ -478,7 +478,7 @@ def remove(self, key, where=None, start=None, stop=None): return None - def append(self, key, value, **kwargs): + def append(self, key, value, columns = None, **kwargs): """ Append to Table in file. Node must already exist and be Table format. @@ -506,6 +516,63 @@ def append(self, key, value, columns = None, **kwargs): self._write_to_group(key, value, table=True, append=True, **kwargs) + def append_to_multiple(self, d, value, selector, data_columns = None, axes = None, **kwargs): + """ + Append to multiple tables + + Parameters + ---------- + d : a dict of table_name to table_columns, None is acceptable as the values of one node (this will get all the remaining columns) + value : a pandas object + selector : a string that designates the indexable table; all of its columns will be designated as data_columns, unless data_columns is passed, + in which case these are used + + Notes + ----- + axes parameter is currently not accepted + + """ + if axes is not None: + raise Exception("axes is currently not accepted as a parameter to append_to_multiple; you can create the tables independently instead") + + if not isinstance(d, dict): + raise Exception("append_to_multiple must have a dictionary specified as the way to split the value") + + if selector not in d: + raise Exception("append_to_multiple requires a selector that is in passed dict") + + # figure out the splitting axis (the non_index_axis) + axis = list(set(range(value.ndim))-set(_AXES_MAP[type(value)]))[0] + + # figure out how to split the value + remain_key = None + remain_values = [] + for k, v in d.items(): + if v is None: + if remain_key is not None: + raise Exception("append_to_multiple can only have one value in d that is None") + remain_key = k + else: + remain_values.extend(v) + if remain_key is not None: + ordered = value.axes[axis] + ordd = ordered-Index(remain_values) + ordd = sorted(ordered.get_indexer(ordd)) + d[remain_key] = ordered.take(ordd) + + # data_columns + if data_columns is None: + data_columns = d[selector] + + # append + for k, v in d.items(): + dc = data_columns if k == selector else None + + # compute the val + val = value.reindex_axis(v, axis = axis, copy = False) + + self.append(k, val, data_columns = dc, **kwargs) + def create_table_index(self, key, **kwargs): """ Create a pytables index on the table Paramaters @@ -725,7 +792,7 @@ def _read_wide(self, group, where=None, **kwargs): def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, index=True, **kwargs): if axes is None: - axes = [1,2,3] + axes = _AXES_MAP[type(obj)] t = create_table(self, group, typ = 'appendable_ndim') t.write(axes=axes, 
obj=obj, append=append, compression=comp, **kwargs) @@ -738,7 +805,7 @@ def _read_ndim_table(self, group, where=None, **kwargs): def _write_frame_table(self, group, df, append=False, comp=None, axes=None, index=True, **kwargs): if axes is None: - axes = [0] + axes = _AXES_MAP[type(df)] t = create_table(self, group, typ = 'appendable_frame' if df.index.nlevels == 1 else 'appendable_multiframe') t.write(axes=axes, obj=df, append=append, compression=comp, **kwargs) @@ -749,7 +816,7 @@ def _write_frame_table(self, group, df, append=False, comp=None, axes=None, inde def _write_wide_table(self, group, panel, append=False, comp=None, axes=None, index=True, **kwargs): if axes is None: - axes = [1,2] + axes = _AXES_MAP[type(panel)] t = create_table(self, group, typ = 'appendable_panel') t.write(axes=axes, obj=panel, append=append, compression=comp, **kwargs) @@ -1755,10 +1822,9 @@ def process_axes(self, obj, columns=None): obj = obj.reindex_axis(labels,axis=axis,copy=False) def reindex(obj, axis, filt, ordered): - axis_name = obj._get_axis_name(axis) ordd = ordered & filt ordd = sorted(ordered.get_indexer(ordd)) - return obj.reindex_axis(ordered.take(ordd), axis = obj._get_axis_number(axis_name), copy = False) + return obj.reindex_axis(ordered.take(ordd), axis = obj._get_axis_number(axis), copy = False) # apply the selection filters (but keep in the same order) if self.selection.filter: diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index f8c71a3f260a3..e622626569609 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1409,6 +1409,24 @@ def test_coordinates(self): expected = expected[(expected.A > 0) & (expected.B > 0)] tm.assert_frame_equal(result, expected) + def test_append_to_multiple(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x) + df2['foo'] = 'bar' + df = concat([ df1, df2 ], axis=1) + + # exceptions + self.assertRaises(Exception, self.store.append_to_multiple, { 'df1' : ['A','B'], 'df2' : None }, df, selector = 'df3') + self.assertRaises(Exception, self.store.append_to_multiple, { 'df1' : None, 'df2' : None }, df, selector = 'df3') + self.assertRaises(Exception, self.store.append_to_multiple, 'df1', df, 'df1') + + # regular operation + self.store.append_to_multiple({ 'df1' : ['A','B'], 'df2' : None }, df, selector = 'df1') + result = self.store.select_as_multiple(['df1','df2'], where = [ 'A>0','B>0' ], selector = 'df1') + expected = df[(df.A > 0) & (df.B > 0)] + tm.assert_frame_equal(result, expected) + + def test_select_as_multiple(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x) @@ -1424,6 +1442,8 @@ def test_select_as_multiple(self): result = self.store.select('df1', ['A>0','B>0']) expected = self.store.select_as_multiple([ 'df1' ], where = [ 'A>0','B>0' ], selector = 'df1') tm.assert_frame_equal(result, expected) + expected = self.store.select_as_multiple( 'df1' , where = [ 'A>0','B>0' ], selector = 'df1') + tm.assert_frame_equal(result, expected) # multiple result = self.store.select_as_multiple(['df1','df2'], where = [ 'A>0','B>0' ], selector = 'df1') From 228df0b5997759e5ca34d160980e74be3885d28e Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 23 Dec 2012 11:37:04 -0500 Subject: [PATCH 23/35] removed paths from conf.py --- doc/source/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 6895f00414b0b..692c7757ee17c 100644 --- a/doc/source/conf.py +++ 
b/doc/source/conf.py @@ -16,7 +16,6 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.append(os.path.abspath('.')) -sys.path.insert(0, '/home/jreback/pandas') sys.path.insert(0, os.path.abspath('../sphinxext')) sys.path.extend([ From aafe311d48ce0a00cd7c24c74186827aa36138e7 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 23 Dec 2012 13:15:29 -0500 Subject: [PATCH 24/35] DOC: minor doc updates/typos --- RELEASE.rst | 2 +- doc/source/io.rst | 30 ++++++++++++++---------------- doc/source/v0.10.1.txt | 15 ++++++--------- pandas/io/pytables.py | 8 +++++--- 4 files changed, 26 insertions(+), 29 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index c43d242ec48e8..fd958ebb24efa 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -34,7 +34,7 @@ pandas 0.10.1 - ``HDFStore`` - enables storing of multi-index dataframes (closes GH1277_) - support data column indexing and selection, via ``data_columns`` keyword in append - - support write chunking to reduce memory footprint, via ``chunksize`` keywork to append + - support write chunking to reduce memory footprint, via ``chunksize`` keyword to append - support automagic indexing via ``index`` keywork to append - support ``expectedrows`` keyword in append to inform ``PyTables`` about the expected tablesize - support ``start`` and ``stop`` keywords in select to limit the row selection space diff --git a/doc/source/io.rst b/doc/source/io.rst index e27e4af8277d5..36ae14aaa6e06 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1114,6 +1114,7 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set df_mixed['string'] = 'string' df_mixed['int'] = 1 df_mixed['bool'] = True + df_mixed.ix[3:4,['A','B','string']] = np.nan store.append('df_mixed', df_mixed, min_itemsize = { 'values' : 50 }) df_mixed1 = store.select('df_mixed') @@ -1126,7 +1127,7 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set Storing Multi-Index DataFrames ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Storing multi-index dataframes is very similar to storing/selecting from homogenous index DataFrames. +Storing multi-index dataframes as tables is very similar to storing/selecting from homogeneous index DataFrames. .. ipython:: python @@ -1194,7 +1195,7 @@ Start and Stop parameters can be specified to limit the total search space. Thes Indexing ~~~~~~~~ -You can create/modify an index for a table with ``create_table_index`` after data is already in the table (after and ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the indexed dimension as the ``where``. **Indexes are automagically created** on the indexables and any data columns you specify. This behavior can be turned off by passing ``index=False`` to ``append``. +You can create/modify an index for a table with ``create_table_index`` after data is already in the table (after an ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the indexed dimension as the ``where``. **Indexes are automagically created (starting 0.10.1)** on the indexables and any data columns you specify. This behavior can be turned off by passing ``index=False`` to ``append``.
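+
+A sketch of turning the automagic indexing off and building the index afterwards (the table name here is illustrative):
+
+.. ipython:: python
+
+   # no indexes are created on this append
+   store.append('df_no_index', df, index=False)
+
+   # ... create (or modify) them later
+   store.create_table_index('df_no_index')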
.. ipython:: python @@ -1210,7 +1211,7 @@ Query via Data Columns ~~~~~~~~~~~~~~~~~~~~~~ -You can designate (and index) certain columns that you want to be able to perform queries (other than the `indexable` columns, which you can always query). For instance say you want to perform this this common operation, on-disk, and return just the frame that matches this query. +You can designate (and index) certain columns that you want to be able to perform queries (other than the `indexable` columns, which you can always query). For instance say you want to perform this common operation, on-disk, and return just the frame that matches this query. .. ipython:: python @@ -1231,7 +1232,7 @@ You can designate (and index) certain columns that you want to be able to perfor # this is in-memory version of this type of selection df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] - # we have automagically created this index and that the B/string columns are stored separately as ``PyTables`` columns + # we have automagically created this index, and the B/C/string/string2 columns are stored separately as ``PyTables`` columns store.root.df_dc.table There is some performance degredation by making lots of columns into `data columns`, so it is up to the user to designate these. In addition, you cannot change data columns (nor indexables) after the first append/put operation (Of course you can simply read in the data and create a new table!) @@ -1257,14 +1258,14 @@ If you want to inspect the table object, retrieve via ``get_table``. You could u Multiple Table Queries ~~~~~~~~~~~~~~~~~~~~~~ -New in 0.10.1 are the methods ``append_to_multple`` and ``select_as_multiple``, that can perform appending/selecting from multiple tables at once. The idea is to have one table (call it the selector table) that you index most/all of the columns, and perform your queries. The other table(s) are data tables that you are indexed the same the selector table. You can then perform a very fast query on the selector table, yet get lots of data back. This method works similar to having a very wide-table, but is more efficient in terms of queries. +New in 0.10.1 are the methods ``append_to_multiple`` and ``select_as_multiple``, which can perform appending/selecting from multiple tables at once. The idea is to have one table (call it the selector table) in which you index most/all of the columns, and perform your queries. The other table(s) are data tables that are indexed the same as the selector table. You can then perform a very fast query on the selector table, yet get lots of data back. This method works similarly to having a very wide table, but is more efficient in terms of queries. Note, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means, append to the tables in the same order; ``append_to_multiple`` splits a single object to multiple tables, given a specification (as a dictionary). This dictionary is a mapping of the table names to the 'columns' you want included in that table. Pass a `None` for a single table (optional) to let it have the remaining columns. The argument ``selector`` defines which table is the selector table. .. 
ipython:: python - index = date_range('1/1/2000', periods=8) - df_mt = DataFrame(randn(8, 6), index=index, columns=['A', 'B', 'C', 'D', 'E', 'F']) + df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C', 'D', 'E', 'F']) df_mt['foo'] = 'bar' # you can also create the tables individually store.append_to_multiple({ 'df1_mt' : ['A','B'], 'df2_mt' : None }, df_mt, selector = 'df1_mt') store # individual tables were created store.select('df1_mt') store.select('df2_mt') # as a multiple - store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], axis = 1, selector = 'df1_mt') + store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], selector = 'df1_mt') Delete from a Table ~~~~~~~~~~~~~~~~~~~ @@ -1303,11 +1304,13 @@ It should be clear that a delete operation on the ``major_axis`` will be fairly store.remove('wp', 'major_axis>20000102' ) store.select('wp') +Please note that HDF5 **DOES NOT RECLAIM SPACE** in the h5 files automatically. Thus, repeatedly deleting (or removing nodes) and adding again **WILL TEND TO INCREASE THE FILE SIZE**. To *clean* the file, use ``ptrepack`` (see below). + Compression ~~~~~~~~~~~ ``PyTables`` allows the stored data to be compressed (this applies to all kinds of stores, not just tables). You can pass ``complevel=int`` for a compression level (1-9, with 0 being no compression, and the default), ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for whichever compression library you prefer. ``blosc`` offers very fast compression (its level defaults to 9), and is my most used. -``PyTables`` offer better write performance when compressed after writing them, as opposed to turning on compression at the very beginning. You can use the supplied ``PyTables`` utility ``ptrepack``. ``ptrepack`` also can change compression levels after the fact. +``PyTables`` offers better write performance when tables are compressed after they are written, as opposed to turning on compression at the very beginning. You can use the supplied ``PyTables`` utility ``ptrepack``. In addition, ``ptrepack`` can change compression levels after the fact. - ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5`` @@ -1315,7 +1318,7 @@ Or on-the-fly compression - ``store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc')`` - +Furthermore, ``ptrepack in.h5 out.h5`` will *repack* the file to allow you to reuse previously deleted space (alternatively, one can simply remove the file and write again). Notes & Caveats ~~~~~~~~~~~~~~~ @@ -1351,12 +1354,7 @@ Performance Write times are generally longer as compared with regular stores. Query times can be quite fast, especially on an indexed axis. - You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will signficantly lower your memory usage on writing. - You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of expectedrows that ``PyTables`` will expected. This will optimize read/write performance. - - ``Tables`` can be expressed as different types. - - - ``AppendableTable`` which is a similiar table to past versions (this is the default). 
- - ``WORMTable`` (pending implementation) - is available to faciliate very fast writing of tables that are also queryable (but CANNOT support appends) - - - Duplicate rows can be written, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) + - Duplicate rows can be written to tables, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) Experimental ~~~~~~~~~~~~ diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index 56d38ee9c847a..855fd619bdc55 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -38,13 +38,10 @@ You can designate (and index) certain columns that you want to be able to perfor # on-disk operations store.append('df', df, data_columns = ['B','C','string','string2']) - store.select('df',[ Term('B>0') ]) - - # getting creative - store.select('df',[ Term('B>0'), Term('C>0'), Term('string=foo') ]) + store.select('df',[ 'B > 0', 'string == foo' ]) # this is in-memory version of this type of selection - df[(df.B > 0) & (df.C > 0) & (df.string == 'foo')] + df[(df.B > 0) & (df.string == 'foo')] You can pass ``columns`` keyword to select to filter a list of the return columns, this is equivalent to passing a ``Term('columns',list_of_columns_to_filter)`` @@ -53,7 +50,7 @@ You can pass ``columns`` keyword to select to filter a list of the return column store.select('df',columns = ['A','B']) -``HDFStore`` now serializes multi-index dataframes. +``HDFStore`` now serializes multi-index dataframes when appending tables. .. ipython:: python @@ -72,21 +72,25 @@ You can pass ``columns`` keyword to select to filter a list of the return column # the levels are automatically included as data columns store.select('mi', Term('foo=bar')) -Multi-table creation via ``append_to_multiple`` and selection via ``select_multiple`` can create/select from multiple tables and return a combined result, by using ``where`` on a selector table. .. ipython:: python - index = date_range('1/1/2000', periods=8) - df1_mt = DataFrame(randn(8, 3), index=index, columns=['A', 'B', 'C']) - df2_mt = DataFrame(randn(8, 3), index=index, columns=['D', 'E', 'F']) - df2_mt['foo'] = 'bar' + df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C', 'D', 'E', 'F']) + df_mt['foo'] = 'bar' - # you can use data columns as well - store.append('df1_mt',df1_mt, data_columns = ['A','B']) - store.append('df2_mt',df2_mt) + # you can also create the tables individually + store.append_to_multiple({ 'df1_mt' : ['A','B'], 'df2_mt' : None }, df_mt, selector = 'df1_mt') + store + + # individual tables were created + store.select('df1_mt') + store.select('df2_mt') + + # as a multiple + store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], selector = 'df1_mt') 
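+
+Under the hood, ``select_as_multiple`` first computes the matching rows via ``select_as_coordinates`` on the selector table; a sketch of doing the same by hand with the tables above:
+
+.. ipython:: python
+
+   c = store.select_as_coordinates('df1_mt', [ 'A>0', 'B>0' ])
+   store.select('df1_mt', where = c)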
.. ipython:: python :suppress: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ad31f0f5347cd..2c065d8940d33 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -379,7 +379,7 @@ def select_as_coordinates(self, key, where=None, **kwargs): """ return self.get_table(key).read_coordinates(where = where, **kwargs) - def select_as_multiple(self, keys, where=None, selector=None, columns=None, axis=1, **kwargs): + def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kwargs): """ Retrieve pandas objects from multiple tables Parameters @@ -387,7 +387,6 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, axis keys : a list of the tables selector : the table to apply the where criteria (defaults to keys[0] if not supplied) columns : the columns I want back - axis : the concatenation axis (defaults to 1) Exceptions ---------- @@ -424,6 +423,9 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, axis # collect the returned objects objs = [ t.read(where = c, columns = columns) for t in tbls ] + # axis is the concatenation axis + axis = list(set([ t.non_index_axes[0][0] for t in tbls ]))[0] + # concat and return return concat(objs, axis = axis, verify_integrity = True) @@ -2463,7 +2465,7 @@ class Term(object): """ _ops = ['<=','<','>=','>','!=','==','='] - _search = re.compile("^(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)$" % '|'.join(_ops)) + _search = re.compile("^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" % '|'.join(_ops)) def __init__(self, field, op = None, value = None, queryables = None): self.field = None From 47b0ad4da6ff93a9b7168b11470031cb8ac48a74 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 23 Dec 2012 13:17:29 -0500 Subject: [PATCH 25/35] DOC: minor doc updates 2 --- doc/source/v0.10.1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index 855fd619bdc55..ebc343261842e 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -69,7 +69,7 @@ You can pass ``columns`` keyword to select to filter a list of the return column # the levels are automatically included as data columns store.select('mi', Term('foo=bar')) -Multi-table creation via ``append_to_multiple`` and selection via ``select_multiple`` can create/select from multiple tables and return a combined result, by using ``where`` on a selector table. +Multi-table creation via ``append_to_multiple`` and selection via ``select_as_multiple`` can create/select from multiple tables and return a combined result, by using ``where`` on a selector table. .. 
From 3cdc0cddd6eb68d5f0f47478c5fc46b5effa0f6c Mon Sep 17 00:00:00 2001
From: jreback
Date: Sun, 23 Dec 2012 16:06:54 -0500
Subject: [PATCH 26/35] BUG: added datetime64 support in columns

---
 RELEASE.rst | 3 +-
 doc/source/io.rst | 5 +-
 doc/source/v0.10.1.txt | 12 ++++
 pandas/io/pytables.py | 56 ++++++++++++----
 pandas/io/tests/test_pytables.py | 111 ++++++++++++++++++-------------
 5 files changed, 124 insertions(+), 63 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index fd958ebb24efa..c57678fd0f7cc 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -41,12 +41,13 @@ pandas 0.10.1
     - added ``get_store`` context manager to automatically import with pandas
     - added column filtering via ``columns`` keyword in select
     - added methods append_to_multiple/select_as_multiple/select_as_coordinates to do multiple-table append/selection
+    - added support for datetime64 in columns

 **Bug fixes**

   - ``HDFStore``
     - correctly handle ``nan`` elements in string columns; serialize via the ``nan_rep`` keyword to append
-    - raise correctly on non-implemented column types (unicode/datetime64/date)
+    - raise correctly on non-implemented column types (unicode/date)
     - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index is ``Int64``), (closes GH512_)

.. _GH512: https://github.com/pydata/pandas/issues/512

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 36ae14aaa6e06..39251a6b0ec9e 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1106,7 +1106,7 @@ Storing Mixed Types in a Table
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Storing mixed-dtype data is supported. Strings are store as a fixed-width using the maximum size of the appended column. Subsequent appends will truncate strings at this length.
-Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools`` are currently supported. For string columns, passing ``nan_rep = 'my_nan_rep'`` to append will change the default nan representation on disk (which converts to/from `np.nan`), this defaults to `nan`.
+Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools, datetime64`` is currently supported. For string columns, passing ``nan_rep = 'my_nan_rep'`` to append will change the default nan representation on disk (which converts to/from `np.nan`); this defaults to `nan`.

 .. ipython:: python

@@ -1114,6 +1114,7 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set
    df_mixed['string'] = 'string'
    df_mixed['int'] = 1
    df_mixed['bool'] = True
+   df_mixed['datetime64'] = Timestamp('20010102')
    df_mixed.ix[3:4,['A','B','string']] = np.nan

    store.append('df_mixed', df_mixed, min_itemsize = { 'values' : 50 })
@@ -1124,6 +1125,8 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set
    # we have provided a minimum string column size
    store.root.df_mixed.table

+It is ok to store ``np.nan`` in a ``float or string``. Storing a column with a ``np.nan`` in an ``int``, ``bool``, or ``datetime64`` column will currently raise an ``Exception``, as these columns will have been converted to ``object`` type.
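As a rough sketch of what this commit enables (the file name ``mixed.h5`` is hypothetical; assumes pandas 0.10.1 with PyTables installed)::

    import numpy as np
    from pandas import DataFrame, HDFStore, Timestamp

    store = HDFStore('mixed.h5', mode='w')
    df = DataFrame({'A': np.random.randn(8)})
    df['string'] = 'string'
    df['int'] = 1
    df['bool'] = True
    df['datetime64'] = Timestamp('20010102')

    # floats, strings, ints, bools and datetime64 all round-trip through a table
    store.append('df_mixed', df, min_itemsize={'values': 50})
    print(store.select('df_mixed').get_dtype_counts())
    store.close()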
+ Storing Multi-Index DataFrames ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index ebc343261842e..fd585d2a58ab7 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -43,6 +43,18 @@ You can designate (and index) certain columns that you want to be able to perfor # this is in-memory version of this type of selection df[(df.B > 0) & (df.string == 'foo')] +You can now store ``datetime64`` in data columns + +.. ipython:: python + + df_mixed = df.copy() + df_mixed['datetime64'] = Timestamp('20010102') + df_mixed.ix[3:4,['A','B']] = np.nan + + store.append('df_mixed', df_mixed) + df_mixed1 = store.select('df_mixed') + df_mixed1 + df_mixed1.get_dtype_counts() You can pass ``columns`` keyword to select to filter a list of the return columns, this is equivalent to passing a ``Term('columns',list_of_columns_to_filter)`` diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2c065d8940d33..184fa7fb662de 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1246,10 +1246,13 @@ def __eq__(self, other): """ compare 2 col items """ return all([ getattr(self,a,None) == getattr(other,a,None) for a in ['name','cname','dtype','pos'] ]) - def set_data(self, data): + def set_data(self, data, dtype = None): self.data = data if data is not None: - if self.dtype is None: + if dtype is not None: + self.dtype = dtype + self.set_kind() + elif self.dtype is None: self.dtype = data.dtype.name self.set_kind() @@ -1267,23 +1270,26 @@ def set_kind(self): self.kind = 'float' elif self.dtype.startswith('int'): self.kind = 'integer' + elif self.dtype.startswith('date'): + self.kind = 'datetime' def set_atom(self, block, existing_col, min_itemsize, nan_rep, **kwargs): """ create and setup my atom from the block b """ self.values = list(block.items) dtype = block.dtype.name + inferred_type = lib.infer_dtype(block.values.flatten()) - if dtype == 'object': - inferred_type = lib.infer_dtype(block.values.flatten()) - if inferred_type == 'unicode': - raise NotImplementedError("unicode is not implemented as a table column") - elif inferred_type == 'date': - raise NotImplementedError("date is not implemented as a table column") + if inferred_type == 'datetime64': + self.set_atom_datetime64(block) + elif inferred_type == 'date': + raise NotImplementedError("date is not implemented as a table column") + elif inferred_type == 'unicode': + raise NotImplementedError("unicode is not implemented as a table column") + ### this is basically a catchall; if say a datetime64 has nans then will end up here ### + elif inferred_type == 'string' or dtype == 'object': self.set_atom_string(block, existing_col, min_itemsize, nan_rep) - elif dtype == 'datetime64[ns]': - raise NotImplementedError("datetime64[ns] is not implemented as a table column") else: self.set_atom_data(block) @@ -1324,6 +1330,14 @@ def set_atom_data(self, block): self.typ = self.get_atom_data(block) self.set_data(block.values.astype(self.typ._deftype)) + def get_atom_datetime64(self, block): + return _tables().Int64Col(shape = block.shape[0]) + + def set_atom_datetime64(self, block): + self.kind = 'datetime64' + self.typ = self.get_atom_datetime64(block) + self.set_data(block.values.view('i8'),'datetime64') + @property def shape(self): return getattr(self.data,'shape',None) @@ -1354,10 +1368,21 @@ def convert(self, values, nan_rep): # convert to the correct dtype if self.dtype is not None: - try: - self.data = self.data.astype(self.dtype) - except: - self.data = self.data.astype('O') + + # reverse 
converts + if self.dtype == 'datetime64': + self.data = np.asarray(self.data, dtype='M8[ns]') + elif self.dtype == 'date': + self.data = np.array([date.fromtimestamp(v) for v in self.data], dtype=object) + elif self.dtype == 'datetime': + self.data = np.array([datetime.fromtimestamp(v) for v in self.data], + dtype=object) + else: + + try: + self.data = self.data.astype(self.dtype) + except: + self.data = self.data.astype('O') # convert nans if self.kind == 'string': @@ -1389,6 +1414,9 @@ def get_atom_string(self, block, itemsize): def get_atom_data(self, block): return getattr(_tables(),"%sCol" % self.kind.capitalize())() + def get_atom_datetime64(self, block): + return _tables().Int64Col() + class Table(object): """ represent a table: facilitate read/write of various types of tables diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index e622626569609..296e807b5ad72 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -13,7 +13,7 @@ import pandas.util.testing as tm from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal -from pandas import concat +from pandas import concat, Timestamp try: import tables @@ -482,10 +482,14 @@ def test_append_with_data_columns(self): df_dc.ix[4:6,'string'] = np.nan df_dc.ix[7:9,'string'] = 'bar' df_dc['string2'] = 'cool' - df_dc + df_dc['datetime'] = Timestamp('20010102') self.store.remove('df_dc') - self.store.append('df_dc', df_dc, data_columns = ['B','C','string','string2']) + self.store.append('df_dc', df_dc, data_columns = ['B','C','string','string2','datetime']) result = self.store.select('df_dc',[ Term('B>0') ]) + + # convert it + df_dc = df_dc.consolidate().convert_objects() + expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected) @@ -699,61 +703,74 @@ def test_table_values_dtypes_roundtrip(self): def test_table_mixed_dtypes(self): # frame - def _make_one_df(): - df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['bool1'] = df['A'] > 0 - df['bool2'] = df['B'] > 0 - df['bool3'] = True - df['int1'] = 1 - df['int2'] = 2 - return df.consolidate() - - df1 = _make_one_df() - - self.store.append('df1_mixed', df1) - tm.assert_frame_equal(self.store.select('df1_mixed'), df1) + df = tm.makeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['bool1'] = df['A'] > 0 + df['bool2'] = df['B'] > 0 + df['bool3'] = True + df['int1'] = 1 + df['int2'] = 2 + df['timestamp1'] = Timestamp('20010102') + df['timestamp2'] = Timestamp('20010103') + df['datetime1'] = datetime.datetime(2001,1,2,0,0) + df['datetime2'] = datetime.datetime(2001,1,3,0,0) + df.ix[3:6,['obj1']] = np.nan + df = df.consolidate().convert_objects() + + self.store.append('df1_mixed', df) + tm.assert_frame_equal(self.store.select('df1_mixed'), df) # panel - def _make_one_panel(): - wp = tm.makePanel() - wp['obj1'] = 'foo' - wp['obj2'] = 'bar' - wp['bool1'] = wp['ItemA'] > 0 - wp['bool2'] = wp['ItemB'] > 0 - wp['int1'] = 1 - wp['int2'] = 2 - return wp.consolidate() - p1 = _make_one_panel() - - self.store.append('p1_mixed', p1) - tm.assert_panel_equal(self.store.select('p1_mixed'), p1) + wp = tm.makePanel() + wp['obj1'] = 'foo' + wp['obj2'] = 'bar' + wp['bool1'] = wp['ItemA'] > 0 + wp['bool2'] = wp['ItemB'] > 0 + wp['int1'] = 1 + wp['int2'] = 2 + wp = wp.consolidate() + + self.store.append('p1_mixed', wp) + tm.assert_panel_equal(self.store.select('p1_mixed'), wp) # ndim - def _make_one_p4d(): - wp = tm.makePanel4D() - wp['obj1'] = 'foo' - wp['obj2'] 
= 'bar'
-            wp['bool1'] = wp['l1'] > 0
-            wp['bool2'] = wp['l2'] > 0
-            wp['int1'] = 1
-            wp['int2'] = 2
-            return wp.consolidate()
-
-        p4d = _make_one_p4d()
-        self.store.append('p4d_mixed', p4d)
-        tm.assert_panel4d_equal(self.store.select('p4d_mixed'), p4d)
+        wp = tm.makePanel4D()
+        wp['obj1'] = 'foo'
+        wp['obj2'] = 'bar'
+        wp['bool1'] = wp['l1'] > 0
+        wp['bool2'] = wp['l2'] > 0
+        wp['int1'] = 1
+        wp['int2'] = 2
+        wp = wp.consolidate()
+
+        self.store.append('p4d_mixed', wp)
+        tm.assert_panel4d_equal(self.store.select('p4d_mixed'), wp)

     def test_unimplemented_dtypes_table_columns(self):

         #### currently not supported dtypes ####

-        from pandas import Timestamp
-
-        for n,f in [ ('timestamp',Timestamp('20010102')), ('unicode',u'\u03c3'), ('datetime',datetime.datetime(2001,1,2)), ('date',datetime.date(2001,1,2)) ]:
+        for n,f in [ ('unicode',u'\u03c3'), ('date',datetime.date(2001,1,2)) ]:
             df = tm.makeDataFrame()
             df[n] = f
             self.assertRaises(NotImplementedError, self.store.append, 'df1_%s' % n, df)

+        # frame
+        df = tm.makeDataFrame()
+        df['obj1'] = 'foo'
+        df['obj2'] = 'bar'
+        df['datetime1'] = datetime.date(2001,1,2)
+        df = df.consolidate().convert_objects()
+
+        # datetime64 with nan
+        df = tm.makeDataFrame()
+        df['timestamp1'] = Timestamp('20010102')
+        df.ix[3:6,:] = np.nan
+        df = df.consolidate().convert_objects()
+        self.assertRaises(Exception, self.store.append, 'df_datetime64_with_nan', df)
+
+        # this fails because we have a date in the object block......
+        self.assertRaises(Exception, self.store.append, 'df_unimplemented', df)
+
     def test_remove(self):
         ts = tm.makeTimeSeries()
         df = tm.makeDataFrame()

From 6c2dd270e2ac039cfd5ba6b60e2175f7752c7900 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sun, 23 Dec 2012 16:22:47 -0500
Subject: [PATCH 27/35] BUG: updated tests for datetime64 detection in columns

---
 doc/source/conf.py | 1 +
 doc/source/io.rst | 7 +++++--
 pandas/io/tests/test_pytables.py | 13 +++----------
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 692c7757ee17c..6895f00414b0b 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -16,6 +16,7 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #sys.path.append(os.path.abspath('.'))
+sys.path.insert(0, '/home/jreback/pandas')
 sys.path.insert(0, os.path.abspath('../sphinxext'))

 sys.path.extend([
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 39251a6b0ec9e..adb43190fb6ca 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1115,7 +1115,10 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set
    df_mixed['int'] = 1
    df_mixed['bool'] = True
    df_mixed['datetime64'] = Timestamp('20010102')
-   df_mixed.ix[3:4,['A','B','string']] = np.nan
+
+   # make sure that we have datetime64[ns] types
+   df_mixed = df_mixed.convert_objects()
+   df_mixed.ix[3:5,['A','B','string','datetime64']] = np.nan

    store.append('df_mixed', df_mixed, min_itemsize = { 'values' : 50 })
    df_mixed1 = store.select('df_mixed')
@@ -1125,7 +1128,7 @@ Passing ``min_itemsize = { `values` : size }`` as a parameter to append will set
    # we have provided a minimum string column size
    store.root.df_mixed.table

-It is ok to store ``np.nan`` in a ``float or string``. Storing a column with a ``np.nan`` in an ``int``, ``bool``, or ``datetime64`` column will currently raise an ``Exception``, as these columns will have been converted to ``object`` type.
+It is ok to store ``np.nan`` in a ``float or string``. Make sure to do a ``convert_objects()`` on the frame before storing a ``np.nan`` in a ``datetime64`` column. Storing a column with a ``np.nan`` in an ``int`` or ``bool`` column will currently raise an ``Exception``, as these columns will have been converted to ``object`` type.

 Storing Multi-Index DataFrames
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 296e807b5ad72..da15e533adcd5 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -483,13 +483,13 @@ def test_append_with_data_columns(self):
         df_dc.ix[7:9,'string'] = 'bar'
         df_dc['string2'] = 'cool'
         df_dc['datetime'] = Timestamp('20010102')
+        df_dc = df_dc.convert_objects()
+        df_dc.ix[3:5,['A','B','datetime']] = np.nan
+
         self.store.remove('df_dc')
         self.store.append('df_dc', df_dc, data_columns = ['B','C','string','string2','datetime'])
         result = self.store.select('df_dc',[ Term('B>0') ])

-        # convert it
-        df_dc = df_dc.consolidate().convert_objects()
-
         expected = df_dc[df_dc.B > 0]
         tm.assert_frame_equal(result, expected)

@@ -761,13 +761,6 @@ def test_unimplemented_dtypes_table_columns(self):
         df['datetime1'] = datetime.date(2001,1,2)
         df = df.consolidate().convert_objects()

-        # datetime64 with nan
-        df = tm.makeDataFrame()
-        df['timestamp1'] = Timestamp('20010102')
-        df.ix[3:6,:] = np.nan
-        df = df.consolidate().convert_objects()
-        self.assertRaises(Exception, self.store.append, 'df_datetime64_with_nan', df)
-
         # this fails because we have a date in the object block......
         self.assertRaises(Exception, self.store.append, 'df_unimplemented', df)

From a130c6238d151b115f46250e42fb103a99195246 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sun, 23 Dec 2012 17:04:31 -0500
Subject: [PATCH 28/35] removed paths from conf.py

---
 doc/source/conf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 6895f00414b0b..692c7757ee17c 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -16,7 +16,6 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #sys.path.append(os.path.abspath('.'))
-sys.path.insert(0, '/home/jreback/pandas')
 sys.path.insert(0, os.path.abspath('../sphinxext'))

 sys.path.extend([

From 1a3301c538eb2240a118863e9904da4f8fe8e3da Mon Sep 17 00:00:00 2001
From: jreback
Date: Tue, 25 Dec 2012 21:14:26 -0500
Subject: [PATCH 29/35] BUG/TST: min_itemsize not working on data_columns, added more tests

---
 pandas/io/pytables.py | 4 +++-
 pandas/io/tests/test_pytables.py | 38 +++++++++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 184fa7fb662de..2a2b19dc2dd29 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -1307,7 +1307,8 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep):

         # specified min_itemsize?
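        # note (illustrative): a dict ``min_itemsize`` may key a size by this
        # column's own name, which takes precedence over the catch-all 'values'
        # key; the final itemsize is the larger of the requested minimum and
        # the longest string actually present in the data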
if isinstance(min_itemsize, dict): - itemsize = max(int(min_itemsize.get('values')),itemsize) + min_itemsize = int(min_itemsize.get(self.name) or min_itemsize.get('values') or 0) + itemsize = max(min_itemsize or 0,itemsize) # check for column in the values conflicts if existing_col is not None: @@ -1315,6 +1316,7 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep): if eci > itemsize: itemsize = eci + self.itemsize = itemsize self.kind = 'string' self.typ = self.get_atom_string(block, itemsize) self.set_data(self.convert_string_data(data, itemsize)) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index da15e533adcd5..3955830330398 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -381,11 +381,15 @@ def test_append_with_strings(self): wp = tm.makePanel() wp2 = wp.rename_axis(dict([ (x,"%s_extra" % x) for x in wp.minor_axis ]), axis = 2) + def check_col(key,name,size): + self.assert_(getattr(self.store.get_table(key).table.description,name).itemsize == size) + self.store.append('s1', wp, min_itemsize = 20) self.store.append('s1', wp2) expected = concat([ wp, wp2], axis = 2) expected = expected.reindex(minor_axis = sorted(expected.minor_axis)) tm.assert_panel_equal(self.store['s1'], expected) + check_col('s1','minor_axis',20) # test dict format self.store.append('s2', wp, min_itemsize = { 'minor_axis' : 20 }) @@ -393,6 +397,7 @@ def test_append_with_strings(self): expected = concat([ wp, wp2], axis = 2) expected = expected.reindex(minor_axis = sorted(expected.minor_axis)) tm.assert_panel_equal(self.store['s2'], expected) + check_col('s1','minor_axis',20) # apply the wrong field (similar to #1) self.store.append('s3', wp, min_itemsize = { 'major_axis' : 20 }) @@ -404,25 +409,29 @@ def test_append_with_strings(self): # avoid truncation on elements df = DataFrame([[123,'asdqwerty'], [345,'dggnhebbsdfbdfb']]) - self.store.append('df_big',df, min_itemsize = { 'values' : 1024 }) + self.store.append('df_big',df) tm.assert_frame_equal(self.store.select('df_big'), df) + check_col('df_big','values_block_1',15) # appending smaller string ok df2 = DataFrame([[124,'asdqy'], [346,'dggnhefbdfb']]) self.store.append('df_big',df2) expected = concat([ df, df2 ]) tm.assert_frame_equal(self.store.select('df_big'), expected) + check_col('df_big','values_block_1',15) # avoid truncation on elements df = DataFrame([[123,'asdqwerty'], [345,'dggnhebbsdfbdfb']]) - self.store.append('df_big2',df, min_itemsize = { 'values' : 10 }) + self.store.append('df_big2',df, min_itemsize = { 'values' : 50 }) tm.assert_frame_equal(self.store.select('df_big2'), df) + check_col('df_big2','values_block_1',50) # bigger string on next append - self.store.append('df_new',df, min_itemsize = { 'values' : 16 }) + self.store.append('df_new',df) df_new = DataFrame([[124,'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) self.assertRaises(Exception, self.store.append, 'df_new',df_new) + def test_append_with_data_columns(self): df = tm.makeTimeDataFrame() @@ -457,6 +466,29 @@ def test_append_with_data_columns(self): expected = df_new[df_new.string == 'foo'] tm.assert_frame_equal(result, expected) + # using min_itemsize and a data column + def check_col(key,name,size): + self.assert_(getattr(self.store.get_table(key).table.description,name).itemsize == size) + + self.store.remove('df') + self.store.append('df', df_new, data_columns = ['string'], min_itemsize = { 'string' : 30 }) + check_col('df','string',30) + self.store.remove('df') + 
self.store.append('df', df_new, data_columns = ['string'], min_itemsize = 30)
+        check_col('df','string',30)
+        self.store.remove('df')
+        self.store.append('df', df_new, data_columns = ['string'], min_itemsize = { 'values' : 30 })
+        check_col('df','string',30)
+
+        df_new['string2'] = 'foobarbah'
+        df_new['string_block1'] = 'foobarbah1'
+        df_new['string_block2'] = 'foobarbah2'
+        self.store.remove('df')
+        self.store.append('df', df_new, data_columns = ['string','string2'], min_itemsize = { 'string' : 30, 'string2' : 40, 'values' : 50 })
+        check_col('df','string',30)
+        check_col('df','string2',40)
+        check_col('df','values_block_1',50)
+
         # multiple data columns
         df_new = df.copy()
         df_new['string'] = 'foo'

From 2e3a3c6e42b54004885ba8d56678ba4fd68a3ca2 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 26 Dec 2012 14:38:14 -0500
Subject: [PATCH 30/35] BUG: performance issue with reconstituting string
 arrays changed to use simpler cython routine to avoid copying

---
 pandas/io/pytables.py | 5 ++---
 pandas/io/tests/test_pytables.py | 15 ++++++++++++-
 pandas/lib.pyx | 38 ++++++++++++++++++++++----------
 3 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 2a2b19dc2dd29..c736c677e7d5b 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -21,7 +21,6 @@
 from pandas.tseries.api import PeriodIndex, DatetimeIndex
 from pandas.core.common import adjoin
 from pandas.core.algorithms import match, unique, factorize
-from pandas.core.strings import str_len, _na_map
 from pandas.core.categorical import Categorical
 from pandas.core.common import _asarray_tuplesafe, _try_sort
 from pandas.core.internals import BlockManager, make_block, form_blocks
@@ -1302,7 +1302,7 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep):
         data = block.fillna(nan_rep).values

         # itemsize is the maximum length of a string (along any dimension)
-        itemsize = lib.max_len_string_array(data)
+        itemsize = lib.max_len_string_array(data.flatten())

         # specified min_itemsize?
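        # note: ``max_len_string_array`` now expects a 1-dim object array (see
        # the ``ndarray[object, ndim=1]`` signature in the lib.pyx hunk below),
        # hence the ``data.flatten()`` call above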
if isinstance(min_itemsize, dict): @@ -1388,7 +1387,7 @@ def convert(self, values, nan_rep): # convert nans if self.kind == 'string': - self.data = _na_map(lambda x: np.nan if x == nan_rep else x, self.data.flatten()).reshape(self.data.shape) + self.data = lib.array_replace_from_nan_rep(self.data.flatten(), nan_rep).reshape(self.data.shape) def get_attr(self): """ get the data for this colummn """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 3955830330398..aed81bc437e41 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -397,7 +397,7 @@ def check_col(key,name,size): expected = concat([ wp, wp2], axis = 2) expected = expected.reindex(minor_axis = sorted(expected.minor_axis)) tm.assert_panel_equal(self.store['s2'], expected) - check_col('s1','minor_axis',20) + check_col('s2','minor_axis',20) # apply the wrong field (similar to #1) self.store.append('s3', wp, min_itemsize = { 'major_axis' : 20 }) @@ -431,6 +431,19 @@ def check_col(key,name,size): df_new = DataFrame([[124,'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) self.assertRaises(Exception, self.store.append, 'df_new',df_new) + # with nans + self.store.remove('df') + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.ix[1:4,'string'] = np.nan + df['string2'] = 'bar' + df.ix[4:8,'string2'] = np.nan + df['string3'] = 'bah' + df.ix[1:,'string3'] = np.nan + self.store.append('df',df) + result = self.store.select('df') + tm.assert_frame_equal(result,df) + def test_append_with_data_columns(self): diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 014d69706e89c..94936db47f4df 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -749,23 +749,37 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, @cython.boundscheck(False) @cython.wraparound(False) -def max_len_string_array(ndarray arr): - """ return the maximum size of elements in a strnig array """ +def max_len_string_array(ndarray[object, ndim=1] arr): + """ return the maximum size of elements in a 1-dim string array """ cdef: - int i, n_i, n_j, m, l + int i, m, l + length = arr.shape[0] - n_i = arr.shape[0] m = 0 - for i from 0 <= i < n_i: - n_j = len(arr[i]) - - for j from 0 <= j < n_j: - - l = len(arr[i,j]) - if l > m: - m = l + for i from 0 <= i < length: + l = len(arr[i]) + + if l > m: + m = l + return m +@cython.boundscheck(False) +@cython.wraparound(False) +def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, object replace = None): + """ replace the values in the array with replacement if they are nan_rep; return the same array """ + + cdef int length = arr.shape[0] + cdef int i = 0 + if replace is None: + replace = np.nan + + for i from 0 <= i < length: + if arr[i] == nan_rep: + arr[i] = replace + + return arr + @cython.boundscheck(False) @cython.wraparound(False) def create_hdf_rows_2d(ndarray indexer0, From a602839f85f439730b7e7b1e2440aa082c44ad2f Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 26 Dec 2012 15:01:47 -0500 Subject: [PATCH 31/35] ENH: allow index=list of columns or True/False/None to guide index creation at append time --- pandas/io/pytables.py | 13 ++++---- pandas/io/tests/test_pytables.py | 52 ++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c736c677e7d5b..a389ccf007da7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -798,7 +798,7 @@ def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, inde 
t.write(axes=axes, obj=obj, append=append, compression=comp, **kwargs) if index: - t.create_index() + t.create_index(columns = index) def _read_ndim_table(self, group, where=None, **kwargs): t = create_table(self, group, **kwargs) @@ -811,7 +811,7 @@ def _write_frame_table(self, group, df, append=False, comp=None, axes=None, inde t = create_table(self, group, typ = 'appendable_frame' if df.index.nlevels == 1 else 'appendable_multiframe') t.write(axes=axes, obj=df, append=append, compression=comp, **kwargs) if index: - t.create_index() + t.create_index(columns = index) _read_frame_table = _read_ndim_table @@ -822,7 +822,7 @@ def _write_wide_table(self, group, panel, append=False, comp=None, axes=None, in t.write(axes=axes, obj=panel, append=append, compression=comp, **kwargs) if index: - t.create_index() + t.create_index(columns = index) _read_wide_table = _read_ndim_table @@ -1617,12 +1617,12 @@ def f(i, c): def create_index(self, columns = None, optlevel = None, kind = None): """ Create a pytables index on the specified columns - note: cannot index Time64Col() currently; PyTables must be >= 2.3.1 + note: cannot index Time64Col() currently; PyTables must be >= 2.3 Paramaters ---------- - columns : None or list_like (the indexers to index) + columns : False (don't create an index), True (create all columns index), None or list_like (the indexers to index) optlevel: optimization level (defaults to 6) kind : kind of index (defaults to 'medium') @@ -1633,9 +1633,10 @@ def create_index(self, columns = None, optlevel = None, kind = None): """ if not self.infer_axes(): return + if columns is False: return # index all indexables and data_columns - if columns is None: + if columns is None or columns is True: columns = [ a.cname for a in self.axes if a.is_data_indexable ] if not isinstance(columns, (tuple,list)): columns = [ columns ] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index aed81bc437e41..096d35922209a 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -544,42 +544,54 @@ def check_col(key,name,size): def test_create_table_index(self): + def col(t,column): + return getattr(self.store.get_table(t).table.cols,column) + # index=False wp = tm.makePanel() self.store.append('p5', wp, index=False) self.store.create_table_index('p5', columns = ['major_axis']) - - assert(self.store.handle.root.p5.table.cols.major_axis.is_indexed == True) - assert(self.store.handle.root.p5.table.cols.minor_axis.is_indexed == False) + assert(col('p5','major_axis').is_indexed == True) + assert(col('p5','minor_axis').is_indexed == False) # index=True self.store.append('p5i', wp, index=True) - - assert(self.store.handle.root.p5i.table.cols.major_axis.is_indexed == True) - assert(self.store.handle.root.p5i.table.cols.minor_axis.is_indexed == True) + assert(col('p5i','major_axis').is_indexed == True) + assert(col('p5i','minor_axis').is_indexed == True) # default optlevels - assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 6) - assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') + self.store.get_table('p5').create_index() + assert(col('p5','major_axis').index.optlevel == 6) + assert(col('p5','minor_axis').index.kind == 'medium') # let's change the indexing scheme self.store.create_table_index('p5') - assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 6) - assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') + assert(col('p5','major_axis').index.optlevel 
== 6) + assert(col('p5','minor_axis').index.kind == 'medium') self.store.create_table_index('p5', optlevel=9) - assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 9) - assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') + assert(col('p5','major_axis').index.optlevel == 9) + assert(col('p5','minor_axis').index.kind == 'medium') self.store.create_table_index('p5', kind='full') - assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 9) - assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'full') + assert(col('p5','major_axis').index.optlevel == 9) + assert(col('p5','minor_axis').index.kind == 'full') self.store.create_table_index('p5', optlevel=1, kind='light') - assert(self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 1) - assert(self.store.handle.root.p5.table.cols.major_axis.index.kind == 'light') - + assert(col('p5','major_axis').index.optlevel == 1) + assert(col('p5','minor_axis').index.kind == 'light') + + # data columns df = tm.makeTimeDataFrame() - self.store.append('f', df[:10]) - self.store.append('f', df[10:]) - self.store.create_table_index('f') + df['string'] = 'foo' + df['string2'] = 'bar' + self.store.append('f', df, data_columns=['string','string2']) + assert(col('f','index').is_indexed == True) + assert(col('f','string').is_indexed == True) + assert(col('f','string2').is_indexed == True) + + # specify index=columns + self.store.append('f2', df, index=['string'], data_columns=['string','string2']) + assert(col('f2','index').is_indexed == False) + assert(col('f2','string').is_indexed == True) + assert(col('f2','string2').is_indexed == False) # try to index a non-table self.store.put('f2', df) From 6bac89441c4f70a496c876bf9890f7e2ab2f3e12 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 26 Dec 2012 17:48:12 -0500 Subject: [PATCH 32/35] BUG: minor change in way expectedrows works (better defaults) --- pandas/io/pytables.py | 13 ++++++++++--- pandas/lib.pyx | 11 +++-------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a389ccf007da7..577f0ea990f70 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1507,6 +1507,11 @@ def validate(self, other): def nrows(self): return getattr(self.table,'nrows',None) + @property + def nrows_expected(self): + """ based on our axes, compute the expected nrows """ + return np.prod([ i.cvalues.shape[0] for i in self.index_axes ]) + @property def table(self): """ return the table group """ @@ -1868,8 +1873,10 @@ def reindex(obj, axis, filt, ordered): def create_description(self, compression = None, complevel = None, expectedrows = None): """ create the description of the table from the axes & values """ - d = dict( name = 'table', - expectedrows = expectedrows ) + # expected rows estimate + if expectedrows is None: + expectedrows = max(self.nrows_expected,10000) + d = dict( name = 'table', expectedrows = expectedrows ) # description from the axes & values d['description'] = dict([ (a.cname,a.typ) for a in self.axes ]) @@ -2097,7 +2104,7 @@ def write_data(self, chunksize): values = [ a.take_data() for a in self.values_axes ] # write the chunks - rows = np.prod([ i.shape[0] for i in indexes ]) + rows = self.nrows_expected chunks = int(rows / chunksize) + 1 for i in xrange(chunks): start_i = i*chunksize diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 94936db47f4df..39911c88e5686 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -15,7 +15,9 @@ from cpython cimport 
(PyDict_New, PyDict_GetItem, PyDict_SetItem,
                      PyDict_Contains, PyDict_Keys,
                      Py_INCREF, PyTuple_SET_ITEM,
                      PyList_Check, PyFloat_Check,
                      PyString_Check,
                      PyTuple_SetItem,
-                     PyTuple_New)
+                     PyTuple_New,
+                     PyObject_SetAttrString)
+
 cimport cpython
 isnan = np.isnan
@@ -740,13 +742,6 @@ def clean_index_list(list obj):

     return maybe_convert_objects(converted), 0

-from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
-                      PyDict_Contains, PyDict_Keys,
-                      Py_INCREF, PyTuple_SET_ITEM,
-                      PyTuple_SetItem,
-                      PyTuple_New,
-                      PyObject_SetAttrString)
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def max_len_string_array(ndarray[object, ndim=1] arr):

From e078eadebbbbab38ec0cd5b7c914c993beec67b4 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 26 Dec 2012 22:14:14 -0500
Subject: [PATCH 33/35] ENH: added unique method to store, for selecting unique
 values in an indexable or data column w/o selecting the entire table

---
 RELEASE.rst | 1 +
 doc/source/io.rst | 13 +++++++
 doc/source/v0.10.1.txt | 7 ++++
 pandas/io/pytables.py | 65 +++++++++++++++++++++++++++++---
 pandas/io/tests/test_pytables.py | 35 +++++++++++++++++
 5 files changed, 116 insertions(+), 5 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index c57678fd0f7cc..ccd27685f975a 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -42,6 +42,7 @@ pandas 0.10.1
     - added column filtering via ``columns`` keyword in select
     - added methods append_to_multiple/select_as_multiple/select_as_coordinates to do multiple-table append/selection
     - added support for datetime64 in columns
+    - added method ``unique`` to select the unique values in an indexable or data column

 **Bug fixes**

diff --git a/doc/source/io.rst b/doc/source/io.rst
index adb43190fb6ca..73822b38719a0 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1246,6 +1246,17 @@ There is some performance degredation by making lots of columns into `data colum
 Advanced Queries
 ~~~~~~~~~~~~~~~~

+**Unique**
+
+To retrieve the *unique* values of an indexable or data column, use the method ``unique``. This will, for example, enable you to get the index very quickly. Note that ``nan`` values are excluded from the result set.
+
+.. ipython:: python
+
+   store.unique('df_dc','index')
+   store.unique('df_dc','string')
+
+**Replicating or**
+
 ``not`` and ``or`` conditions are unsupported at this time; however, ``or`` operations are easy to replicate, by repeately applying the criteria to the table, and then ``concat`` the results.

 .. ipython:: python
@@ -1255,6 +1266,8 @@ Advanced Queries

    concat([ store.select('df_dc',c) for c in [ crit1, crit2 ] ])

+**Table Object**
+
 If you want to inspect the table object, retrieve via ``get_table``. You could use this progamatically to say get the number of rows in the table.

 .. ipython:: python

diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt
index fd585d2a58ab7..b8137fda540cd 100644
--- a/doc/source/v0.10.1.txt
+++ b/doc/source/v0.10.1.txt
@@ -43,6 +43,13 @@ You can designate (and index) certain columns that you want to be able to perfor
    # this is in-memory version of this type of selection
    df[(df.B > 0) & (df.string == 'foo')]

+You can retrieve the unique values in an indexable or data column.
+
+.. ipython:: python
+
+   store.unique('df','index')
+   store.unique('df','string')
+
 You can now store ``datetime64`` in data columns

.. ipython:: python

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 577f0ea990f70..4df28b59fafc1 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -378,6 +378,23 @@ def select_as_coordinates(self, key, where=None, **kwargs):
         """
         return self.get_table(key).read_coordinates(where = where, **kwargs)

+    def unique(self, key, column, **kwargs):
+        """
+        return the unique values of a single column from the table. This is generally only useful for an indexable or a data column
+
+        Parameters
+        ----------
+        key : object
+        column: the column of interest
+
+        Exceptions
+        ----------
+        raises KeyError if the column is not found (or key is not a valid store)
+        raises ValueError if the column cannot be extracted individually (it is part of a data block)
+
+        """
+        return self.get_table(key).read_column(column = column, **kwargs)
+
     def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kwargs):
         """ Retrieve pandas objects from multiple tables

@@ -1124,9 +1141,19 @@ def infer(self, table):
         return new_self

     def convert(self, values, nan_rep):
-        """ set the values from this selection """
-        self.values = Index(_maybe_convert(values[self.cname], self.kind))
-
+        """ set the values from this selection: take = take ownership """
+        try:
+            values = values[self.cname]
+        except:
+            pass
+        self.values = Index(_maybe_convert(values, self.kind))
+        return self
+
+    def take_data(self):
+        """ return the values & release the memory """
+        self.values, values = None, self.values
+        return values
+
     @property
     def attrs(self):
         return self.table._v_attrs
@@ -1365,7 +1392,11 @@ def validate_attr(self, append):

     def convert(self, values, nan_rep):
         """ set the data from this selection (and convert to the correct dtype if we can) """
-        self.set_data(values[self.cname])
+        try:
+            values = values[self.cname]
+        except:
+            pass
+        self.set_data(values)

         # convert to the correct dtype
         if self.dtype is not None:
@@ -1388,7 +1419,8 @@ def convert(self, values, nan_rep):
         # convert nans
         if self.kind == 'string':
             self.data = lib.array_replace_from_nan_rep(self.data.flatten(), nan_rep).reshape(self.data.shape)
-
+        return self
+
     def get_attr(self):
         """ get the data for this colummn """
         self.values = getattr(self.attrs,self.kind_attr,None)
@@ -1898,6 +1930,7 @@ def read(self, **kwargs):
         raise NotImplementedError("cannot read on an abstract table: subclasses should implement")

     def read_coordinates(self, where=None, **kwargs):
+        """ select coordinates (row numbers) from a table; return the coordinates object """

         # validate the version
         self.validate_version(where)
@@ -1909,6 +1942,28 @@ def read_coordinates(self, where=None, **kwargs):
         self.selection = Selection(self, where = where, **kwargs)
         return Coordinates(self.selection.select_coords(), group = self.group, where = where)

+    def read_column(self, column, **kwargs):
+        """ return a single column from the table, generally only indexables are interesting """
+
+        # validate the version
+        self.validate_version()
+
+        # infer the data kind
+        if not self.infer_axes(): return False
+
+        # find the axes
+        for a in self.axes:
+            if column == a.name:
+
+                if not a.is_data_indexable:
+                    raise ValueError("column [%s] can not be extracted individually; it is not data indexable" % column)
+
+                # column must be an indexable or a data column
+                c = getattr(self.table.cols,column)
+                return Categorical.from_array(a.convert(c[:], nan_rep = self.nan_rep).take_data()).levels
+
+        raise KeyError("column [%s] not found in the table" % column)
+
     def write(self, **kwargs):
         raise NotImplementedError("cannot write on an abstract table")

diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 096d35922209a..a594da99a269f 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -1432,6 +1432,41 @@ def test_frame_select(self):
         #self.assertRaises(Exception, self.store.select,
         #                  'frame', [crit1, crit2])

+    def test_unique(self):
+        df = tm.makeTimeDataFrame()
+
+        def check(x, y):
+            self.assert_((np.unique(x) == np.unique(y)).all() == True)
+
+        self.store.remove('df')
+        self.store.append('df', df)
+
+        # error
+        self.assertRaises(KeyError, self.store.unique, 'df','foo')
+
+        # valid
+        result = self.store.unique('df','index')
+        check(result.values,df.index.values)
+
+        # not a data indexable column
+        self.assertRaises(ValueError, self.store.unique, 'df','values_block_0')
+
+        # a data column
+        df2 = df.copy()
+        df2['string'] = 'foo'
+        self.store.append('df2',df2,data_columns = ['string'])
+        result = self.store.unique('df2','string')
+        check(result.values,df2['string'].unique())
+
+        # a data column with NaNs, result excludes the NaNs
+        df3 = df.copy()
+        df3['string'] = 'foo'
+        df3.ix[4:6,'string'] = np.nan
+        self.store.append('df3',df3,data_columns = ['string'])
+        result = self.store.unique('df3','string')
+        check(result.values,df3['string'].valid().unique())
+
     def test_coordinates(self):
         df = tm.makeTimeDataFrame()

From 6c58bf7991a3f46f73e8d8ef81b7efb245fe5d36 Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 27 Dec 2012 11:32:00 -0500
Subject: [PATCH 34/35] CLN: removed keyword 'compression' from put (replaced
 by complib), to make nomenclature consistent for compression. doc updates for
 compression

---
 RELEASE.rst | 5 +++
 doc/source/io.rst | 19 ++++++--
 pandas/io/pytables.py | 75 ++++++++++----------------------
 pandas/io/tests/test_pytables.py | 8 ++--
 4 files changed, 48 insertions(+), 59 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index ccd27685f975a..d2b9952829619 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -51,6 +51,11 @@ pandas 0.10.1
     - raise correctly on non-implemented column types (unicode/date)
     - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index is ``Int64``), (closes GH512_)

+**API Changes**
+
+  - ``HDFStore``
+    - removed keyword ``compression`` from ``put`` (replaced by keyword ``complib`` to be consistent across the library)
+
.. _GH512: https://github.com/pydata/pandas/issues/512
.. _GH1277: https://github.com/pydata/pandas/issues/1277

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 73822b38719a0..bf9c913909dee 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1327,16 +1327,27 @@ Please note that HDF5 **DOES NOT RECLAIM SPACE** in the h5 files automatically.
 Compression
 ~~~~~~~~~~~
-``PyTables`` allows the stored data to be compressed (this applies to all kinds of stores, not just tables). You can pass ``complevel=int`` for a compression level (1-9, with 0 being no compression, and the default), ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for whichever compression library you prefer. ``blosc`` offers very fast compression (its level defaults to 9), and is my most used.
+``PyTables`` allows the stored data to be compressed. This applies to all kinds of stores, not just tables.

-``PyTables`` offer better write performance when compressed after writing them, as opposed to turning on compression at the very beginning. You can use the supplied ``PyTables`` utility ``ptrepack``. In addition, ``ptrepack`` can change compression levels after the fact.
+ - Pass ``complevel=int`` for a compression level (1-9, with 0 being no compression, and the default)
+ - Pass ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for whichever compression library you prefer.
+
+``HDFStore`` will use the file-based compression scheme if no overriding ``complib`` or ``complevel`` options are provided. ``blosc`` offers very fast compression, and is the one I use most. Note that ``lzo`` and ``bzip2`` may not be installed (by Python) by default.
+
+Compression for all objects within the file
+
+ - ``store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc')``
+
+Or on-the-fly compression (this only applies to tables). You can turn off file compression for a specific table by passing ``complevel=0``
+
+ - ``store.append('df', df, complib='zlib', complevel=5)``
+
+**ptrepack**
+
+``PyTables`` offers better write performance when data is compressed after writing, as opposed to turning on compression at the very beginning. You can use the supplied ``PyTables`` utility ``ptrepack``. In addition, ``ptrepack`` can change compression levels after the fact.
+
  - ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5``

 Furthermore ``ptrepack in.h5 out.h5`` will *repack* the file to allow you to reuse previously deleted space (alternatively, one can simply remove the file and write again).

 Notes & Caveats

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 4df28b59fafc1..346dfb7c8b4ce 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -114,28 +114,7 @@ def get_store(path, mode='a', complevel=None, complib=None,

     Parameters
     ----------
-    path : string
-        File path to HDF5 file
-    mode : {'a', 'w', 'r', 'r+'}, default 'a'
-
-        ``'r'``
-            Read-only; no data can be modified.
-        ``'w'``
-            Write; a new file is created (an existing file with the same
-            name would be deleted).
-        ``'a'``
-            Append; an existing file is opened for reading and writing,
-            and if the file does not exist it is created.
-        ``'r+'``
-            It is similar to ``'a'``, but the file must already exist.
-    complevel : int, 1-9, default 0
-        If a complib is specified compression will be applied
-        where possible
-    complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None
-        If complevel is > 0 apply compression to objects written
-        in the store wherever possible
-    fletcher32 : bool, default False
-        If applying compression use the fletcher32 checksum
+    same as HDFStore

     Examples
     --------
@@ -445,8 +424,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw
         # concat and return
         return concat(objs, axis = axis, verify_integrity = True)

-    def put(self, key, value, table=False, append=False,
-            compression=None, **kwargs):
+    def put(self, key, value, table=False, append=False, **kwargs):
         """
         Store object in HDFStore

         Parameters
@@ -461,13 +439,8 @@ def put(self, key, value, table=False, append=False,
         append : boolean, default False
             For table data structures, append the input data to the existing table
-        compression : {None, 'blosc', 'lzo', 'zlib'}, default None
-            Use a compression algorithm to compress the data
-            If None, the compression settings specified in the ctor will
-            be used.
""" - self._write_to_group(key, value, table=table, append=append, - comp=compression, **kwargs) + self._write_to_group(key, value, table=table, append=append, **kwargs) def remove(self, key, where=None, start=None, stop=None): """ @@ -645,7 +618,7 @@ def _get_handler(self, op, kind): return getattr(self, '_%s_%s' % (op, kind)) def _write_to_group(self, key, value, table=False, append=False, - comp=None, **kwargs): + complib=None, **kwargs): group = self.get_node(key) if group is None: paths = key.split('/') @@ -669,11 +642,11 @@ def _write_to_group(self, key, value, table=False, append=False, kind = '%s_table' % kind handler = self._get_handler(op='write', kind=kind) wrapper = lambda value: handler(group, value, append=append, - comp=comp, **kwargs) + complib=complib, **kwargs) else: if append: raise ValueError('Can only append to Tables') - if comp: + if complib: raise ValueError('Compression only supported on Tables') handler = self._get_handler(op='write', kind=kind) @@ -808,12 +781,11 @@ def _write_wide(self, group, panel): def _read_wide(self, group, where=None, **kwargs): return Panel(self._read_block_manager(group)) - def _write_ndim_table(self, group, obj, append=False, comp=None, axes=None, index=True, **kwargs): + def _write_ndim_table(self, group, obj, append=False, axes=None, index=True, **kwargs): if axes is None: axes = _AXES_MAP[type(obj)] t = create_table(self, group, typ = 'appendable_ndim') - t.write(axes=axes, obj=obj, - append=append, compression=comp, **kwargs) + t.write(axes=axes, obj=obj, append=append, **kwargs) if index: t.create_index(columns = index) @@ -821,23 +793,22 @@ def _read_ndim_table(self, group, where=None, **kwargs): t = create_table(self, group, **kwargs) return t.read(where, **kwargs) - def _write_frame_table(self, group, df, append=False, comp=None, axes=None, index=True, **kwargs): + def _write_frame_table(self, group, df, append=False, axes=None, index=True, **kwargs): if axes is None: axes = _AXES_MAP[type(df)] t = create_table(self, group, typ = 'appendable_frame' if df.index.nlevels == 1 else 'appendable_multiframe') - t.write(axes=axes, obj=df, append=append, compression=comp, **kwargs) + t.write(axes=axes, obj=df, append=append, **kwargs) if index: t.create_index(columns = index) _read_frame_table = _read_ndim_table - def _write_wide_table(self, group, panel, append=False, comp=None, axes=None, index=True, **kwargs): + def _write_wide_table(self, group, panel, append=False, axes=None, index=True, **kwargs): if axes is None: axes = _AXES_MAP[type(panel)] t = create_table(self, group, typ = 'appendable_panel') - t.write(axes=axes, obj=panel, - append=append, compression=comp, **kwargs) + t.write(axes=axes, obj=panel, append=append, **kwargs) if index: t.create_index(columns = index) @@ -1902,7 +1873,7 @@ def reindex(obj, axis, filt, ordered): return obj - def create_description(self, compression = None, complevel = None, expectedrows = None): + def create_description(self, complib = None, complevel = None, fletcher32 = False, expectedrows = None): """ create the description of the table from the axes & values """ # expected rows estimate @@ -1913,13 +1884,12 @@ def create_description(self, compression = None, complevel = None, expectedrows # description from the axes & values d['description'] = dict([ (a.cname,a.typ) for a in self.axes ]) - if compression: - complevel = self.complevel + if complib: if complevel is None: - complevel = 9 - filters = _tables().Filters(complevel=complevel, - complib=compression, - fletcher32=self.fletcher32) + 
complevel = self.complevel or 9 + filters = _tables().Filters(complevel = complevel, + complib = complib, + fletcher32 = fletcher32 or self.fletcher32) d['filters'] = filters elif self.filters is not None: d['filters'] = self.filters @@ -2104,8 +2074,8 @@ class AppendableTable(LegacyTable): _indexables = None table_type = 'appendable' - def write(self, axes, obj, append=False, compression=None, - complevel=None, min_itemsize = None, chunksize = 50000, + def write(self, axes, obj, append=False, complib=None, + complevel=None, fletcher32=None, min_itemsize = None, chunksize = 50000, expectedrows = None, **kwargs): # create the table if it doesn't exist (or get it if it does) @@ -2119,7 +2089,10 @@ def write(self, axes, obj, append=False, compression=None, if 'table' not in self.group: # create the table - options = self.create_description(compression = compression, complevel = complevel, expectedrows = expectedrows) + options = self.create_description(complib = complib, + complevel = complevel, + fletcher32 = fletcher32, + expectedrows = expectedrows) # set the table attributes self.set_attrs() diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index a594da99a269f..6f11ebdaaa7b3 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -203,12 +203,12 @@ def test_put_string_index(self): def test_put_compression(self): df = tm.makeTimeDataFrame() - self.store.put('c', df, table=True, compression='zlib') + self.store.put('c', df, table=True, complib='zlib') tm.assert_frame_equal(self.store['c'], df) # can't compress if table=False self.assertRaises(ValueError, self.store.put, 'b', df, - table=False, compression='zlib') + table=False, complib='zlib') def test_put_compression_blosc(self): tm.skip_if_no_package('tables', '2.2', app='blosc support') @@ -216,9 +216,9 @@ def test_put_compression_blosc(self): # can't compress if table=False self.assertRaises(ValueError, self.store.put, 'b', df, - table=False, compression='blosc') + table=False, complib='blosc') - self.store.put('c', df, table=True, compression='blosc') + self.store.put('c', df, table=True, complib='blosc') tm.assert_frame_equal(self.store['c'], df) def test_put_integer(self): From 17b6c0df5195a403b97830c6e16a96197af3be24 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 28 Dec 2012 09:52:44 -0500 Subject: [PATCH 35/35] BUG: updated with smaller legacy_0.10.h5 file --- pandas/io/tests/legacy_0.10.h5 | Bin 0 -> 238321 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/io/tests/legacy_0.10.h5 diff --git a/pandas/io/tests/legacy_0.10.h5 b/pandas/io/tests/legacy_0.10.h5 new file mode 100644 index 0000000000000000000000000000000000000000..b1439ef16361abbc0756fbf7d344fd65d8a1a473 GIT binary patch literal 238321 zcmeEP1zZ(P*WW9Mh@u#X2_kkN4d&3&T_)1fDF_yJcOWK;ofw$lV1pvkCDIMjDWKo% z-Q9bENA}+5jqmsV-q%&`Y|MXW&Y3f3&YUy5yGKi1Q&3>60ETb;{1`7L#N4HRevw=K zB_8G}>X^E|n%rTK>25N;-jCQ}V0;*UZ8Ujah`j$q;&xa$dEU%KLjyxOsryqYc^Ql4 zBlf8KQ0M=o{~H|8(oiwNDUBe>WgejY%+16JO0ISX6UXa_1RIP^XYS#0$dCS+X`AUG zLBmF%KU^*;xn*WTKiYl{?kf0?D*mH}4I|~Xw_N|-`}WSSrl(?Rips<5(I5R$^;lwJ zXpYmy&X3ocAG6RfG1WFSVCKiW4wYY8QVJd6`IVR{1NAgKztK`N6;(YAQ)^8_6MYpk z>%QZevD$y#LUMe1*g z6V4v?4(rf?t(&KvtCY3BjoVrWU-q#x>U{ViX0y)OR~n^M2K*S8cuM|-@JwhkH84~s z`Z#gS&r76z{ZhEq6Z^_RoTudYup@mwiq6p+>GMq|FXxP;&o`R9ypTga`n&x7tCYTz zBKc=Twi{a?ai>QIV}xG9^$Miidf2!-;9SS^ch}PflK5F%PgC^q zqkWX0Sy5jM*R$w9+LW2U6kn*6jq6X~?48~5jD6uqb=|Ezyfl8y%f`dr#@E^}(2H43 zW8Ej2_yqj zd!qjYnK_%1w6nEp5oPbWOrnLGuh0v`NAQ&GC~*V$Egdi}8U9XxFnf8T+pnh%%h 