diff --git a/RELEASE.rst b/RELEASE.rst
index 610e9254289aa..efa4950a36bb3 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -178,6 +178,7 @@ pandas 0.11.0
     - added the method ``select_column`` to select a single column from a table as a Series.
     - deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()``
+    - ``min_itemsize`` parameter will now automatically create data_columns for passed keys

   - Downcast on pivot if possible (GH3283_), adds argument ``downcast`` to ``fillna``

diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst
index db311e9be9ecb..f2779e90f206a 100644
--- a/doc/source/cookbook.rst
+++ b/doc/source/cookbook.rst
@@ -282,6 +282,9 @@ The :ref:`HDFStores ` docs

 `Troubleshoot HDFStore exceptions
 `__

+`Setting min_itemsize with strings
+`__
+
 Storing Attributes to a group node

 .. ipython:: python

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 25c42780afd65..9001ae393d552 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1391,7 +1391,7 @@ of rows in an object.
 Multiple Table Queries
 ~~~~~~~~~~~~~~~~~~~~~~

-New in 0.10.1 are the methods ``append_to_multple`` and
+New in 0.10.1 are the methods ``append_to_multiple`` and
 ``select_as_multiple``, that can perform appending/selecting from
 multiple tables at once. The idea is to have one table (call it the
 selector table) that you index most/all of the columns, and perform your
@@ -1535,24 +1535,6 @@ Notes & Caveats
   ``tables``. The sizes of a string based indexing column
   (e.g. *columns* or *minor_axis*) are determined as the maximum size
   of the elements in that axis or by passing the parameter
-  ``min_itemsize`` on the first table creation (``min_itemsize`` can
-  be an integer or a dict of column name to an integer). If
-  subsequent appends introduce elements in the indexing axis that are
-  larger than the supported indexer, an Exception will be raised
-  (otherwise you could have a silent truncation of these indexers,
-  leading to loss of information). Just to be clear, this fixed-width
-  restriction applies to **indexables** (the indexing columns) and
-  **string values** in a mixed_type table.
-
-  .. ipython:: python
-
-     store.append('wp_big_strings', wp, min_itemsize = { 'minor_axis' : 30 })
-     wp = wp.rename_axis(lambda x: x + '_big_strings', axis=2)
-     store.append('wp_big_strings', wp)
-     store.select('wp_big_strings')
-
-     # we have provided a minimum minor_axis indexable size
-     store.root.wp_big_strings.table

 DataTypes
 ~~~~~~~~~
@@ -1589,6 +1571,34 @@ conversion may not be necessary in future versions of pandas)
    df
    df.dtypes

+String Columns
+~~~~~~~~~~~~~~
+
+The underlying implementation of ``HDFStore`` uses a fixed column width (itemsize) for string columns. A string column's itemsize is calculated as the maximum length of the data (for that column) passed to the ``HDFStore`` **in the first append**. If a subsequent append introduces a string for a column that is **larger** than the column can hold, an Exception will be raised (otherwise you could have a silent truncation of these columns, leading to loss of information). In the future we may relax this and allow a user-specified truncation to occur.
+
+Pass ``min_itemsize`` on the first table creation to a priori specify the minimum length of a particular string column. ``min_itemsize`` can be an integer, or a dict mapping a column name to an integer. You can pass ``values`` as a key to allow all *indexables* or *data_columns* to have this min_itemsize.
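+
+For example, the following (a sketch; ``store`` is assumed to be an open ``HDFStore`` and the key ``'dfv'`` unused) reserves 50 bytes for every string column by using the special ``values`` key:
+
+.. code-block:: python
+
+   df = DataFrame(dict(A = 'foo', B = 'bar'), index=range(5))
+   store.append('dfv', df, min_itemsize={'values' : 50})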
+
+Starting in 0.11, passing a ``min_itemsize`` dict will cause all passed columns to be created as *data_columns* automatically.
+
+.. note::
+
+   If you are not passing any *data_columns*, then ``min_itemsize`` will be the maximum of the length of any string passed.
+
+.. ipython:: python
+
+   dfs = DataFrame(dict(A = 'foo', B = 'bar'), index=range(5))
+   dfs
+
+   # A and B have a size of 30
+   store.append('dfs', dfs, min_itemsize = 30)
+   store.get_storer('dfs').table
+
+   # A is created as a data_column with a size of 30
+   # B's size is calculated from the data
+   store.append('dfs2', dfs, min_itemsize = { 'A' : 30 })
+   store.get_storer('dfs2').table
+
 External Compatibility
 ~~~~~~~~~~~~~~~~~~~~~~

diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
index 9c0a6d5a421c7..834b23c92d3b5 100644
--- a/doc/source/v0.11.0.txt
+++ b/doc/source/v0.11.0.txt
@@ -229,9 +229,11 @@ API changes

   - Added to_series() method to indicies, to facilitate the creation of indexers (GH3275_)

-  - In ``HDFStore``, added the method ``select_column`` to select a single column from a table as a Series.
+  - ``HDFStore``

-  - In ``HDFStore``, deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()``
+    - added the method ``select_column`` to select a single column from a table as a Series.
+    - deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()``
+    - ``min_itemsize`` parameter to ``append`` will now automatically create data_columns for passed keys

 Enhancements
 ~~~~~~~~~~~~
@@ -244,25 +246,26 @@ Enhancements

   - Bottleneck is now a :ref:`Recommended Dependencies `, to accelerate certain types of ``nan`` operations

-  - For ``HDFStore``, support ``read_hdf/to_hdf`` API similar to ``read_csv/to_csv``
+  - ``HDFStore``

-    .. ipython:: python
+    - support ``read_hdf/to_hdf`` API similar to ``read_csv/to_csv``

-       df = DataFrame(dict(A=range(5), B=range(5)))
-       df.to_hdf('store.h5','table',append=True)
-       read_hdf('store.h5', 'table', where = ['index>2'])
+      .. ipython:: python

-    .. ipython:: python
-       :suppress:
-       :okexcept:
+         df = DataFrame(dict(A=range(5), B=range(5)))
+         df.to_hdf('store.h5','table',append=True)
+         read_hdf('store.h5', 'table', where = ['index>2'])
+
+      .. ipython:: python
+         :suppress:
+         :okexcept:

-       os.remove('store.h5')
+         os.remove('store.h5')

-  - In ``HDFStore``, provide dotted attribute access to ``get`` from stores
-    (e.g. ``store.df == store['df']``)
+    - provide dotted attribute access to ``get`` from stores, e.g.
+      ``store.df == store['df']``

-  - In ``HDFStore``, new keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are
-    provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_)
+    - new keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are
+      provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_)

   - You can now select timestamps from an *unordered* timeseries similarly to an *ordered* timeseries (GH2437_)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 0568ee7f7f8bf..da4077165add2 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2181,7 +2181,7 @@ def validate_min_itemsize(self, min_itemsize):
             if k == 'values':
                 continue
             if k not in q:
-                raise ValueError("min_itemsize has [%s] which is not an axis or data_column" % k)
+                raise ValueError("min_itemsize has the key [%s] which is not an axis or data_column" % k)

     @property
     def indexables(self):
@@ -2293,6 +2293,30 @@ def get_object(self, obj):
         """ return the data for this obj """
         return obj

+    def validate_data_columns(self, data_columns, min_itemsize):
+        """ take the input data_columns and min_itemsize and create a data_columns spec """
+
+        if not len(self.non_index_axes):
+            return []
+
+        axis_labels = self.non_index_axes[0][1]
+
+        # evaluate the passed data_columns, True == use all columns
+        # take only valid axis labels
+        if data_columns is True:
+            data_columns = axis_labels
+        elif data_columns is None:
+            data_columns = []
+
+        # if min_itemsize is a dict, add its keys as data_columns (excluding the special key 'values')
+        if isinstance(min_itemsize, dict):
+
+            existing_data_columns = set(data_columns)
+            data_columns.extend([k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns])
+
+        # return only the columns that actually exist in the axis, preserving the passed ordering
+        return [c for c in data_columns if c in axis_labels]
+
     def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs):
         """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields
@@ -2380,26 +2404,18 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
         for a in self.non_index_axes:
             obj = obj.reindex_axis(a[1], axis=a[0], copy=False)

-        # get out blocks
+        # figure out data_columns and get out blocks
         block_obj = self.get_object(obj)
-        blocks = None
-
-        if data_columns is not None and len(self.non_index_axes):
-            axis = self.non_index_axes[0][0]
-            axis_labels = self.non_index_axes[0][1]
-            if data_columns is True:
-                data_columns = axis_labels
-
-            data_columns = [c for c in data_columns if c in axis_labels]
+        blocks = block_obj._data.blocks
+        if len(self.non_index_axes):
+            axis, axis_labels = self.non_index_axes[0]
+            data_columns = self.validate_data_columns(data_columns, min_itemsize)
             if len(data_columns):
                 blocks = block_obj.reindex_axis(Index(axis_labels) - Index(
-                    data_columns), axis=axis, copy=False)._data.blocks
+                    data_columns), axis=axis, copy=False)._data.blocks
                 for c in data_columns:
                     blocks.extend(block_obj.reindex_axis(
-                        [c], axis=axis, copy=False)._data.blocks)
-
-        if blocks is None:
-            blocks = block_obj._data.blocks
+                        [c], axis=axis, copy=False)._data.blocks)

         # add my values
         self.values_axes = []
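To make the merging rule in ``validate_data_columns`` concrete, here is a standalone sketch of the same logic (the ``merge_data_columns`` helper name is hypothetical, for illustration only; it is not part of this patch)::

    def merge_data_columns(data_columns, min_itemsize, axis_labels):
        # True means "use every column"; None means "none requested"
        if data_columns is True:
            data_columns = list(axis_labels)
        elif data_columns is None:
            data_columns = []

        # a min_itemsize dict implicitly requests its keys as data_columns,
        # except the special 'values' key, which sizes the string values blocks
        if isinstance(min_itemsize, dict):
            existing = set(data_columns)
            data_columns.extend(k for k in min_itemsize
                                if k != 'values' and k not in existing)

        # keep only columns that exist in the axis, preserving the passed order
        return [c for c in data_columns if c in axis_labels]

    # with columns ['A', 'B', 'C']:
    merge_data_columns(['B'], {'A': 200}, ['A', 'B', 'C'])     # -> ['B', 'A']
    merge_data_columns(None, {'values': 50}, ['A', 'B', 'C'])  # -> []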
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 6acf17b1220a7..598812373538c 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -694,25 +694,41 @@ def check_col(key,name,size):

         with ensure_clean(self.path) as store:

-            # infer the .typ on subsequent appends
+            def check_col(key, name, size):
+                self.assert_(getattr(store.get_storer(key).table.description, name).itemsize == size)
+
+            df = DataFrame(dict(A = 'foo', B = 'bar'), index=range(10))
+
+            # a min_itemsize that creates a data_column
+            store.remove('df')
+            store.append('df', df, min_itemsize={'A' : 200})
+            check_col('df', 'A', 200)
+            self.assert_(store.get_storer('df').data_columns == ['A'])
+
+            # a min_itemsize that creates a second data_column
+            store.remove('df')
+            store.append('df', df, data_columns=['B'], min_itemsize={'A' : 200})
+            check_col('df', 'A', 200)
+            self.assert_(store.get_storer('df').data_columns == ['B', 'A'])
+
+            # the special key 'values' sizes both the data_columns
+            # and the string values blocks
+            store.remove('df')
+            store.append('df', df, data_columns=['B'], min_itemsize={'values' : 200})
+            check_col('df', 'B', 200)
+            check_col('df', 'values_block_0', 200)
+            self.assert_(store.get_storer('df').data_columns == ['B'])
+
+            # infer the .typ on subsequent appends
             store.remove('df')
             store.append('df', df[:5], min_itemsize=200)
             store.append('df', df[5:], min_itemsize=200)
             tm.assert_frame_equal(store['df'], df)

             # invalid min_itemsize keys
-            df = DataFrame(['foo','foo','foo','barh','barh','barh'],columns=['A'])
-            store.remove('df')
             self.assertRaises(ValueError, store.append, 'df', df, min_itemsize={'foo' : 20, 'foobar' : 20})

-            # invalid sizes
-            store.remove('df')
-            store.append('df', df[:3], min_itemsize=3)
-            self.assertRaises(ValueError, store.append, 'df', df[3:])
-
     def test_append_with_data_columns(self):

         with ensure_clean(self.path) as store:
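A minimal end-to-end sketch of the behavior this patch adds (assumes pandas 0.11; the file name ``test.h5`` and the old-style ``Term`` query are for illustration only)::

    import pandas as pd
    from pandas.io.pytables import Term

    df = pd.DataFrame({'A': ['foo'] * 5, 'B': ['bar'] * 5})

    store = pd.HDFStore('test.h5', mode='w')

    # a min_itemsize dict now implicitly makes 'A' a data_column:
    # it gets its own 200-byte column and becomes queryable
    store.append('df', df, min_itemsize={'A': 200})
    print(store.get_storer('df').data_columns)  # ['A']

    # query on the automatically created data_column
    print(store.select('df', [Term('A', '=', 'foo')]))

    store.close()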