
Commit c45e769

Merge pull request #3357 from jreback/hdf_fix
ENH: HDFStore now auto creates data_columns if they are specified in min_itemsize
2 parents 6e86975 + d2b2d13

6 files changed: +107 −58 lines changed


RELEASE.rst

Lines changed: 1 addition & 0 deletions
@@ -178,6 +178,7 @@ pandas 0.11.0
 
   - added the method ``select_column`` to select a single column from a table as a Series.
   - deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()``
+  - ``min_itemsize`` parameter will now automatically create data_columns for passed keys
 
   - Downcast on pivot if possible (GH3283_), adds argument ``downcast`` to ``fillna``
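The release note above is the user-visible behaviour change of this commit. A minimal sketch of what it means in practice (not part of the commit; the file name and data are illustrative, and the ``where`` string syntax shown is the modern form rather than the ``Term`` objects used in pandas 0.11):

import pandas as pd

df = pd.DataFrame({'A': ['foo'] * 5, 'B': ['bar'] * 5})

with pd.HDFStore('store_sketch.h5', mode='w') as store:
    # listing 'A' in min_itemsize now auto-creates it as a data_column
    store.append('df', df, min_itemsize={'A': 200})
    print(store.get_storer('df').data_columns)    # ['A']

    # because 'A' is a data_column, it can be used in an on-disk query
    print(store.select('df', where='A == "foo"'))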

doc/source/cookbook.rst

Lines changed: 3 additions & 0 deletions
@@ -282,6 +282,9 @@ The :ref:`HDFStores <io.hdf5>` docs
 
  `Troubleshoot HDFStore exceptions
  <http://stackoverflow.com/questions/15488809/how-to-trouble-shoot-hdfstore-exception-cannot-find-the-correct-atom-type>`__
 
+ `Setting min_itemsize with strings
+ <http://stackoverflow.com/questions/15988871/hdfstore-appendstring-dataframe-fails-when-string-column-contents-are-longer>`__
+
  Storing Attributes to a group node
 
  .. ipython:: python

doc/source/io.rst

Lines changed: 29 additions & 19 deletions
@@ -1391,7 +1391,7 @@ of rows in an object.
  Multiple Table Queries
  ~~~~~~~~~~~~~~~~~~~~~~
 
- New in 0.10.1 are the methods ``append_to_multple`` and
+ New in 0.10.1 are the methods ``append_to_multiple`` and
  ``select_as_multiple``, that can perform appending/selecting from
  multiple tables at once. The idea is to have one table (call it the
  selector table) that you index most/all of the columns, and perform your
@@ -1535,24 +1535,6 @@ Notes & Caveats
    ``tables``. The sizes of a string based indexing column
    (e.g. *columns* or *minor_axis*) are determined as the maximum size
    of the elements in that axis or by passing the parameter
-   ``min_itemsize`` on the first table creation (``min_itemsize`` can
-   be an integer or a dict of column name to an integer). If
-   subsequent appends introduce elements in the indexing axis that are
-   larger than the supported indexer, an Exception will be raised
-   (otherwise you could have a silent truncation of these indexers,
-   leading to loss of information). Just to be clear, this fixed-width
-   restriction applies to **indexables** (the indexing columns) and
-   **string values** in a mixed_type table.
-
-   .. ipython:: python
-
-      store.append('wp_big_strings', wp, min_itemsize = { 'minor_axis' : 30 })
-      wp = wp.rename_axis(lambda x: x + '_big_strings', axis=2)
-      store.append('wp_big_strings', wp)
-      store.select('wp_big_strings')
-
-      # we have provided a minimum minor_axis indexable size
-      store.root.wp_big_strings.table
 
  DataTypes
  ~~~~~~~~~
@@ -1589,6 +1571,34 @@ conversion may not be necessary in future versions of pandas)
     df
     df.dtypes
 
+ String Columns
+ ~~~~~~~~~~~~~~
+
+ The underlying implementation of ``HDFStore`` uses a fixed column width (itemsize) for string columns. A string column's itemsize is calculated as the maximum length of the data (for that column) passed to the ``HDFStore`` **in the first append**. If a subsequent append introduces a string for a column that is **larger** than the column can hold, an Exception will be raised (otherwise you could have a silent truncation of these columns, leading to loss of information). In the future we may relax this and allow a user-specified truncation to occur.
+
+ Pass ``min_itemsize`` on the first table creation to a priori specify the minimum length of a particular string column. ``min_itemsize`` can be an integer, or a dict mapping a column name to an integer. You can pass ``values`` as a key to allow all *indexables* or *data_columns* to have this min_itemsize.
+
+ Starting in 0.11, passing a ``min_itemsize`` dict will cause all passed columns to be created as *data_columns* automatically.
+
+ .. note::
+
+    If you are not passing any *data_columns*, then the ``min_itemsize`` will be the maximum of the length of any string passed.
+
+ .. ipython:: python
+
+    dfs = DataFrame(dict(A = 'foo', B = 'bar'),index=range(5))
+    dfs
+
+    # A and B have a size of 30
+    store.append('dfs', dfs, min_itemsize = 30)
+    store.get_storer('dfs').table
+
+    # A is created as a data_column with a size of 30
+    # B's size is calculated
+    store.append('dfs2', dfs, min_itemsize = { 'A' : 30 })
+    store.get_storer('dfs2').table
+
  External Compatibility
  ~~~~~~~~~~~~~~~~~~~~~~
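The special ``values`` key mentioned in the new String Columns section above is not exercised by its ipython example, but the new test further down covers it. A small illustrative sketch mirroring that test (not from the commit; names and sizes are arbitrary): with ``data_columns=['B']`` and ``min_itemsize={'values': 200}``, both the data_column ``B`` and the remaining string block receive the minimum itemsize.

import pandas as pd

dfs = pd.DataFrame({'A': ['foo'] * 5, 'B': ['bar'] * 5})

with pd.HDFStore('values_sketch.h5', mode='w') as store:
    store.append('dfs3', dfs, data_columns=['B'], min_itemsize={'values': 200})
    desc = store.get_storer('dfs3').table.description
    # both the data_column 'B' and the leftover string block are sized to 200
    print(desc.B.itemsize)               # 200
    print(desc.values_block_0.itemsize)  # 200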

doc/source/v0.11.0.txt

Lines changed: 18 additions & 15 deletions
@@ -229,9 +229,11 @@ API changes
   - Added to_series() method to indicies, to facilitate the creation of indexers
     (GH3275_)
 
- - In ``HDFStore``, added the method ``select_column`` to select a single column from a table as a Series.
+ - ``HDFStore``
 
- - In ``HDFStore``, deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()``
+   - added the method ``select_column`` to select a single column from a table as a Series.
+   - deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()``
+   - ``min_itemsize`` parameter to ``append`` will now automatically create data_columns for passed keys
 
  Enhancements
  ~~~~~~~~~~~~

@@ -244,25 +246,26 @@ Enhancements
  - Bottleneck is now a :ref:`Recommended Dependencies <install.recommended_dependencies>`, to accelerate certain
    types of ``nan`` operations
 
- - For ``HDFStore``, support ``read_hdf/to_hdf`` API similar to ``read_csv/to_csv``
+ - ``HDFStore``
 
-   .. ipython:: python
+   - support ``read_hdf/to_hdf`` API similar to ``read_csv/to_csv``
 
-      df = DataFrame(dict(A=range(5), B=range(5)))
-      df.to_hdf('store.h5','table',append=True)
-      read_hdf('store.h5', 'table', where = ['index>2'])
+     .. ipython:: python
 
-   .. ipython:: python
-      :suppress:
-      :okexcept:
+        df = DataFrame(dict(A=range(5), B=range(5)))
+        df.to_hdf('store.h5','table',append=True)
+        read_hdf('store.h5', 'table', where = ['index>2'])
+
+     .. ipython:: python
+        :suppress:
+        :okexcept:
 
-      os.remove('store.h5')
+        os.remove('store.h5')
 
- - In ``HDFStore``, provide dotted attribute access to ``get`` from stores
-   (e.g. ``store.df == store['df']``)
+   - provide dotted attribute access to ``get`` from stores, e.g. ``store.df == store['df']``
 
- - In ``HDFStore``, new keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are
-   provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_)
+   - new keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are
+     provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_)
 
  - You can now select timestamps from an *unordered* timeseries similarly to an *ordered* timeseries (GH2437_)
 
pandas/io/pytables.py

Lines changed: 32 additions & 16 deletions
@@ -2181,7 +2181,7 @@ def validate_min_itemsize(self, min_itemsize):
             if k == 'values':
                 continue
             if k not in q:
-                raise ValueError("min_itemsize has [%s] which is not an axis or data_column" % k)
+                raise ValueError("min_itemsize has the key [%s] which is not an axis or data_column" % k)
 
     @property
     def indexables(self):
@@ -2293,6 +2293,30 @@ def get_object(self, obj):
         """ return the data for this obj """
         return obj
 
+    def validate_data_columns(self, data_columns, min_itemsize):
+        """ take the input data_columns and min_itemsize and create a data_columns spec """
+
+        if not len(self.non_index_axes):
+            return []
+
+        axis_labels = self.non_index_axes[0][1]
+
+        # evaluate the passed data_columns, True == use all columns
+        # take only valid axis labels
+        if data_columns is True:
+            data_columns = axis_labels
+        elif data_columns is None:
+            data_columns = []
+
+        # if min_itemsize is a dict, add the keys (exclude 'values')
+        if isinstance(min_itemsize, dict):
+
+            existing_data_columns = set(data_columns)
+            data_columns.extend([k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns])
+
+        # return valid columns in the order of our axis
+        return [c for c in data_columns if c in axis_labels]
+
     def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs):
         """ create and return the axes
         leagcy tables create an indexable column, indexable index, non-indexable fields
@@ -2380,26 +2404,18 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
         for a in self.non_index_axes:
             obj = obj.reindex_axis(a[1], axis=a[0], copy=False)
 
-        # get out blocks
+        # figure out data_columns and get out blocks
         block_obj = self.get_object(obj)
-        blocks = None
-
-        if data_columns is not None and len(self.non_index_axes):
-            axis = self.non_index_axes[0][0]
-            axis_labels = self.non_index_axes[0][1]
-            if data_columns is True:
-                data_columns = axis_labels
-
-            data_columns = [c for c in data_columns if c in axis_labels]
+        blocks = block_obj._data.blocks
+        if len(self.non_index_axes):
+            axis, axis_labels = self.non_index_axes[0]
+            data_columns = self.validate_data_columns(data_columns, min_itemsize)
             if len(data_columns):
                 blocks = block_obj.reindex_axis(Index(axis_labels) - Index(
-                    data_columns), axis=axis, copy=False)._data.blocks
+                    data_columns), axis=axis, copy=False)._data.blocks
                 for c in data_columns:
                     blocks.extend(block_obj.reindex_axis(
-                        [c], axis=axis, copy=False)._data.blocks)
-
-        if blocks is None:
-            blocks = block_obj._data.blocks
+                        [c], axis=axis, copy=False)._data.blocks)
 
         # add my values
         self.values_axes = []
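To summarize the ``create_axes`` change above: the data_columns spec is now derived in one place, and every key of a ``min_itemsize`` dict (other than the special ``'values'`` key) is promoted to a data_column. A standalone sketch of that merging logic (plain Python, not the library code; ``axis_labels`` stands in for the table's non-index axis labels):

def merge_data_columns(data_columns, min_itemsize, axis_labels):
    # True means "make every column a data_column"; None means none were requested
    if data_columns is True:
        data_columns = list(axis_labels)
    elif data_columns is None:
        data_columns = []

    # keys of a min_itemsize dict (except 'values') are promoted to data_columns
    if isinstance(min_itemsize, dict):
        existing = set(data_columns)
        data_columns.extend(k for k in min_itemsize if k != 'values' and k not in existing)

    # keep only labels that actually exist on the axis
    return [c for c in data_columns if c in axis_labels]

print(merge_data_columns(None, {'A': 200}, ['A', 'B']))        # ['A']
print(merge_data_columns(['B'], {'A': 200}, ['A', 'B']))       # ['B', 'A']
print(merge_data_columns(['B'], {'values': 200}, ['A', 'B']))  # ['B']

The three calls mirror the expectations asserted in the new test cases in test_pytables.py below.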

pandas/io/tests/test_pytables.py

Lines changed: 24 additions & 8 deletions
@@ -694,25 +694,41 @@ def check_col(key,name,size):
 
         with ensure_clean(self.path) as store:
 
-            # infer the .typ on subsequent appends
+            def check_col(key,name,size):
+                self.assert_(getattr(store.get_storer(key).table.description,name).itemsize == size)
+
             df = DataFrame(dict(A = 'foo', B = 'bar'),index=range(10))
+
+            # a min_itemsize that creates a data_column
+            store.remove('df')
+            store.append('df', df, min_itemsize={'A' : 200 })
+            check_col('df', 'A', 200)
+            self.assert_(store.get_storer('df').data_columns == ['A'])
+
+            # a min_itemsize that creates a second data_column
+            store.remove('df')
+            store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 })
+            check_col('df', 'A', 200)
+            self.assert_(store.get_storer('df').data_columns == ['B','A'])
+
+            # a min_itemsize with the special 'values' key sizes the data_column and the values block
+            store.remove('df')
+            store.append('df', df, data_columns = ['B'], min_itemsize={'values' : 200 })
+            check_col('df', 'B', 200)
+            check_col('df', 'values_block_0', 200)
+            self.assert_(store.get_storer('df').data_columns == ['B'])
+
+            # infer the .typ on subsequent appends
             store.remove('df')
             store.append('df', df[:5], min_itemsize=200)
             store.append('df', df[5:], min_itemsize=200)
             tm.assert_frame_equal(store['df'], df)
 
             # invalid min_itemsize keys
-
             df = DataFrame(['foo','foo','foo','barh','barh','barh'],columns=['A'])
-
             store.remove('df')
             self.assertRaises(ValueError, store.append, 'df', df, min_itemsize={'foo' : 20, 'foobar' : 20})
 
-            # invalid sizes
-            store.remove('df')
-            store.append('df', df[:3], min_itemsize=3)
-            self.assertRaises(ValueError, store.append, 'df', df[3:])
-
     def test_append_with_data_columns(self):
 
         with ensure_clean(self.path) as store:
