Merge pull request #4714 from jreback/hdf_nan

jreback · jreback · commit 307383568332 · 2013-08-31T07:09:35.000-07:00
API: for HDFStore, add the keyword dropna=True to append to control whether ALL nan rows are written to the store (GH4625)
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -109,6 +109,9 @@ pandas 0.13
       be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
     - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`)
     - the ``fmt`` keyword now replaces the ``table`` keyword; allowed values are ``s|t``
+    - add the keyword ``dropna=True`` to ``append`` to control whether rows that are ALL nan
+      are written to the store (default is ``True``: ALL nan rows are NOT written); the default
+      is also settable via the option ``io.hdf.dropna_table`` (:issue:`4625`)
   - ``JSON``
 
     - added ``date_unit`` parameter to specify resolution of timestamps. Options
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -98,6 +98,9 @@ API changes
 
          import os
          os.remove(path)
+    - add the keyword ``dropna=True`` to ``append`` to control whether rows that are ALL nan
+      are written to the store (default is ``True``: ALL nan rows are NOT written); the default
+      is also settable via the option ``io.hdf.dropna_table`` (:issue:`4625`)
 
   - Changes to how ``Index`` and ``MultiIndex`` handle metadata (``levels``,
     ``labels``, and ``names``) (:issue:`4039`):
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -32,6 +32,7 @@
 from pandas.tools.merge import concat
 from pandas import compat
 from pandas.io.common import PerformanceWarning
+from pandas.core.config import get_option
 
 import pandas.lib as lib
 import pandas.algos as algos
@@ -165,6 +166,17 @@ class DuplicateWarning(Warning):
     Panel4D: [1, 2, 3],
 }
 
+# register our configuration options
+from pandas.core import config
+dropna_doc = """
+: boolean
+    drop ALL nan rows when appending to a table
+"""
+
+with config.config_prefix('io.hdf'):
+    config.register_option('dropna_table', True, dropna_doc,
+                           validator=config.is_bool)
+
 # oh the troubles to reduce import time
 _table_mod = None
 _table_supports_index = False
@@ -730,7 +742,7 @@ def remove(self, key, where=None, start=None, stop=None):
                     'can only remove with where on objects written as tables')
             return s.delete(where=where, start=start, stop=stop)
 
-    def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
+    def append(self, key, value, fmt=None, append=True, columns=None, dropna=None, **kwargs):
         """
         Append to Table in file. Node must already exist and be Table
         format.
@@ -751,7 +763,8 @@ def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
         chunksize    : size to chunk the writing
         expectedrows : expected TOTAL row size of this table
         encoding     : default None, provide an encoding for strings
-
+        dropna       : boolean, default True, do not write an ALL nan row to the store
+                       settable by the option 'io.hdf.dropna_table'
         Notes
         -----
         Does *not* check if data being appended overlaps with existing
@@ -761,8 +774,10 @@ def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
             raise Exception(
                 "columns is not a supported keyword in append, try data_columns")
 
+        if dropna is None:
+            dropna = get_option("io.hdf.dropna_table")
         kwargs = self._validate_format(fmt or 't', kwargs)
-        self._write_to_group(key, value, append=append, **kwargs)
+        self._write_to_group(key, value, append=append, dropna=dropna, **kwargs)
 
     def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs):
         """
@@ -3219,7 +3234,7 @@ class AppendableTable(LegacyTable):
 
     def write(self, obj, axes=None, append=False, complib=None,
               complevel=None, fletcher32=None, min_itemsize=None, chunksize=None,
-              expectedrows=None, **kwargs):
+              expectedrows=None, dropna=True, **kwargs):
 
         if not append and self.is_exists:
             self._handle.removeNode(self.group, 'table')
@@ -3254,29 +3269,36 @@ def write(self, obj, axes=None, append=False, complib=None,
             a.validate_and_set(table, append)
 
         # add the rows
-        self.write_data(chunksize)
+        self.write_data(chunksize, dropna=dropna)
 
-    def write_data(self, chunksize):
+    def write_data(self, chunksize, dropna=True):
         """ we form the data into a 2-d including indexes,values,mask
             write chunk-by-chunk """
 
         names = self.dtype.names
         nrows = self.nrows_expected
 
-        # create the masks & values
-        masks = []
-        for a in self.values_axes:
+        # if dropna==True, then drop ALL nan rows
+        if dropna:
+
+            masks = []
+            for a in self.values_axes:
+
+                # figure the mask: only do if we can successfully process this
+                # column, otherwise ignore the mask
+                mask = com.isnull(a.data).all(axis=0)
+                masks.append(mask.astype('u1'))
 
-            # figure the mask: only do if we can successfully process this
-            # column, otherwise ignore the mask
-            mask = com.isnull(a.data).all(axis=0)
-            masks.append(mask.astype('u1'))
+            # consolidate masks
+            mask = masks[0]
+            for m in masks[1:]:
+                mask = mask & m
+            mask = mask.ravel()
+
+        else:
 
-        # consolidate masks
-        mask = masks[0]
-        for m in masks[1:]:
-            mask = mask & m
-        mask = mask.ravel()
+            mask = np.empty(nrows, dtype='u1')
+            mask.fill(False)
 
         # broadcast the indexes if needed
         indexes = [a.cvalues for a in self.index_axes]
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -757,45 +757,76 @@ def test_append_some_nans(self):
             store.append('df3', df3[10:])
             tm.assert_frame_equal(store['df3'], df3)
 
-            ##### THIS IS A BUG, should not drop these all-nan rows
-            ##### BUT need to store the index which we don't want to do....
-            # nan some entire rows
+    def test_append_all_nans(self):
+
+        with ensure_clean(self.path) as store:
+
             df = DataFrame({'A1' : np.random.randn(20),
                             'A2' : np.random.randn(20)},
                            index=np.arange(20))
+            df.ix[0:15,:] = np.nan
+
+
+            # nan some entire rows (dropna=True)
+            _maybe_remove(store, 'df')
+            store.append('df', df[:10], dropna=True)
+            store.append('df', df[10:], dropna=True)
+            tm.assert_frame_equal(store['df'], df[-4:])
+
+            # nan some entire rows (dropna=False)
+            _maybe_remove(store, 'df2')
+            store.append('df2', df[:10], dropna=False)
+            store.append('df2', df[10:], dropna=False)
+            tm.assert_frame_equal(store['df2'], df)
+
+            # tests the option io.hdf.dropna_table
+            pandas.set_option('io.hdf.dropna_table',False)
+            _maybe_remove(store, 'df3')
+            store.append('df3', df[:10])
+            store.append('df3', df[10:])
+            tm.assert_frame_equal(store['df3'], df)
 
+            pandas.set_option('io.hdf.dropna_table',True)
             _maybe_remove(store, 'df4')
-            df.ix[0:15,:] = np.nan
             store.append('df4', df[:10])
             store.append('df4', df[10:])
             tm.assert_frame_equal(store['df4'], df[-4:])
-            self.assert_(store.get_storer('df4').nrows == 4)
 
             # nan some entire rows (string are still written!)
             df = DataFrame({'A1' : np.random.randn(20),
                             'A2' : np.random.randn(20),
                             'B' : 'foo', 'C' : 'bar'},
                            index=np.arange(20))
 
-            _maybe_remove(store, 'df5')
             df.ix[0:15,:] = np.nan
-            store.append('df5', df[:10])
-            store.append('df5', df[10:])
-            tm.assert_frame_equal(store['df5'], df)
-            self.assert_(store.get_storer('df5').nrows == 20)
+
+            _maybe_remove(store, 'df')
+            store.append('df', df[:10], dropna=True)
+            store.append('df', df[10:], dropna=True)
+            tm.assert_frame_equal(store['df'], df)
+
+            _maybe_remove(store, 'df2')
+            store.append('df2', df[:10], dropna=False)
+            store.append('df2', df[10:], dropna=False)
+            tm.assert_frame_equal(store['df2'], df)
 
             # nan some entire rows (but since we have dates they are still written!)
             df = DataFrame({'A1' : np.random.randn(20),
                             'A2' : np.random.randn(20),
                             'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) },
                            index=np.arange(20))
 
-            _maybe_remove(store, 'df6')
             df.ix[0:15,:] = np.nan
-            store.append('df6', df[:10])
-            store.append('df6', df[10:])
-            tm.assert_frame_equal(store['df6'], df)
-            self.assert_(store.get_storer('df6').nrows == 20)
+
+            _maybe_remove(store, 'df')
+            store.append('df', df[:10], dropna=True)
+            store.append('df', df[10:], dropna=True)
+            tm.assert_frame_equal(store['df'], df)
+
+            _maybe_remove(store, 'df2')
+            store.append('df2', df[:10], dropna=False)
+            store.append('df2', df[10:], dropna=False)
+            tm.assert_frame_equal(store['df2'], df)
 
     def test_append_frame_column_oriented(self):