
Commit 4157902

Merge pull request #10097 from nickeubank/patch-1

Default values for dropna to "False" (issue 9382)

2 parents: b281e65 + 2377b5c

6 files changed: +146 lines, -8 lines

.gitignore (+1)

@@ -17,6 +17,7 @@
.idea
.vagrant
.noseids
+.ipynb_checkpoints

# Compiled source #
###################

doc/source/io.rst (+65)

@@ -2410,6 +2410,10 @@ for some advanced strategies

There is a ``PyTables`` indexing bug which may appear when querying stores using an index. If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. Stores created previously will need to be rewritten using the updated version.

+.. warning::
+
+   As of version 0.17.0, ``HDFStore`` will not drop rows that have all missing values by default. Previously, if all values (except the index) were missing, ``HDFStore`` would not write those rows to disk.
+
.. ipython:: python
   :suppress:
   :okexcept:

@@ -2486,6 +2490,8 @@ Closing a Store, Context Manager

   import os
   os.remove('store.h5')

+
+
Read/Write API
~~~~~~~~~~~~~~

@@ -2504,6 +2510,65 @@ similar to how ``read_csv`` and ``to_csv`` work. (new in 0.11.0)

   os.remove('store_tl.h5')

+As of version 0.17.0, ``HDFStore`` will no longer drop rows that are all missing by default. The previous behavior (dropping such rows) can be enabled by setting ``dropna=True``.
+
+.. ipython:: python
+   :suppress:
+
+   import os
+
+.. ipython:: python
+
+   df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2],
+                                   'col2': [1, np.nan, np.nan]})
+   df_with_missing
+
+   df_with_missing.to_hdf('file.h5', 'df_with_missing',
+                          format='table', mode='w')
+
+   pd.read_hdf('file.h5', 'df_with_missing')
+
+   df_with_missing.to_hdf('file.h5', 'df_with_missing',
+                          format='table', mode='w', dropna=True)
+   pd.read_hdf('file.h5', 'df_with_missing')
+
+.. ipython:: python
+   :suppress:
+
+   os.remove('file.h5')
+
+This is also true for the major axis of a ``Panel``:
+
+.. ipython:: python
+
+   matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]],
+             [[np.nan, np.nan, np.nan], [np.nan, 5, 6]],
+             [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]]
+
+   panel_with_major_axis_all_missing = Panel(matrix,
+                                             items=['Item1', 'Item2', 'Item3'],
+                                             major_axis=[1, 2],
+                                             minor_axis=['A', 'B', 'C'])
+
+   panel_with_major_axis_all_missing
+
+   panel_with_major_axis_all_missing.to_hdf('file.h5', 'panel',
+                                            dropna=True,
+                                            format='table',
+                                            mode='w')
+   reloaded = read_hdf('file.h5', 'panel')
+   reloaded
+
+.. ipython:: python
+   :suppress:
+
+   os.remove('file.h5')
+
+
.. _io.hdf5-fixed:

Fixed Format
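
For readers who want to try the documented change outside the Sphinx build, here is a minimal standalone sketch (illustration only, not part of this diff). It assumes pandas >= 0.17.0 with PyTables installed; the file name 'example.h5' is arbitrary.

    import numpy as np
    import pandas as pd

    # The row at index 1 is entirely NaN (the index itself is ignored).
    df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2],
                                    'col2': [1, np.nan, np.nan]})

    # New default (dropna=False): the all-NaN row is written and read back.
    df_with_missing.to_hdf('example.h5', 'df_with_missing',
                           format='table', mode='w')
    print(pd.read_hdf('example.h5', 'df_with_missing'))   # three rows

    # Pre-0.17.0 behavior on request: dropna=True drops the all-NaN row.
    df_with_missing.to_hdf('example.h5', 'df_with_missing',
                           format='table', mode='w', dropna=True)
    print(pd.read_hdf('example.h5', 'df_with_missing'))   # rows 0 and 2 only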

doc/source/whatsnew/v0.17.0.txt (+49)

@@ -337,6 +337,9 @@ Usually you simply want to know which values are null.

   None == None
   np.nan == np.nan

+
+.. _whatsnew_0170.api_breaking.other:
+
Other API Changes
^^^^^^^^^^^^^^^^^

@@ -372,6 +375,52 @@ Other API Changes

   ``raise ValueError``            All other public methods (names not beginning with underscores)
   =============================== ===============================================================

+
+- The default behavior of the ``HDFStore`` write functions with ``format='table'`` is now to keep rows that are all missing except for the index. Previously, such rows were dropped. The previous behavior can be replicated with the ``dropna=True`` option. (:issue:`9382`)
+
+  Previously,
+
+  .. ipython:: python
+
+     df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2],
+                                     'col2': [1, np.nan, np.nan]})
+
+     df_with_missing
+
+  .. code-block:: python
+
+     In [28]:
+     df_with_missing.to_hdf('file.h5', 'df_with_missing',
+                            format='table', mode='w')
+
+     pd.read_hdf('file.h5', 'df_with_missing')
+
+     Out[28]:
+        col1  col2
+     0     0     1
+     2     2   NaN
+
+  New behavior:
+
+  .. ipython:: python
+     :suppress:
+
+     import os
+
+  .. ipython:: python
+
+     df_with_missing.to_hdf('file.h5', 'df_with_missing',
+                            format='table', mode='w')
+
+     pd.read_hdf('file.h5', 'df_with_missing')
+
+  .. ipython:: python
+     :suppress:
+
+     os.remove('file.h5')
+
+  See the :ref:`documentation <io.hdf5>` for more details.
+
.. _whatsnew_0170.deprecations:

Deprecations

pandas/core/generic.py (+2)

@@ -922,6 +922,8 @@ def to_hdf(self, path_or_buf, key, **kwargs):
            in the store wherever possible
        fletcher32 : bool, default False
            If applying compression use the fletcher32 checksum
+       dropna : boolean, default False
+           If True, ALL nan rows will not be written to the store.

        """

pandas/io/pytables.py (+7 -7)

@@ -220,7 +220,7 @@ class DuplicateWarning(Warning):
"""

with config.config_prefix('io.hdf'):
-    config.register_option('dropna_table', True, dropna_doc,
+    config.register_option('dropna_table', False, dropna_doc,
                            validator=config.is_bool)
    config.register_option(
        'default_format', None, format_doc,

@@ -817,7 +817,7 @@ def put(self, key, value, format=None, append=False, **kwargs):
            This will force Table format, append the input data to the
            existing.
        encoding : default None, provide an encoding for strings
-       dropna : boolean, default True, do not write an ALL nan row to
+       dropna : boolean, default False, do not write an ALL nan row to
            the store settable by the option 'io.hdf.dropna_table'
        """
        if format is None:

@@ -899,7 +899,7 @@ def append(self, key, value, format=None, append=True, columns=None,
        chunksize : size to chunk the writing
        expectedrows : expected TOTAL row size of this table
        encoding : default None, provide an encoding for strings
-       dropna : boolean, default True, do not write an ALL nan row to
+       dropna : boolean, default False, do not write an ALL nan row to
            the store settable by the option 'io.hdf.dropna_table'
        Notes
        -----

@@ -919,7 +919,7 @@ def append(self, key, value, format=None, append=True, columns=None,
                    **kwargs)

    def append_to_multiple(self, d, value, selector, data_columns=None,
-                          axes=None, dropna=True, **kwargs):
+                          axes=None, dropna=False, **kwargs):
        """
        Append to multiple tables

@@ -934,7 +934,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None,
        data_columns : list of columns to create as data columns, or True to
            use all columns
        dropna : if evaluates to True, drop rows from all tables if any single
-           row in each table has all NaN
+           row in each table has all NaN. Default False.

        Notes
        -----

@@ -3787,7 +3787,7 @@ class AppendableTable(LegacyTable):

    def write(self, obj, axes=None, append=False, complib=None,
              complevel=None, fletcher32=None, min_itemsize=None,
-             chunksize=None, expectedrows=None, dropna=True, **kwargs):
+             chunksize=None, expectedrows=None, dropna=False, **kwargs):

        if not append and self.is_exists:
            self._handle.remove_node(self.group, 'table')

@@ -3827,7 +3827,7 @@ def write(self, obj, axes=None, append=False, complib=None,
        # add the rows
        self.write_data(chunksize, dropna=dropna)

-   def write_data(self, chunksize, dropna=True):
+   def write_data(self, chunksize, dropna=False):
        """ we form the data into a 2-d including indexes,values,mask
            write chunk-by-chunk """
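
Beyond the per-call argument, the ``put``/``append`` docstrings above say the default is "settable by the option 'io.hdf.dropna_table'". The following is a hedged sketch of that route, not part of this diff; it assumes ``HDFStore.append`` falls back to the option when ``dropna`` is not passed (as the docstring states) and that an explicit argument overrides it. The file and key names are arbitrary.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'col1': [0, np.nan, 2],
                       'col2': [1, np.nan, np.nan]})

    # Flip the global default back to the pre-0.17.0 behavior for table writes
    # that do not pass dropna explicitly.
    pd.set_option('io.hdf.dropna_table', True)

    with pd.HDFStore('demo.h5', mode='w') as store:
        store.append('dropped', df)               # option applies: all-NaN row skipped
        store.append('kept', df, dropna=False)    # explicit argument should win
        print(store.select('dropped'))            # 2 rows
        print(store.select('kept'))               # 3 rows

    pd.reset_option('io.hdf.dropna_table')        # back to the new default (False)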

pandas/io/tests/test_pytables.py (+22 -1)

@@ -1040,6 +1040,28 @@ def test_append_all_nans(self):
            store.append('df2', df[10:], dropna=False)
            tm.assert_frame_equal(store['df2'], df)

+        # Test to make sure defaults are to not drop.
+        # Corresponding to Issue 9382
+        df_with_missing = DataFrame({'col1': [0, np.nan, 2],
+                                     'col2': [1, np.nan, np.nan]})
+
+        with ensure_clean_path(self.path) as path:
+            df_with_missing.to_hdf(path, 'df_with_missing', format='table')
+            reloaded = read_hdf(path, 'df_with_missing')
+            tm.assert_frame_equal(df_with_missing, reloaded)
+
+        matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]],
+                  [[np.nan, np.nan, np.nan], [np.nan, 5, 6]],
+                  [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]]
+
+        panel_with_missing = Panel(matrix, items=['Item1', 'Item2', 'Item3'],
+                                   major_axis=[1, 2],
+                                   minor_axis=['A', 'B', 'C'])
+
+        with ensure_clean_path(self.path) as path:
+            panel_with_missing.to_hdf(path, 'panel_with_missing', format='table')
+            reloaded_panel = read_hdf(path, 'panel_with_missing')
+            tm.assert_panel_equal(panel_with_missing, reloaded_panel)
+
    def test_append_frame_column_oriented(self):

        with ensure_clean_store(self.path) as store:

@@ -4885,7 +4907,6 @@ def test_complex_append(self):
            result = store.select('df')
            assert_frame_equal(pd.concat([df, df], 0), result)

-
def _test_sort(obj):
    if isinstance(obj, DataFrame):
        return obj.reindex(sorted(obj.index))
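
The default change to ``append_to_multiple`` in pandas/io/pytables.py above is not exercised by the new tests or docs. The sketch below illustrates what it affects; it is not part of this diff, assumes PyTables is installed, and uses arbitrary file and table names, following the docstring's description of ``dropna`` ("drop rows from all tables if any single row in each table has all NaN").

    import numpy as np
    import pandas as pd

    # Only the row at index 1 is all-NaN within the 't1' column subset (A, B).
    df = pd.DataFrame({'A': [1.0, np.nan, 3.0],
                       'B': [np.nan, np.nan, 6.0],
                       'C': ['x', 'y', 'z']})

    with pd.HDFStore('multi.h5', mode='w') as store:
        # Split columns across two tables; 't2': None takes the remaining columns.
        # With the new dropna=False default, every row is written to both tables.
        store.append_to_multiple({'t1': ['A', 'B'], 't2': None}, df, selector='t1')
        print(store.select('t1'))   # 3 rows

        # dropna=True restores the old filtering: the row whose 't1' slice is
        # all NaN is dropped from every table so the pieces stay aligned.
        store.append_to_multiple({'s1': ['A', 'B'], 's2': None}, df,
                                 selector='s1', dropna=True)
        print(store.select('s1'))   # 2 rows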
