Commit be35684

BUG: pytables not writing rows where all-nan in a part of a block
TST: more tests with all-nan rows/columns
DOC: corrections, add warning about dropping all-nan rows
1 parent c9a7245 commit be35684

File tree

4 files changed: +93 -9 lines


RELEASE.rst (+2)

@@ -118,6 +118,7 @@ pandas 0.11.0
     underscore)
   - fixes for query parsing to correctly interpret boolean and != (GH2849_, GH2973_)
   - fixes for pathological case on SparseSeries with 0-len array and compression (GH2931_)
+  - fixes bug with writing rows if part of a block was all-nan (GH3012_)
 
   - Bug showing up in applymap where some object type columns are converted (GH2909_)
     had an incorrect default in convert_objects

@@ -165,6 +166,7 @@ pandas 0.11.0
 .. _GH2982: https://github.com/pydata/pandas/issues/2982
 .. _GH2989: https://github.com/pydata/pandas/issues/2989
 .. _GH3002: https://github.com/pydata/pandas/issues/3002
+.. _GH3012: https://github.com/pydata/pandas/issues/3012
 
 
 pandas 0.10.1
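
In a DataFrame the columns are stored internally in per-dtype blocks, and the table writer is meant to drop only rows that are NaN across every block. Before this fix it could drop rows that were all-NaN in just one block (for example, every float column NaN) even though other blocks (strings, datetimes) still held values. A minimal round-trip sketch of that scenario, assuming PyTables is installed; the file name demo_gh3012.h5 and the column names are illustrative only:

import numpy as np
import pandas as pd

# Two float columns plus string columns: rows 0-3 have an all-NaN float
# block but real string values, so they must not be dropped on append.
df = pd.DataFrame({'A1': np.random.randn(6),
                   'A2': np.random.randn(6),
                   'B': 'foo', 'C': 'bar'})
df.loc[0:3, ['A1', 'A2']] = np.nan

with pd.HDFStore('demo_gh3012.h5', mode='w') as store:
    store.append('df', df)           # table-format append
    roundtrip = store.select('df')

assert len(roundtrip) == len(df)     # with the fix, no rows go missing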

doc/source/io.rst (+11 -8)

@@ -1461,8 +1461,7 @@ beginning. You can use the supplied ``PyTables`` utility
 ``ptrepack``. In addition, ``ptrepack`` can change compression levels
 after the fact.
 
-  - ``ptrepack --chunkshape=auto --propindexes --complevel=9
-    --complib=blosc in.h5 out.h5``
+  - ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5``
 
 Furthermore ``ptrepack in.h5 out.h5`` will *repack* the file to allow
 you to reuse previously deleted space. Alternatively, one can simply

@@ -1473,6 +1472,10 @@ Notes & Caveats
 
 - Once a ``table`` is created its items (Panel) / columns (DataFrame)
   are fixed; only exactly the same columns can be appended
+- If a row has ``np.nan`` for **EVERY COLUMN** (having a ``nan``
+  in a string, or a ``NaT`` in a datetime-like column counts as having
+  a value), then those rows **WILL BE DROPPED IMPLICITLY**. This limitation
+  *may* be addressed in the future.
 - You can not append/select/delete to a non-table (table creation is
   determined on the first append, or by passing ``table=True`` in a
   put operation)
@@ -1498,13 +1501,13 @@ Notes & Caveats
 
 .. ipython:: python
 
-    store.append('wp_big_strings', wp, min_itemsize = { 'minor_axis' : 30 })
-    wp = wp.rename_axis(lambda x: x + '_big_strings', axis=2)
-    store.append('wp_big_strings', wp)
-    store.select('wp_big_strings')
+   store.append('wp_big_strings', wp, min_itemsize = { 'minor_axis' : 30 })
+   wp = wp.rename_axis(lambda x: x + '_big_strings', axis=2)
+   store.append('wp_big_strings', wp)
+   store.select('wp_big_strings')
 
-    # we have provided a minimum minor_axis indexable size
-    store.root.wp_big_strings.table
+   # we have provided a minimum minor_axis indexable size
+   store.root.wp_big_strings.table
 
 DataTypes
 ~~~~~~~~~
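
A short sketch of the caveat added above: rows that are NaN in every column are dropped implicitly when written to a table, while a string (or datetime-like) column counts as a value and keeps the row. In the pandas of this commit the drop was unconditional; later versions expose it through the dropna keyword of append, which is passed explicitly below to reproduce the documented behaviour. PyTables is assumed to be installed and caveat.h5 is just an illustrative file name:

import numpy as np
import pandas as pd

floats = pd.DataFrame({'A1': np.random.randn(5), 'A2': np.random.randn(5)})
floats.loc[0:2, :] = np.nan            # rows 0-2 are NaN in every column

with_strings = floats.copy()
with_strings['B'] = 'foo'              # a string value in every row

with pd.HDFStore('caveat.h5', mode='w') as store:
    store.append('floats', floats, dropna=True)
    store.append('strings', with_strings, dropna=True)
    print(len(store.select('floats')))    # expected: 2 -- all-NaN rows dropped
    print(len(store.select('strings')))   # expected: 5 -- string column keeps them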

pandas/io/pytables.py (+1 -1)

@@ -2577,7 +2577,7 @@ def write_data(self, chunksize):
         # consolidate masks
         mask = masks[0]
         for m in masks[1:]:
-            m = mask & m
+            mask = mask & m
 
         # the arguments
         indexes = [a.cvalues for a in self.index_axes]
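
The one-character change above is the whole bug: the per-block "all-NaN" masks are meant to be AND-ed into the running mask, but the old code assigned the result back to the loop variable m, so every mask after the first was ignored and a row that was all-NaN in only the first block was treated as all-NaN overall. A standalone NumPy sketch (the masks here are illustrative, not the library's internal data structures):

import numpy as np

# True where a row is entirely NaN within that block
masks = [np.array([True, True, False]),    # e.g. float block: rows 0-1 all NaN
         np.array([False, False, False])]  # e.g. string block: always has values

def consolidate_buggy(masks):
    mask = masks[0]
    for m in masks[1:]:
        m = mask & m           # result discarded; mask never changes
    return mask

def consolidate_fixed(masks):
    mask = masks[0]
    for m in masks[1:]:
        mask = mask & m        # keep only rows all-NaN in *every* block
    return mask

print(consolidate_buggy(masks))   # [ True  True False] -> rows 0-1 wrongly dropped
print(consolidate_fixed(masks))   # [False False False] -> every row written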

pandas/io/tests/test_pytables.py (+79)

@@ -417,6 +417,85 @@ def test_append(self):
             store.append('df', df)
             tm.assert_frame_equal(store['df'], df)
 
+    def test_append_some_nans(self):
+
+        with ensure_clean(self.path) as store:
+            df = DataFrame({'A' : Series(np.random.randn(20)).astype('int32'),
+                            'A1' : np.random.randn(20),
+                            'A2' : np.random.randn(20),
+                            'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) },
+                           index=np.arange(20))
+            # some nans
+            store.remove('df1')
+            df.ix[0:15,['A1','B','D','E']] = np.nan
+            store.append('df1', df[:10])
+            store.append('df1', df[10:])
+            tm.assert_frame_equal(store['df1'], df)
+
+            # first column
+            df1 = df.copy()
+            df1.ix[:,'A1'] = np.nan
+            store.remove('df1')
+            store.append('df1', df1[:10])
+            store.append('df1', df1[10:])
+            tm.assert_frame_equal(store['df1'], df1)
+
+            # 2nd column
+            df2 = df.copy()
+            df2.ix[:,'A2'] = np.nan
+            store.remove('df2')
+            store.append('df2', df2[:10])
+            store.append('df2', df2[10:])
+            tm.assert_frame_equal(store['df2'], df2)
+
+            # datetimes
+            df3 = df.copy()
+            df3.ix[:,'E'] = np.nan
+            store.remove('df3')
+            store.append('df3', df3[:10])
+            store.append('df3', df3[10:])
+            tm.assert_frame_equal(store['df3'], df3)
+
+            ##### THIS IS A BUG, should not drop these all-nan rows
+            ##### BUT need to store the index which we don't want to do....
+            # nan some entire rows
+            df = DataFrame({'A1' : np.random.randn(20),
+                            'A2' : np.random.randn(20)},
+                           index=np.arange(20))
+
+            store.remove('df4')
+            df.ix[0:15,:] = np.nan
+            store.append('df4', df[:10])
+            store.append('df4', df[10:])
+            tm.assert_frame_equal(store['df4'], df[-4:])
+            self.assert_(store.get_storer('df4').nrows == 4)
+
+            # nan some entire rows (string are still written!)
+            df = DataFrame({'A1' : np.random.randn(20),
+                            'A2' : np.random.randn(20),
+                            'B' : 'foo', 'C' : 'bar'},
+                           index=np.arange(20))
+
+            store.remove('df5')
+            df.ix[0:15,:] = np.nan
+            store.append('df5', df[:10])
+            store.append('df5', df[10:])
+            tm.assert_frame_equal(store['df5'], df)
+            self.assert_(store.get_storer('df5').nrows == 20)
+
+            # nan some entire rows (but since we have dates they are still written!)
+            df = DataFrame({'A1' : np.random.randn(20),
+                            'A2' : np.random.randn(20),
+                            'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) },
+                           index=np.arange(20))
+
+            store.remove('df6')
+            df.ix[0:15,:] = np.nan
+            store.append('df6', df[:10])
+            store.append('df6', df[10:])
+            tm.assert_frame_equal(store['df6'], df)
+            self.assert_(store.get_storer('df6').nrows == 20)
+
     def test_append_frame_column_oriented(self):
 
         with ensure_clean(self.path) as store:
