From be3568458d9e119da11ba67771de03909b98e353 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 11 Mar 2013 11:21:04 -0400 Subject: [PATCH] BUG: pytables not writing rows where all-nan in a part of a block TST: more tests with all-nan rows/columns DOC: corrections, add warning about dropping all-nan rows --- RELEASE.rst | 2 + doc/source/io.rst | 19 ++++---- pandas/io/pytables.py | 2 +- pandas/io/tests/test_pytables.py | 79 ++++++++++++++++++++++++++++++++ 4 files changed, 93 insertions(+), 9 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 2b911b0ed8170..9deafd56ccc10 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -118,6 +118,7 @@ pandas 0.11.0 underscore) - fixes for query parsing to correctly interpret boolean and != (GH2849_, GH2973_) - fixes for pathological case on SparseSeries with 0-len array and compression (GH2931_) + - fixes bug with writing rows if part of a block was all-nan (GH3012_) - Bug showing up in applymap where some object type columns are converted (GH2909_) had an incorrect default in convert_objects @@ -165,6 +166,7 @@ pandas 0.11.0 .. _GH2982: https://github.com/pydata/pandas/issues/2982 .. _GH2989: https://github.com/pydata/pandas/issues/2989 .. _GH3002: https://github.com/pydata/pandas/issues/3002 +.. _GH3012: https://github.com/pydata/pandas/issues/3012 pandas 0.10.1 diff --git a/doc/source/io.rst b/doc/source/io.rst index 914506fb0d3cd..01ed06cd6a60f 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1461,8 +1461,7 @@ beginning. You can use the supplied ``PyTables`` utility ``ptrepack``. In addition, ``ptrepack`` can change compression levels after the fact. - - ``ptrepack --chunkshape=auto --propindexes --complevel=9 - --complib=blosc in.h5 out.h5`` + - ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5`` Furthermore ``ptrepack in.h5 out.h5`` will *repack* the file to allow you to reuse previously deleted space. 
Aalternatively, one can simply @@ -1473,6 +1472,10 @@ Notes & Caveats - Once a ``table`` is created its items (Panel) / columns (DataFrame) are fixed; only exactly the same columns can be appended + - If a row has ``np.nan`` for **EVERY COLUMN** (having a ``nan`` + in a string, or a ``NaT`` in a datetime-like column counts as having + a value), then that row **WILL BE DROPPED IMPLICITLY**. This limitation + *may* be addressed in the future. - You can not append/select/delete to a non-table (table creation is determined on the first append, or by passing ``table=True`` in a put operation) @@ -1498,13 +1501,13 @@ Notes & Caveats .. ipython:: python - store.append('wp_big_strings', wp, min_itemsize = { 'minor_axis' : 30 }) - wp = wp.rename_axis(lambda x: x + '_big_strings', axis=2) - store.append('wp_big_strings', wp) - store.select('wp_big_strings') + store.append('wp_big_strings', wp, min_itemsize = { 'minor_axis' : 30 }) + wp = wp.rename_axis(lambda x: x + '_big_strings', axis=2) + store.append('wp_big_strings', wp) + store.select('wp_big_strings') - # we have provided a minimum minor_axis indexable size - store.root.wp_big_strings.table + # we have provided a minimum minor_axis indexable size + store.root.wp_big_strings.table DataTypes ~~~~~~~~~ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c635c0b231c48..6b3b36f231c1a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2577,7 +2577,7 @@ def write_data(self, chunksize): # consolidate masks mask = masks[0] for m in masks[1:]: - m = mask & m + mask = mask & m # the arguments indexes = [a.cvalues for a in self.index_axes] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 4efe87fceebc0..c3a8990962ca1 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -417,6 +417,85 @@ def test_append(self): store.append('df', df) tm.assert_frame_equal(store['df'], df) + def test_append_some_nans(self): + + with 
ensure_clean(self.path) as store: + df = DataFrame({'A' : Series(np.random.randn(20)).astype('int32'), + 'A1' : np.random.randn(20), + 'A2' : np.random.randn(20), + 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) }, + index=np.arange(20)) + # some nans + store.remove('df1') + df.ix[0:15,['A1','B','D','E']] = np.nan + store.append('df1', df[:10]) + store.append('df1', df[10:]) + tm.assert_frame_equal(store['df1'], df) + + # first column + df1 = df.copy() + df1.ix[:,'A1'] = np.nan + store.remove('df1') + store.append('df1', df1[:10]) + store.append('df1', df1[10:]) + tm.assert_frame_equal(store['df1'], df1) + + # 2nd column + df2 = df.copy() + df2.ix[:,'A2'] = np.nan + store.remove('df2') + store.append('df2', df2[:10]) + store.append('df2', df2[10:]) + tm.assert_frame_equal(store['df2'], df2) + + # datetimes + df3 = df.copy() + df3.ix[:,'E'] = np.nan + store.remove('df3') + store.append('df3', df3[:10]) + store.append('df3', df3[10:]) + tm.assert_frame_equal(store['df3'], df3) + + ##### THIS IS A BUG, should not drop these all-nan rows + ##### BUT need to store the index which we don't want to do.... + # nan some entire rows + df = DataFrame({'A1' : np.random.randn(20), + 'A2' : np.random.randn(20)}, + index=np.arange(20)) + + store.remove('df4') + df.ix[0:15,:] = np.nan + store.append('df4', df[:10]) + store.append('df4', df[10:]) + tm.assert_frame_equal(store['df4'], df[-4:]) + self.assert_(store.get_storer('df4').nrows == 4) + + # nan some entire rows (strings are still written!) + df = DataFrame({'A1' : np.random.randn(20), + 'A2' : np.random.randn(20), + 'B' : 'foo', 'C' : 'bar'}, + index=np.arange(20)) + + store.remove('df5') + df.ix[0:15,:] = np.nan + store.append('df5', df[:10]) + store.append('df5', df[10:]) + tm.assert_frame_equal(store['df5'], df) + self.assert_(store.get_storer('df5').nrows == 20) + + # nan some entire rows (but since we have dates they are still written!) 
+ df = DataFrame({'A1' : np.random.randn(20), + 'A2' : np.random.randn(20), + 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) }, + index=np.arange(20)) + + store.remove('df6') + df.ix[0:15,:] = np.nan + store.append('df6', df[:10]) + store.append('df6', df[10:]) + tm.assert_frame_equal(store['df6'], df) + self.assert_(store.get_storer('df6').nrows == 20) + def test_append_frame_column_oriented(self): with ensure_clean(self.path) as store: