From 9ab8c23e50cfadd45995cc3ce5885ebd32e9f68a Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Fri, 13 Feb 2015 11:39:51 -0800 Subject: [PATCH 1/9] Default values for dropna to "False" (issue 9382) PLEASE REVIEW: This is my commit to a major project, and would appreciate a quick once over! As per discussion in Issue 9382, changes all HDF functions from having default of dropping all rows with NA in all non-index rows. --- pandas/io/pytables.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 05510f655f7be..7b695a82fc711 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -219,7 +219,7 @@ class DuplicateWarning(Warning): """ with config.config_prefix('io.hdf'): - config.register_option('dropna_table', True, dropna_doc, + config.register_option('dropna_table', False, dropna_doc, validator=config.is_bool) config.register_option( 'default_format', None, format_doc, @@ -801,8 +801,8 @@ def put(self, key, value, format=None, append=False, **kwargs): This will force Table format, append the input data to the existing. encoding : default None, provide an encoding for strings - dropna : boolean, default True, do not write an ALL nan row to - the store settable by the option 'io.hdf.dropna_table' + dropna : boolean, default False. if True do not write an ALL nan + row to the store. Settable by the option 'io.hdf.dropna_table' """ if format is None: format = get_option("io.hdf.default_format") or 'fixed' @@ -883,8 +883,8 @@ def append(self, key, value, format=None, append=True, columns=None, chunksize : size to chunk the writing expectedrows : expected TOTAL row size of this table encoding : default None, provide an encoding for strings - dropna : boolean, default True, do not write an ALL nan row to - the store settable by the option 'io.hdf.dropna_table' + dropna : boolean, default False. If true, do not write an ALL nan + row to the store. 
settable by the option 'io.hdf.dropna_table' Notes ----- Does *not* check if data being appended overlaps with existing @@ -903,7 +903,7 @@ def append(self, key, value, format=None, append=True, columns=None, **kwargs) def append_to_multiple(self, d, value, selector, data_columns=None, - axes=None, dropna=True, **kwargs): + axes=None, dropna=False, **kwargs): """ Append to multiple tables @@ -918,7 +918,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None, data_columns : list of columns to create as data columns, or True to use all columns dropna : if evaluates to True, drop rows from all tables if any single - row in each table has all NaN + row in each table has all NaN. Default False. Notes ----- @@ -3740,7 +3740,7 @@ class AppendableTable(LegacyTable): def write(self, obj, axes=None, append=False, complib=None, complevel=None, fletcher32=None, min_itemsize=None, - chunksize=None, expectedrows=None, dropna=True, **kwargs): + chunksize=None, expectedrows=None, dropna=False, **kwargs): if not append and self.is_exists: self._handle.remove_node(self.group, 'table') @@ -3777,7 +3777,7 @@ def write(self, obj, axes=None, append=False, complib=None, # add the rows self.write_data(chunksize, dropna=dropna) - def write_data(self, chunksize, dropna=True): + def write_data(self, chunksize, dropna=False): """ we form the data into a 2-d including indexes,values,mask write chunk-by-chunk """ From 1d7808c9e6b78d1c073d194eceee46675e0b9964 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Fri, 13 Feb 2015 14:04:21 -0800 Subject: [PATCH 2/9] Update v0.16.0.txt --- doc/source/whatsnew/v0.16.0.txt | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index bcae3327828bf..c0a89b5fe7b8f 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -130,6 +130,33 @@ methods (:issue:`9088`). 
d 7 dtype: int64 + - default behavior for HDF write functions is now to keep rows that are all missing except for index. (:issue:`9382`) + + Previously, + + .. ipython:: python + In [1]: myFile = HDFStore('file.hdf') + seriesWithMissingRow = pd.Series([0, np.nan, 2], index = ['user1', 'user2', 'user3']) + myFile.append('fileKey', seriesWithMissingRow, append = False) + myFile['fileKey'] + + Out[1]: + user1 0 + user3 2 + dtype: float64 + + New behavior: + .. ipython:: python + In [2]: myFile = HDFStore('file.hdf') + seriesWithMissingRow = pd.Series([0, np.nan, 2], index = ['user1', 'user2', 'user3']) + myFile.append('fileKey', seriesWithMissingRow, append = False) + myFile['fileKey'] + + Out[2]: + user1 0 + user2 NaN + user3 2 + dtype: float64 Deprecations From 66dfc6b4c8e49c765195554cffa7b01a4c1bbd88 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Mon, 16 Feb 2015 19:19:10 -0800 Subject: [PATCH 3/9] Update v0.16.0.txt --- doc/source/whatsnew/v0.16.0.txt | 63 ++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index c0a89b5fe7b8f..b161469f6c663 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -130,33 +130,54 @@ methods (:issue:`9088`). d 7 dtype: int64 - - default behavior for HDF write functions is now to keep rows that are all missing except for index. (:issue:`9382`) +- default behavior for HDF write functions with "table" format is now to keep rows that are all missing except for index. (:issue:`9382`) Previously, + + .. 
code-block:: python + In [1]: + df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, 3]}) + df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 't') + + df_without_missing = pd.DataFrame({'col1':[0, -1, 2], 'col2':[1, -1, 3]}) + df_without_missing.to_hdf('file.h5', 'df_without_missing') + + print(pd.read_hdf('file.h5', 'df_with_missing')) + print(pd.read_hdf('file.h5', 'df_without_missing')) - .. ipython:: python - In [1]: myFile = HDFStore('file.hdf') - seriesWithMissingRow = pd.Series([0, np.nan, 2], index = ['user1', 'user2', 'user3']) - myFile.append('fileKey', seriesWithMissingRow, append = False) - myFile['fileKey'] + Out [1]: + col1 col2 + 0 0 1 + 2 2 3 + col1 col2 + 0 0 1 + 1 -1 -1 + 2 2 3 - Out[1]: - user1 0 - user3 2 - dtype: float64 - New behavior: - .. ipython:: python - In [2]: myFile = HDFStore('file.hdf') - seriesWithMissingRow = pd.Series([0, np.nan, 2], index = ['user1', 'user2', 'user3']) - myFile.append('fileKey', seriesWithMissingRow, append = False) - myFile['fileKey'] - Out[2]: - user1 0 - user2 NaN - user3 2 - dtype: float64 +New behavior: do + + .. 
code-block:: python + In [1]: + df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, 3]}) + df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 't') + + df_without_missing = pd.DataFrame({'col1':[0, -1, 2], 'col2':[1, -1, 3]}) + df_without_missing.to_hdf('file.h5', 'df_without_missing') + + print(pd.read_hdf('file.h5', 'df_with_missing')) + print(pd.read_hdf('file.h5', 'df_without_missing')) + + Out [2]: + col1 col2 + 0 0 1 + 1 NaN NaN + 2 2 3 + col1 col2 + 0 0 1 + 1 -1 -1 + 2 2 3 Deprecations From 3e2a718ad044a7ea6810645c8b51a07e1ae0dc82 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Mon, 16 Feb 2015 19:30:34 -0800 Subject: [PATCH 4/9] Test for change of default setting for dropna --- pandas/io/tests/test_pytables.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index e95d46f66f17f..ff6b49ccfb622 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4678,6 +4678,13 @@ def test_duplicate_column_name(self): other = read_hdf(path, 'df') tm.assert_frame_equal(df, other) + def test_all_missing_values(self): + df_with_missing = pd.DataFrame({'col1':[np.nan]}) + + with ensure_clean_path(self.path) as path: + df_with_missing.to_hdf(path, 'df_with_missing', format = 't') + reloaded = pd.read_hdf(path, 'df_with_missing') + tm.assert_frame_equal(df_with_missing, reloaded) def _test_sort(obj): if isinstance(obj, DataFrame): From de022a92057ec652f09fe0af10b436a37128194d Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Tue, 17 Feb 2015 11:56:47 -0800 Subject: [PATCH 5/9] dropped pd. prefix for pandas operations. 
--- pandas/io/tests/test_pytables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index ff6b49ccfb622..1a825df1a626a 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4679,11 +4679,11 @@ def test_duplicate_column_name(self): tm.assert_frame_equal(df, other) def test_all_missing_values(self): - df_with_missing = pd.DataFrame({'col1':[np.nan]}) + df_with_missing = DataFrame({'col1':[np.nan]}) with ensure_clean_path(self.path) as path: df_with_missing.to_hdf(path, 'df_with_missing', format = 't') - reloaded = pd.read_hdf(path, 'df_with_missing') + reloaded = read_hdf(path, 'df_with_missing') tm.assert_frame_equal(df_with_missing, reloaded) def _test_sort(obj): From 5f2eae8e239dfee150ff00fd679b2eed9412e72a Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Tue, 17 Feb 2015 12:03:13 -0800 Subject: [PATCH 6/9] More complicated data frame object. --- pandas/io/tests/test_pytables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 1a825df1a626a..88261bf458cdc 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4679,7 +4679,7 @@ def test_duplicate_column_name(self): tm.assert_frame_equal(df, other) def test_all_missing_values(self): - df_with_missing = DataFrame({'col1':[np.nan]}) + df_with_missing = DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, np.nan]}) with ensure_clean_path(self.path) as path: df_with_missing.to_hdf(path, 'df_with_missing', format = 't') From 892835b285a2a0ecd43791973cf7afb565988a36 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Tue, 17 Feb 2015 16:54:13 -0800 Subject: [PATCH 7/9] add issue number in comment --- pandas/io/tests/test_pytables.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 
88261bf458cdc..736d8a5b2bb90 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4679,6 +4679,7 @@ def test_duplicate_column_name(self): tm.assert_frame_equal(df, other) def test_all_missing_values(self): + # Test corresponding to Issue 9382 df_with_missing = DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, np.nan]}) with ensure_clean_path(self.path) as path: From 137c4c0b6545e4244d51e12ce7409129fc622e60 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Tue, 17 Feb 2015 16:58:18 -0800 Subject: [PATCH 8/9] Updated to reflect suggested changes by Jeff --- doc/source/whatsnew/v0.16.0.txt | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index b161469f6c663..1e08ece251fdc 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -142,8 +142,8 @@ methods (:issue:`9088`). df_without_missing = pd.DataFrame({'col1':[0, -1, 2], 'col2':[1, -1, 3]}) df_without_missing.to_hdf('file.h5', 'df_without_missing') - print(pd.read_hdf('file.h5', 'df_with_missing')) - print(pd.read_hdf('file.h5', 'df_without_missing')) + pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf('file.h5', 'df_without_missing') Out [1]: col1 col2 @@ -156,28 +156,17 @@ methods (:issue:`9088`). -New behavior: do +New behavior: - .. code-block:: python - In [1]: + .. 
ipython:: python df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, 3]}) df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 't') df_without_missing = pd.DataFrame({'col1':[0, -1, 2], 'col2':[1, -1, 3]}) df_without_missing.to_hdf('file.h5', 'df_without_missing') - print(pd.read_hdf('file.h5', 'df_with_missing')) - print(pd.read_hdf('file.h5', 'df_without_missing')) - - Out [2]: - col1 col2 - 0 0 1 - 1 NaN NaN - 2 2 3 - col1 col2 - 0 0 1 - 1 -1 -1 - 2 2 3 + pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf('file.h5', 'df_without_missing') Deprecations From 1a119d24cd6a70124fd796566bb676ee4e4b5636 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Thu, 26 Mar 2015 19:32:02 -0700 Subject: [PATCH 9/9] moved docs to whatsnew 16.1 from 16.0 --- doc/source/whatsnew/v0.16.1.txt | 40 +++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 3c3742c968642..012a35d9d2f6e 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -29,6 +29,46 @@ API changes +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- default behavior for HDF write functions with "table" format is now to keep rows that are all missing except for index. (:issue:`9382`) + + Previously, + + .. code-block:: python + In [1]: + df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, 3]}) + df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 't') + + df_without_missing = pd.DataFrame({'col1':[0, -1, 2], 'col2':[1, -1, 3]}) + df_without_missing.to_hdf('file.h5', 'df_without_missing') + + pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf('file.h5', 'df_without_missing') + + Out [1]: + col1 col2 + 0 0 1 + 2 2 3 + col1 col2 + 0 0 1 + 1 -1 -1 + 2 2 3 + + + +New behavior: + + .. 
ipython:: python + df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, 3]}) + df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 't') + + df_without_missing = pd.DataFrame({'col1':[0, -1, 2], 'col2':[1, -1, 3]}) + df_without_missing.to_hdf('file.h5', 'df_without_missing') + + pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf('file.h5', 'df_without_missing') +