Skip to content

Default values for dropna to "False" (issue 9382) #9484

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
40 changes: 40 additions & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,46 @@ API changes



Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- The default behavior of HDF write functions with the "table" format is now to keep rows in which every value other than the index is missing. (:issue:`9382`)

Previously,

.. code-block:: python
In [1]:
df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, 3]})
df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 't')

df_without_missing = pd.DataFrame({'col1':[0, -1, 2], 'col2':[1, -1, 3]})
df_without_missing.to_hdf('file.h5', 'df_without_missing')

pd.read_hdf('file.h5', 'df_with_missing')
pd.read_hdf('file.h5', 'df_without_missing')

Out [1]:
col1 col2
0 0 1
2 2 3
col1 col2
0 0 1
1 -1 -1
2 2 3



New behavior:

.. ipython:: python
df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, 3]})
df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 't')

df_without_missing = pd.DataFrame({'col1':[0, -1, 2], 'col2':[1, -1, 3]})
df_without_missing.to_hdf('file.h5', 'df_without_missing')

pd.read_hdf('file.h5', 'df_with_missing')
pd.read_hdf('file.h5', 'df_without_missing')




Expand Down
18 changes: 9 additions & 9 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ class DuplicateWarning(Warning):
"""

with config.config_prefix('io.hdf'):
config.register_option('dropna_table', True, dropna_doc,
config.register_option('dropna_table', False, dropna_doc,
validator=config.is_bool)
config.register_option(
'default_format', None, format_doc,
Expand Down Expand Up @@ -801,8 +801,8 @@ def put(self, key, value, format=None, append=False, **kwargs):
This will force Table format, append the input data to the
existing.
encoding : default None, provide an encoding for strings
dropna : boolean, default True, do not write an ALL nan row to
the store settable by the option 'io.hdf.dropna_table'
dropna : boolean, default False. If True, do not write an ALL nan
row to the store. Settable by the option 'io.hdf.dropna_table'
"""
if format is None:
format = get_option("io.hdf.default_format") or 'fixed'
Expand Down Expand Up @@ -883,8 +883,8 @@ def append(self, key, value, format=None, append=True, columns=None,
chunksize : size to chunk the writing
expectedrows : expected TOTAL row size of this table
encoding : default None, provide an encoding for strings
dropna : boolean, default True, do not write an ALL nan row to
the store settable by the option 'io.hdf.dropna_table'
dropna : boolean, default False. If True, do not write an ALL nan
row to the store. Settable by the option 'io.hdf.dropna_table'
Notes
-----
Does *not* check if data being appended overlaps with existing
Expand All @@ -903,7 +903,7 @@ def append(self, key, value, format=None, append=True, columns=None,
**kwargs)

def append_to_multiple(self, d, value, selector, data_columns=None,
axes=None, dropna=True, **kwargs):
axes=None, dropna=False, **kwargs):
"""
Append to multiple tables

Expand All @@ -918,7 +918,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None,
data_columns : list of columns to create as data columns, or True to
use all columns
dropna : if evaluates to True, drop rows from all tables if any single
row in each table has all NaN
row in each table has all NaN. Default False.

Notes
-----
Expand Down Expand Up @@ -3741,7 +3741,7 @@ class AppendableTable(LegacyTable):

def write(self, obj, axes=None, append=False, complib=None,
complevel=None, fletcher32=None, min_itemsize=None,
chunksize=None, expectedrows=None, dropna=True, **kwargs):
chunksize=None, expectedrows=None, dropna=False, **kwargs):

if not append and self.is_exists:
self._handle.remove_node(self.group, 'table')
Expand Down Expand Up @@ -3778,7 +3778,7 @@ def write(self, obj, axes=None, append=False, complib=None,
# add the rows
self.write_data(chunksize, dropna=dropna)

def write_data(self, chunksize, dropna=True):
def write_data(self, chunksize, dropna=False):
""" we form the data into a 2-d including indexes,values,mask
write chunk-by-chunk """

Expand Down
8 changes: 8 additions & 0 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4588,6 +4588,14 @@ def test_duplicate_column_name(self):
other = read_hdf(path, 'df')
tm.assert_frame_equal(df, other)

def test_all_missing_values(self):
# Test corresponding to Issue 9382
df_with_missing = DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, np.nan]})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add the issue number as a comment


with ensure_clean_path(self.path) as path:
df_with_missing.to_hdf(path, 'df_with_missing', format = 't')
reloaded = read_hdf(path, 'df_with_missing')
tm.assert_frame_equal(df_with_missing, reloaded)

def _test_sort(obj):
if isinstance(obj, DataFrame):
Expand Down