
Commit 4157902

Merge pull request #10097 from nickeubank/patch-1

Default values for dropna to "False" (issue 9382)

2 parents: b281e65 + 2377b5c

6 files changed: +146 lines, -8 lines

.gitignore (+1)

@@ -17,6 +17,7 @@
.idea
.vagrant
.noseids
+.ipynb_checkpoints

# Compiled source #
###################

doc/source/io.rst (+65)

@@ -2410,6 +2410,10 @@ for some advanced strategies

There is a ``PyTables`` indexing bug which may appear when querying stores using an index. If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. Stores created previously will need to be rewritten using the updated version.

+.. warning::
+
+   As of version 0.17.0, ``HDFStore`` will not drop rows that have all missing values by default. Previously, if all values (except the index) were missing, ``HDFStore`` would not write those rows to disk.
+
.. ipython:: python
   :suppress:
   :okexcept:

@@ -2486,6 +2490,8 @@ Closing a Store, Context Manager

   import os
   os.remove('store.h5')

+
+
Read/Write API
~~~~~~~~~~~~~~

@@ -2504,6 +2510,65 @@ similar to how ``read_csv`` and ``to_csv`` work. (new in 0.11.0)

   os.remove('store_tl.h5')

+As of version 0.17.0, ``HDFStore`` will no longer drop rows that are all missing by default. The previous behavior (dropping such rows) can be enabled by setting ``dropna=True``.
+
+.. ipython:: python
+   :suppress:
+
+   import os
+
+.. ipython:: python
+
+   df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2],
+                                   'col2': [1, np.nan, np.nan]})
+   df_with_missing
+
+   df_with_missing.to_hdf('file.h5', 'df_with_missing',
+                          format='table', mode='w')
+
+   pd.read_hdf('file.h5', 'df_with_missing')
+
+   df_with_missing.to_hdf('file.h5', 'df_with_missing',
+                          format='table', mode='w', dropna=True)
+   pd.read_hdf('file.h5', 'df_with_missing')
+
+.. ipython:: python
+   :suppress:
+
+   os.remove('file.h5')
+
+This is also true for the major axis of a ``Panel``:
+
+.. ipython:: python
+
+   matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]],
+             [[np.nan, np.nan, np.nan], [np.nan, 5, 6]],
+             [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]]
+
+   panel_with_major_axis_all_missing = Panel(matrix,
+                                             items=['Item1', 'Item2', 'Item3'],
+                                             major_axis=[1, 2],
+                                             minor_axis=['A', 'B', 'C'])
+
+   panel_with_major_axis_all_missing
+
+   panel_with_major_axis_all_missing.to_hdf('file.h5', 'panel',
+                                            dropna=True,
+                                            format='table',
+                                            mode='w')
+   reloaded = read_hdf('file.h5', 'panel')
+   reloaded
+
+.. ipython:: python
+   :suppress:
+
+   os.remove('file.h5')
+
+
.. _io.hdf5-fixed:

Fixed Format
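
For readers who want to try the documented change outside the Sphinx build, here is a minimal standalone sketch (illustration only, not part of this diff). It assumes pandas >= 0.17.0 with PyTables installed; the file name 'example.h5' is arbitrary.

    import numpy as np
    import pandas as pd

    # The row at index 1 is entirely NaN (the index itself is ignored).
    df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2],
                                    'col2': [1, np.nan, np.nan]})

    # New default (dropna=False): the all-NaN row is written and read back.
    df_with_missing.to_hdf('example.h5', 'df_with_missing',
                           format='table', mode='w')
    print(pd.read_hdf('example.h5', 'df_with_missing'))   # three rows

    # Pre-0.17.0 behavior on request: dropna=True drops the all-NaN row.
    df_with_missing.to_hdf('example.h5', 'df_with_missing',
                           format='table', mode='w', dropna=True)
    print(pd.read_hdf('example.h5', 'df_with_missing'))   # rows 0 and 2 only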

doc/source/whatsnew/v0.17.0.txt (+49)

@@ -337,6 +337,9 @@ Usually you simply want to know which values are null.

   None == None
   np.nan == np.nan

+
+.. _whatsnew_0170.api_breaking.other:
+
Other API Changes
^^^^^^^^^^^^^^^^^

@@ -372,6 +375,52 @@ Other API Changes

   ``raise ValueError``            All other public methods (names not beginning with underscores)
   =============================== ===============================================================

+
+- The default behavior of the ``HDFStore`` write functions with ``format='table'`` is now to keep rows that are all missing except for the index. Previously, such rows were dropped. The previous behavior can be replicated with the ``dropna=True`` option. (:issue:`9382`)
+
+  Previously,
+
+  .. ipython:: python
+
+     df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2],
+                                     'col2': [1, np.nan, np.nan]})
+
+     df_with_missing
+
+  .. code-block:: python
+
+     In [28]:
+     df_with_missing.to_hdf('file.h5', 'df_with_missing',
+                            format='table', mode='w')
+
+     pd.read_hdf('file.h5', 'df_with_missing')
+
+     Out[28]:
+        col1  col2
+     0     0     1
+     2     2   NaN
+
+  New behavior:
+
+  .. ipython:: python
+     :suppress:
+
+     import os
+
+  .. ipython:: python
+
+     df_with_missing.to_hdf('file.h5', 'df_with_missing',
+                            format='table', mode='w')
+
+     pd.read_hdf('file.h5', 'df_with_missing')
+
+  .. ipython:: python
+     :suppress:
+
+     os.remove('file.h5')
+
+  See the :ref:`documentation <io.hdf5>` for more details.
+
.. _whatsnew_0170.deprecations:

Deprecations

pandas/core/generic.py (+2)

@@ -922,6 +922,8 @@ def to_hdf(self, path_or_buf, key, **kwargs):
            in the store wherever possible
        fletcher32 : bool, default False
            If applying compression use the fletcher32 checksum
+       dropna : boolean, default False
+           If True, ALL nan rows will not be written to the store.

        """

pandas/io/pytables.py (+7 -7)

@@ -220,7 +220,7 @@ class DuplicateWarning(Warning):
"""

with config.config_prefix('io.hdf'):
-    config.register_option('dropna_table', True, dropna_doc,
+    config.register_option('dropna_table', False, dropna_doc,
                            validator=config.is_bool)
    config.register_option(
        'default_format', None, format_doc,

@@ -817,7 +817,7 @@ def put(self, key, value, format=None, append=False, **kwargs):
            This will force Table format, append the input data to the
            existing.
        encoding : default None, provide an encoding for strings
-       dropna : boolean, default True, do not write an ALL nan row to
+       dropna : boolean, default False, do not write an ALL nan row to
            the store settable by the option 'io.hdf.dropna_table'
        """
        if format is None:

@@ -899,7 +899,7 @@ def append(self, key, value, format=None, append=True, columns=None,
        chunksize : size to chunk the writing
        expectedrows : expected TOTAL row size of this table
        encoding : default None, provide an encoding for strings
-       dropna : boolean, default True, do not write an ALL nan row to
+       dropna : boolean, default False, do not write an ALL nan row to
            the store settable by the option 'io.hdf.dropna_table'
        Notes
        -----

@@ -919,7 +919,7 @@ def append(self, key, value, format=None, append=True, columns=None,
                    **kwargs)

    def append_to_multiple(self, d, value, selector, data_columns=None,
-                          axes=None, dropna=True, **kwargs):
+                          axes=None, dropna=False, **kwargs):
        """
        Append to multiple tables

@@ -934,7 +934,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None,
        data_columns : list of columns to create as data columns, or True to
            use all columns
        dropna : if evaluates to True, drop rows from all tables if any single
-           row in each table has all NaN
+           row in each table has all NaN. Default False.

        Notes
        -----

@@ -3787,7 +3787,7 @@ class AppendableTable(LegacyTable):

    def write(self, obj, axes=None, append=False, complib=None,
              complevel=None, fletcher32=None, min_itemsize=None,
-             chunksize=None, expectedrows=None, dropna=True, **kwargs):
+             chunksize=None, expectedrows=None, dropna=False, **kwargs):

        if not append and self.is_exists:
            self._handle.remove_node(self.group, 'table')

@@ -3827,7 +3827,7 @@ def write(self, obj, axes=None, append=False, complib=None,
        # add the rows
        self.write_data(chunksize, dropna=dropna)

-   def write_data(self, chunksize, dropna=True):
+   def write_data(self, chunksize, dropna=False):
        """ we form the data into a 2-d including indexes,values,mask
            write chunk-by-chunk """
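
Beyond the per-call argument, the ``put``/``append`` docstrings above say the default is "settable by the option 'io.hdf.dropna_table'". The following is a hedged sketch of that route, not part of this diff; it assumes ``HDFStore.append`` falls back to the option when ``dropna`` is not passed (as the docstring states) and that an explicit argument overrides it. The file and key names are arbitrary.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'col1': [0, np.nan, 2],
                       'col2': [1, np.nan, np.nan]})

    # Flip the global default back to the pre-0.17.0 behavior for table writes
    # that do not pass dropna explicitly.
    pd.set_option('io.hdf.dropna_table', True)

    with pd.HDFStore('demo.h5', mode='w') as store:
        store.append('dropped', df)               # option applies: all-NaN row skipped
        store.append('kept', df, dropna=False)    # explicit argument should win
        print(store.select('dropped'))            # 2 rows
        print(store.select('kept'))               # 3 rows

    pd.reset_option('io.hdf.dropna_table')        # back to the new default (False)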

pandas/io/tests/test_pytables.py (+22 -1)

@@ -1040,6 +1040,28 @@ def test_append_all_nans(self):
            store.append('df2', df[10:], dropna=False)
            tm.assert_frame_equal(store['df2'], df)

+        # Test to make sure defaults are to not drop.
+        # Corresponding to Issue 9382
+        df_with_missing = DataFrame({'col1': [0, np.nan, 2],
+                                     'col2': [1, np.nan, np.nan]})
+
+        with ensure_clean_path(self.path) as path:
+            df_with_missing.to_hdf(path, 'df_with_missing', format='table')
+            reloaded = read_hdf(path, 'df_with_missing')
+            tm.assert_frame_equal(df_with_missing, reloaded)
+
+        matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]],
+                  [[np.nan, np.nan, np.nan], [np.nan, 5, 6]],
+                  [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]]
+
+        panel_with_missing = Panel(matrix, items=['Item1', 'Item2', 'Item3'],
+                                   major_axis=[1, 2],
+                                   minor_axis=['A', 'B', 'C'])
+
+        with ensure_clean_path(self.path) as path:
+            panel_with_missing.to_hdf(path, 'panel_with_missing', format='table')
+            reloaded_panel = read_hdf(path, 'panel_with_missing')
+            tm.assert_panel_equal(panel_with_missing, reloaded_panel)
+
    def test_append_frame_column_oriented(self):

        with ensure_clean_store(self.path) as store:

@@ -4885,7 +4907,6 @@ def test_complex_append(self):
            result = store.select('df')
            assert_frame_equal(pd.concat([df, df], 0), result)

-
def _test_sort(obj):
    if isinstance(obj, DataFrame):
        return obj.reindex(sorted(obj.index))
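
The default change to ``append_to_multiple`` in pandas/io/pytables.py above is not exercised by the new tests or docs. The sketch below illustrates what it affects; it is not part of this diff, assumes PyTables is installed, and uses arbitrary file and table names, following the docstring's description of ``dropna`` ("drop rows from all tables if any single row in each table has all NaN").

    import numpy as np
    import pandas as pd

    # Only the row at index 1 is all-NaN within the 't1' column subset (A, B).
    df = pd.DataFrame({'A': [1.0, np.nan, 3.0],
                       'B': [np.nan, np.nan, 6.0],
                       'C': ['x', 'y', 'z']})

    with pd.HDFStore('multi.h5', mode='w') as store:
        # Split columns across two tables; 't2': None takes the remaining columns.
        # With the new dropna=False default, every row is written to both tables.
        store.append_to_multiple({'t1': ['A', 'B'], 't2': None}, df, selector='t1')
        print(store.select('t1'))   # 3 rows

        # dropna=True restores the old filtering: the row whose 't1' slice is
        # all NaN is dropped from every table so the pieces stay aligned.
        store.append_to_multiple({'s1': ['A', 'B'], 's2': None}, df,
                                 selector='s1', dropna=True)
        print(store.select('s1'))   # 2 rows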
