Skip to content

Commit 3073835

Browse files
committed
Merge pull request #4714 from jreback/hdf_nan
API: for HDFStore, add the keyword dropna=True to append to change whether to write ALL nan rows to the store (GH4625)
2 parents c472099 + 59d55fe commit 3073835

File tree

4 files changed

+92
-33
lines changed

4 files changed

+92
-33
lines changed

doc/source/release.rst

+3
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ pandas 0.13
109109
be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
110110
- allow a passed locations array or mask as a ``where`` condition (:issue:`4467`)
111111
- the ``fmt`` keyword now replaces the ``table`` keyword; allowed values are ``s|t``
112+
- add the keyword ``dropna=True`` to ``append`` to control whether ALL nan rows are written
113+
to the store (default is ``True``, ALL nan rows are NOT written), also settable
114+
via the option ``io.hdf.dropna_table`` (:issue:`4625`)
112115
- ``JSON``
113116

114117
- added ``date_unit`` parameter to specify resolution of timestamps. Options

doc/source/v0.13.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@ API changes
9898

9999
import os
100100
os.remove(path)
101+
- add the keyword ``dropna=True`` to ``append`` to control whether ALL nan rows are written
102+
to the store (default is ``True``, ALL nan rows are NOT written), also settable
103+
via the option ``io.hdf.dropna_table`` (:issue:`4625`)
101104

102105
- Changes to how ``Index`` and ``MultiIndex`` handle metadata (``levels``,
103106
``labels``, and ``names``) (:issue:`4039`):

pandas/io/pytables.py

+40-18
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from pandas.tools.merge import concat
3333
from pandas import compat
3434
from pandas.io.common import PerformanceWarning
35+
from pandas.core.config import get_option
3536

3637
import pandas.lib as lib
3738
import pandas.algos as algos
@@ -165,6 +166,17 @@ class DuplicateWarning(Warning):
165166
Panel4D: [1, 2, 3],
166167
}
167168

169+
# register our configuration options
170+
from pandas.core import config
171+
dropna_doc = """
172+
: boolean
173+
drop ALL nan rows when appending to a table
174+
"""
175+
176+
with config.config_prefix('io.hdf'):
177+
config.register_option('dropna_table', True, dropna_doc,
178+
validator=config.is_bool)
179+
168180
# oh the troubles to reduce import time
169181
_table_mod = None
170182
_table_supports_index = False
@@ -730,7 +742,7 @@ def remove(self, key, where=None, start=None, stop=None):
730742
'can only remove with where on objects written as tables')
731743
return s.delete(where=where, start=start, stop=stop)
732744

733-
def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
745+
def append(self, key, value, fmt=None, append=True, columns=None, dropna=None, **kwargs):
734746
"""
735747
Append to Table in file. Node must already exist and be Table
736748
format.
@@ -751,7 +763,8 @@ def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
751763
chunksize : size to chunk the writing
752764
expectedrows : expected TOTAL row size of this table
753765
encoding : default None, provide an encoding for strings
754-
766+
dropna : boolean, default True, do not write an ALL nan row to the store
767+
settable by the option 'io.hdf.dropna_table'
755768
Notes
756769
-----
757770
Does *not* check if data being appended overlaps with existing
@@ -761,8 +774,10 @@ def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
761774
raise Exception(
762775
"columns is not a supported keyword in append, try data_columns")
763776

777+
if dropna is None:
778+
dropna = get_option("io.hdf.dropna_table")
764779
kwargs = self._validate_format(fmt or 't', kwargs)
765-
self._write_to_group(key, value, append=append, **kwargs)
780+
self._write_to_group(key, value, append=append, dropna=dropna, **kwargs)
766781

767782
def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs):
768783
"""
@@ -3219,7 +3234,7 @@ class AppendableTable(LegacyTable):
32193234

32203235
def write(self, obj, axes=None, append=False, complib=None,
32213236
complevel=None, fletcher32=None, min_itemsize=None, chunksize=None,
3222-
expectedrows=None, **kwargs):
3237+
expectedrows=None, dropna=True, **kwargs):
32233238

32243239
if not append and self.is_exists:
32253240
self._handle.removeNode(self.group, 'table')
@@ -3254,29 +3269,36 @@ def write(self, obj, axes=None, append=False, complib=None,
32543269
a.validate_and_set(table, append)
32553270

32563271
# add the rows
3257-
self.write_data(chunksize)
3272+
self.write_data(chunksize, dropna=dropna)
32583273

3259-
def write_data(self, chunksize):
3274+
def write_data(self, chunksize, dropna=True):
32603275
""" we form the data into a 2-d including indexes,values,mask
32613276
write chunk-by-chunk """
32623277

32633278
names = self.dtype.names
32643279
nrows = self.nrows_expected
32653280

3266-
# create the masks & values
3267-
masks = []
3268-
for a in self.values_axes:
3281+
# if dropna==True, then drop ALL nan rows
3282+
if dropna:
3283+
3284+
masks = []
3285+
for a in self.values_axes:
3286+
3287+
# figure the mask: only do if we can successfully process this
3288+
# column, otherwise ignore the mask
3289+
mask = com.isnull(a.data).all(axis=0)
3290+
masks.append(mask.astype('u1'))
32693291

3270-
# figure the mask: only do if we can successfully process this
3271-
# column, otherwise ignore the mask
3272-
mask = com.isnull(a.data).all(axis=0)
3273-
masks.append(mask.astype('u1'))
3292+
# consolidate masks
3293+
mask = masks[0]
3294+
for m in masks[1:]:
3295+
mask = mask & m
3296+
mask = mask.ravel()
3297+
3298+
else:
32743299

3275-
# consolidate masks
3276-
mask = masks[0]
3277-
for m in masks[1:]:
3278-
mask = mask & m
3279-
mask = mask.ravel()
3300+
mask = np.empty(nrows, dtype='u1')
3301+
mask.fill(False)
32803302

32813303
# broadcast the indexes if needed
32823304
indexes = [a.cvalues for a in self.index_axes]

pandas/io/tests/test_pytables.py

+46-15
Original file line numberDiff line numberDiff line change
@@ -757,45 +757,76 @@ def test_append_some_nans(self):
757757
store.append('df3', df3[10:])
758758
tm.assert_frame_equal(store['df3'], df3)
759759

760-
##### THIS IS A BUG, should not drop these all-nan rows
761-
##### BUT need to store the index which we don't want to do....
762-
# nan some entire rows
760+
def test_append_all_nans(self):
761+
762+
with ensure_clean(self.path) as store:
763+
763764
df = DataFrame({'A1' : np.random.randn(20),
764765
'A2' : np.random.randn(20)},
765766
index=np.arange(20))
767+
df.ix[0:15,:] = np.nan
768+
769+
770+
# nan some entire rows (dropna=True)
771+
_maybe_remove(store, 'df')
772+
store.append('df', df[:10], dropna=True)
773+
store.append('df', df[10:], dropna=True)
774+
tm.assert_frame_equal(store['df'], df[-4:])
775+
776+
# nan some entire rows (dropna=False)
777+
_maybe_remove(store, 'df2')
778+
store.append('df2', df[:10], dropna=False)
779+
store.append('df2', df[10:], dropna=False)
780+
tm.assert_frame_equal(store['df2'], df)
781+
782+
# tests the option io.hdf.dropna_table
783+
pandas.set_option('io.hdf.dropna_table',False)
784+
_maybe_remove(store, 'df3')
785+
store.append('df3', df[:10])
786+
store.append('df3', df[10:])
787+
tm.assert_frame_equal(store['df3'], df)
766788

789+
pandas.set_option('io.hdf.dropna_table',True)
767790
_maybe_remove(store, 'df4')
768-
df.ix[0:15,:] = np.nan
769791
store.append('df4', df[:10])
770792
store.append('df4', df[10:])
771793
tm.assert_frame_equal(store['df4'], df[-4:])
772-
self.assert_(store.get_storer('df4').nrows == 4)
773794

774795
# nan some entire rows (string are still written!)
775796
df = DataFrame({'A1' : np.random.randn(20),
776797
'A2' : np.random.randn(20),
777798
'B' : 'foo', 'C' : 'bar'},
778799
index=np.arange(20))
779800

780-
_maybe_remove(store, 'df5')
781801
df.ix[0:15,:] = np.nan
782-
store.append('df5', df[:10])
783-
store.append('df5', df[10:])
784-
tm.assert_frame_equal(store['df5'], df)
785-
self.assert_(store.get_storer('df5').nrows == 20)
802+
803+
_maybe_remove(store, 'df')
804+
store.append('df', df[:10], dropna=True)
805+
store.append('df', df[10:], dropna=True)
806+
tm.assert_frame_equal(store['df'], df)
807+
808+
_maybe_remove(store, 'df2')
809+
store.append('df2', df[:10], dropna=False)
810+
store.append('df2', df[10:], dropna=False)
811+
tm.assert_frame_equal(store['df2'], df)
786812

787813
# nan some entire rows (but since we have dates they are still written!)
788814
df = DataFrame({'A1' : np.random.randn(20),
789815
'A2' : np.random.randn(20),
790816
'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) },
791817
index=np.arange(20))
792818

793-
_maybe_remove(store, 'df6')
794819
df.ix[0:15,:] = np.nan
795-
store.append('df6', df[:10])
796-
store.append('df6', df[10:])
797-
tm.assert_frame_equal(store['df6'], df)
798-
self.assert_(store.get_storer('df6').nrows == 20)
820+
821+
_maybe_remove(store, 'df')
822+
store.append('df', df[:10], dropna=True)
823+
store.append('df', df[10:], dropna=True)
824+
tm.assert_frame_equal(store['df'], df)
825+
826+
_maybe_remove(store, 'df2')
827+
store.append('df2', df[:10], dropna=False)
828+
store.append('df2', df[10:], dropna=False)
829+
tm.assert_frame_equal(store['df2'], df)
799830

800831
def test_append_frame_column_oriented(self):
801832

0 commit comments

Comments
 (0)