Skip to content

Commit 83fa1a3

Browse files
author
Chang She
committed
ENH: sparse data structures in HDFStore. #85
1 parent e22ede3 commit 83fa1a3

File tree

3 files changed

+182
-11
lines changed

3 files changed

+182
-11
lines changed

pandas/io/pytables.py

+120-5
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from pandas import (
1313
Series, TimeSeries, DataFrame, Panel, Index, MultiIndex, Int64Index
1414
)
15+
from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel
16+
from pandas.sparse.array import BlockIndex, IntIndex
1517
from pandas.tseries.api import PeriodIndex, DatetimeIndex
1618
from pandas.core.common import adjoin
1719
from pandas.core.algorithms import match, unique
@@ -28,17 +30,23 @@
2830
# reading and writing the full object in one go
2931
_TYPE_MAP = {
3032
Series : 'series',
33+
SparseSeries : 'sparse_series',
3134
TimeSeries : 'series',
3235
DataFrame : 'frame',
33-
Panel : 'wide'
36+
SparseDataFrame : 'sparse_frame',
37+
Panel : 'wide',
38+
SparsePanel : 'sparse_panel'
3439
}
3540

3641
_NAME_MAP = {
3742
'series' : 'Series',
3843
'time_series' : 'TimeSeries',
44+
'sparse_series' : 'SparseSeries',
3945
'frame' : 'DataFrame',
46+
'sparse_frame' : 'SparseDataFrame',
4047
'frame_table' : 'DataFrame (Table)',
4148
'wide' : 'Panel',
49+
'sparse_panel' : 'SparsePanel',
4250
'wide_table' : 'Panel (Table)',
4351
'long' : 'LongPanel',
4452
# legacy h5 files
@@ -406,6 +414,78 @@ def _write_series(self, group, series):
406414
self._write_array(group, 'values', series.values)
407415
group._v_attrs.name = series.name
408416

417+
def _write_sparse_series(self, group, series):
418+
self._write_index(group, 'index', series.index)
419+
self._write_index(group, 'sp_index', series.sp_index)
420+
self._write_array(group, 'sp_values', series.sp_values)
421+
group._v_attrs.name = series.name
422+
group._v_attrs.fill_value = series.fill_value
423+
group._v_attrs.kind = series.kind
424+
425+
def _read_sparse_series(self, group, where=None):
    """Rebuild a SparseSeries from the nodes and attributes of ``group``.

    ``where`` is accepted but not used by this method.
    """
    index = self._read_index(group, 'index')
    sp_values = _read_array(group, 'sp_values')
    sp_index = self._read_index(group, 'sp_index')

    # Each attribute falls back to a default when it is absent from the
    # group: no name, no explicit fill value, and the 'block' kind.
    attrs = group._v_attrs
    name = getattr(attrs, 'name', None)
    fill_value = getattr(attrs, 'fill_value', None)
    kind = getattr(attrs, 'kind', 'block')

    return SparseSeries(sp_values, index=index, sparse_index=sp_index,
                        kind=kind, fill_value=fill_value,
                        name=name)
435+
436+
def _write_sparse_frame(self, group, sdf):
437+
for name, ss in sdf.iteritems():
438+
key = 'sparse_series_%s' % name
439+
if key not in group._v_children:
440+
node = self.handle.createGroup(group, key)
441+
else:
442+
node = getattr(group, key)
443+
self._write_sparse_series(node, ss)
444+
setattr(group._v_attrs, 'default_fill_value',
445+
sdf.default_fill_value)
446+
setattr(group._v_attrs, 'default_kind',
447+
sdf.default_kind)
448+
self._write_index(group, 'columns', sdf.columns)
449+
450+
def _read_sparse_frame(self, group, where=None):
    """Rebuild a SparseDataFrame from the children of ``group``.

    The ``columns`` index decides which ``sparse_series_<column>`` child
    groups are read. ``default_kind`` and ``default_fill_value`` must be
    present as group attributes; no fallback is applied, so a missing
    attribute raises. ``where`` is accepted but not used by this method.
    """
    columns = self._read_index(group, 'columns')

    sdict = {}
    for c in columns:
        node = getattr(group, 'sparse_series_%s' % c)
        sdict[c] = self._read_sparse_series(node)

    attrs = group._v_attrs
    default_kind = attrs.default_kind
    default_fill_value = attrs.default_fill_value
    return SparseDataFrame(sdict, columns=columns,
                           default_kind=default_kind,
                           default_fill_value=default_fill_value)
462+
463+
def _write_sparse_panel(self, group, swide):
464+
setattr(group._v_attrs, 'default_fill_value', swide.default_fill_value)
465+
setattr(group._v_attrs, 'default_kind', swide.default_kind)
466+
self._write_index(group, 'items', swide.items)
467+
468+
for name, sdf in swide.iteritems():
469+
key = 'sparse_frame_%s' % name
470+
if key not in group._v_children:
471+
node = self.handle.createGroup(group, key)
472+
else:
473+
node = getattr(group, key)
474+
self._write_sparse_frame(node, sdf)
475+
476+
def _read_sparse_panel(self, group, where=None):
    """Rebuild a SparsePanel from the children of ``group``.

    ``default_fill_value`` and ``default_kind`` are read from the group
    attributes with no fallback, so a missing attribute raises. The
    ``items`` index decides which ``sparse_frame_<item>`` child groups are
    read. ``where`` is accepted but not used by this method.
    """
    attrs = group._v_attrs
    default_fill_value = attrs.default_fill_value
    default_kind = attrs.default_kind
    items = self._read_index(group, 'items')

    sdict = {}
    for name in items:
        node = getattr(group, 'sparse_frame_%s' % name)
        sdict[name] = self._read_sparse_frame(node)

    return SparsePanel(sdict, items=items, default_kind=default_kind,
                       default_fill_value=default_fill_value)
488+
409489
def _write_frame(self, group, df):
410490
self._write_block_manager(group, df._data)
411491

@@ -474,21 +554,32 @@ def _read_wide_table(self, group, where=None):
474554
return self._read_panel_table(group, where)
475555

476556
def _write_index(self, group, key, index):
477-
if len(index) == 0:
478-
raise ValueError('Can not write empty structure, axis length was 0')
479-
480557
if isinstance(index, MultiIndex):
558+
if len(index) == 0:
559+
raise ValueError('Can not write empty structure, '
560+
'axis length was 0')
561+
481562
setattr(group._v_attrs, '%s_variety' % key, 'multi')
482563
self._write_multi_index(group, key, index)
564+
elif isinstance(index, BlockIndex):
565+
setattr(group._v_attrs, '%s_variety' % key, 'block')
566+
self._write_block_index(group, key, index)
567+
elif isinstance(index, IntIndex):
568+
setattr(group._v_attrs, '%s_variety' % key, 'sparseint')
569+
self._write_sparse_intindex(group, key, index)
483570
else:
571+
if len(index) == 0:
572+
raise ValueError('Can not write empty structure, '
573+
'axis length was 0')
574+
484575
setattr(group._v_attrs, '%s_variety' % key, 'regular')
485576
converted, kind, _ = _convert_index(index)
486577
self._write_array(group, key, converted)
487578
node = getattr(group, key)
488579
node._v_attrs.kind = kind
489580
node._v_attrs.name = index.name
490581

491-
if isinstance(index, (DatetimeIndex, PeriodIndex)):
582+
if isinstance(index, (DatetimeIndex, PeriodIndex, IntIndex)):
492583
node._v_attrs.index_class = type(index)
493584

494585
if hasattr(index, 'freq'):
@@ -499,12 +590,36 @@ def _read_index(self, group, key):
499590

500591
if variety == 'multi':
501592
return self._read_multi_index(group, key)
593+
elif variety == 'block':
594+
return self._read_block_index(group, key)
595+
elif variety == 'sparseint':
596+
return self._read_sparse_intindex(group, key)
502597
elif variety == 'regular':
503598
_, index = self._read_index_node(getattr(group, key))
504599
return index
505600
else: # pragma: no cover
506601
raise Exception('unrecognized index variety: %s' % variety)
507602

603+
def _write_block_index(self, group, key, index):
604+
self._write_array(group, '%s_blocs' % key, index.blocs)
605+
self._write_array(group, '%s_blengths' % key, index.blengths)
606+
setattr(group._v_attrs, '%s_length' % key, index.length)
607+
608+
def _read_block_index(self, group, key):
    """Rebuild a BlockIndex from the ``<key>_*`` entries under ``group``.

    Reads the ``<key>_length`` group attribute and the ``<key>_blocs`` and
    ``<key>_blengths`` arrays, i.e. what ``_write_block_index`` stores.
    """
    length = getattr(group._v_attrs, '%s_length' % key)
    blocs = _read_array(group, '%s_blocs' % key)
    blengths = _read_array(group, '%s_blengths' % key)
    return BlockIndex(length, blocs, blengths)
613+
614+
def _write_sparse_intindex(self, group, key, index):
615+
self._write_array(group, '%s_indices' % key, index.indices)
616+
setattr(group._v_attrs, '%s_length' % key, index.length)
617+
618+
def _read_sparse_intindex(self, group, key):
    """Rebuild an IntIndex from the ``<key>_*`` entries under ``group``.

    Reads the ``<key>_length`` group attribute and the ``<key>_indices``
    array, i.e. what ``_write_sparse_intindex`` stores.
    """
    length = getattr(group._v_attrs, '%s_length' % key)
    indices = _read_array(group, '%s_indices' % key)
    return IntIndex(length, indices)
622+
508623
def _write_multi_index(self, group, key, index):
509624
setattr(group._v_attrs, '%s_nlevels' % key, index.nlevels)
510625

pandas/io/tests/test_pytables.py

+50-3
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
_default_compressor = LooseVersion(tables.__version__) >= '2.2' \
2525
and 'blosc' or 'zlib'
2626

27-
class TesttHDFStore(unittest.TestCase):
27+
class TestHDFStore(unittest.TestCase):
2828
path = '__test__.h5'
2929
scratchpath = '__scratch__.h5'
3030

@@ -201,6 +201,53 @@ def test_series(self):
201201
ts = tm.makeTimeSeries()
202202
self._check_roundtrip(ts, tm.assert_series_equal)
203203

204+
def test_sparse_series(self):
    """Round-trip sparse series through the store, checking the type too.

    Three conversions of the same series (with two values set to NaN) are
    exercised: default arguments, ``kind='integer'`` and ``fill_value=0``.
    """
    s = tm.makeStringSeries()
    s[3:5] = np.nan

    # Default to_sparse() arguments.
    ss = s.to_sparse()
    self._check_roundtrip(ss, tm.assert_series_equal,
                          check_series_type=True)

    # Integer sparse index instead of the default kind.
    ss2 = s.to_sparse(kind='integer')
    self._check_roundtrip(ss2, tm.assert_series_equal,
                          check_series_type=True)

    # Explicit fill value.
    ss3 = s.to_sparse(fill_value=0)
    self._check_roundtrip(ss3, tm.assert_series_equal,
                          check_series_type=True)
218+
219+
def test_sparse_frame(self):
    """Round-trip sparse frames through the store, checking the type too.

    Two regions of a dense frame are set to NaN, then three conversions
    are exercised: default arguments, ``kind='integer'`` and
    ``fill_value=0``.
    """
    s = tm.makeDataFrame()
    s.ix[3:5, 1:3] = np.nan
    s.ix[8:10, -2] = np.nan

    # Default to_sparse() arguments.
    ss = s.to_sparse()
    self._check_roundtrip(ss, tm.assert_frame_equal,
                          check_frame_type=True)

    # Integer sparse index instead of the default kind.
    ss2 = s.to_sparse(kind='integer')
    self._check_roundtrip(ss2, tm.assert_frame_equal,
                          check_frame_type=True)

    # Explicit fill value.
    ss3 = s.to_sparse(fill_value=0)
    self._check_roundtrip(ss3, tm.assert_frame_equal,
                          check_frame_type=True)
234+
235+
def test_sparse_panel(self):
    """Round-trip sparse panels through the store, checking the type too.

    A three-item panel is converted with default arguments, with
    ``kind='integer'`` and with ``fill_value=0``.
    """
    items = ['x', 'y', 'z']
    p = Panel({i: tm.makeDataFrame() for i in items})

    # Default to_sparse() arguments.
    sp = p.to_sparse()
    self._check_roundtrip(sp, tm.assert_panel_equal,
                          check_panel_type=True)

    # Integer sparse index instead of the default kind.
    sp2 = p.to_sparse(kind='integer')
    self._check_roundtrip(sp2, tm.assert_panel_equal,
                          check_panel_type=True)

    # Explicit fill value.
    sp3 = p.to_sparse(fill_value=0)
    self._check_roundtrip(sp3, tm.assert_panel_equal,
                          check_panel_type=True)
250+
204251
def test_float_index(self):
205252
# GH #454
206253
index = np.random.randn(10)
@@ -486,7 +533,7 @@ def test_select_filter_corner(self):
486533
result = self.store.select('frame', [crit])
487534
tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])
488535

489-
def _check_roundtrip(self, obj, comparator, compression=False):
536+
def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
490537
options = {}
491538
if compression:
492539
options['complib'] = _default_compressor
@@ -495,7 +542,7 @@ def _check_roundtrip(self, obj, comparator, compression=False):
495542
try:
496543
store['obj'] = obj
497544
retrieved = store['obj']
498-
comparator(retrieved, obj)
545+
comparator(retrieved, obj, **kwargs)
499546
finally:
500547
store.close()
501548
os.remove(self.scratchpath)

pandas/util/testing.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,10 @@ def assert_dict_equal(a, b, compare_keys=True):
119119

120120
def assert_series_equal(left, right, check_dtype=True,
121121
check_index_type=False,
122-
check_index_freq=False):
122+
check_index_freq=False,
123+
check_series_type=False):
124+
if check_series_type:
125+
assert(type(left) == type(right))
123126
assert_almost_equal(left.values, right.values)
124127
if check_dtype:
125128
assert(left.dtype == right.dtype)
@@ -133,7 +136,10 @@ def assert_series_equal(left, right, check_dtype=True,
133136
getattr(right, 'freqstr', None))
134137

135138
def assert_frame_equal(left, right, check_index_type=False,
136-
check_column_type=False):
139+
check_column_type=False,
140+
check_frame_type=False):
141+
if check_frame_type:
142+
assert(type(left) == type(right))
137143
assert(isinstance(left, DataFrame))
138144
assert(isinstance(right, DataFrame))
139145
for col, series in left.iterkv():
@@ -152,7 +158,10 @@ def assert_frame_equal(left, right, check_index_type=False,
152158
assert(left.columns.dtype == right.columns.dtype)
153159
assert(left.columns.inferred_type == right.columns.inferred_type)
154160

155-
def assert_panel_equal(left, right):
161+
def assert_panel_equal(left, right, check_panel_type=False):
162+
if check_panel_type:
163+
assert(type(left) == type(right))
164+
156165
assert(left.items.equals(right.items))
157166
assert(left.major_axis.equals(right.major_axis))
158167
assert(left.minor_axis.equals(right.minor_axis))

0 commit comments

Comments
 (0)