Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit c9a9e3e

Browse files
committedMay 4, 2013
ENH: added support for Panel,SparseSeries,SparseDataFrame,SparsePanel,IntIndex,BlockIndex
1 parent 4870ad9 commit c9a9e3e

File tree

4 files changed

+156
-70
lines changed

4 files changed

+156
-70
lines changed
 

‎doc/source/io.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1008,6 +1008,21 @@ You can pass a list of objects and you will receive them back on deserialization
10081008
pd.to_msgpack('foo.msg', df, 'foo', np.array([1,2,3]), s)
10091009
pd.read_msgpack('foo.msg')
10101010
1011+
You can pass ``iterator=True`` to iterate over the unpacked results
1012+
1013+
.. ipython:: python
1014+
1015+
for o in pd.read_msgpack('foo.msg',iterator=True):
1016+
print o
1017+
1018+
1019+
You can pass ``append=True`` to the writer to append to an existing pack
1020+
1021+
.. ipython:: python
1022+
1023+
df.to_msgpack('foo.msg',append=True)
1024+
pd.read_msgpack('foo.msg')
1025+
10111026
.. ipython:: python
10121027
:suppress:
10131028
:okexcept:

‎doc/source/v0.12.0.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,13 @@ Enhancements
2424
pd.to_msgpack('foo.msg', df, s)
2525
pd.read_msgpack('foo.msg')
2626

27+
You can pass ``iterator=True`` to iterate over the unpacked results
28+
29+
.. ipython:: python
30+
31+
for o in pd.read_msgpack('foo.msg',iterator=True):
32+
print o
33+
2734
.. ipython:: python
2835
:suppress:
2936
:okexcept:

‎pandas/io/packers.py

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
from pandas.tseries.api import PeriodIndex, DatetimeIndex
5858
from pandas.core.index import Int64Index, _ensure_index
5959
import pandas.core.common as com
60+
from pandas.core.generic import NDFrame
6061
from pandas.core.common import needs_i8_conversion
6162
from pandas.core.internals import BlockManager, make_block
6263
import pandas.core.internals as internals
@@ -162,6 +163,7 @@ def encode(obj):
162163
Data encoder
163164
"""
164165

166+
tobj = type(obj)
165167
if isinstance(obj, Index):
166168
if isinstance(obj, PeriodIndex):
167169
return {'typ' : 'period_index',
@@ -191,25 +193,47 @@ def encode(obj):
191193
'data': obj.tolist() }
192194
elif isinstance(obj, Series):
193195
if isinstance(obj, SparseSeries):
194-
import pdb; pdb.set_trace()
196+
d = {'typ' : 'sparse_series',
197+
'klass' : obj.__class__.__name__,
198+
'dtype': obj.dtype.name,
199+
'index' : obj.index,
200+
'sp_index' : obj.sp_index,
201+
'sp_values' : convert(obj.sp_values)}
202+
for f in ['name','fill_value','kind']:
203+
d[f] = getattr(obj,f,None)
204+
return d
195205
else:
196206
return {'typ' : 'series',
197207
'klass' : obj.__class__.__name__,
198208
'name' : getattr(obj,'name',None),
199209
'index' : obj.index,
200210
'dtype': obj.dtype.name,
201211
'data': convert(obj.values) }
202-
elif isinstance(obj, DataFrame):
212+
elif issubclass(tobj, NDFrame):
203213
if isinstance(obj, SparseDataFrame):
204-
import pdb; pdb.set_trace()
214+
d = {'typ' : 'sparse_dataframe',
215+
'klass' : obj.__class__.__name__,
216+
'columns' : obj.columns }
217+
for f in ['default_fill_value','default_kind']:
218+
d[f] = getattr(obj,f,None)
219+
d['data'] = dict([ (name,ss) for name,ss in obj.iteritems() ])
220+
return d
221+
elif isinstance(obj, SparsePanel):
222+
d = {'typ' : 'sparse_panel',
223+
'klass' : obj.__class__.__name__,
224+
'items' : obj.items }
225+
for f in ['default_fill_value','default_kind']:
226+
d[f] = getattr(obj,f,None)
227+
d['data'] = dict([ (name,df) for name,df in obj.iteritems() ])
228+
return d
205229
else:
206230

207231
data = obj._data
208232
if not data.is_consolidated():
209233
data = data.consolidate()
210234

211235
# the block manager
212-
return {'typ' : 'dataframe',
236+
return {'typ' : 'block_manager',
213237
'klass' : obj.__class__.__name__,
214238
'axes' : data.axes,
215239
'blocks' : [ { 'items' : b.items,
@@ -237,6 +261,17 @@ def encode(obj):
237261
return {'typ' : 'period',
238262
'ordinal' : obj.ordinal,
239263
'freq' : obj.freq }
264+
elif isinstance(obj, BlockIndex):
265+
return { 'typ' : 'block_index',
266+
'klass' : obj.__class__.__name__,
267+
'blocs' : obj.blocs,
268+
'blengths' : obj.blengths,
269+
'length' : obj.length }
270+
elif isinstance(obj, IntIndex):
271+
return { 'typ' : 'int_index',
272+
'klass' : obj.__class__.__name__,
273+
'indices' : obj.indices,
274+
'length' : obj.length }
240275
elif isinstance(obj, np.ndarray):
241276
return {'typ' : 'ndarray',
242277
'shape': obj.shape,
@@ -288,7 +323,7 @@ def decode(obj):
288323
dtype = dtype_for(obj['dtype'])
289324
index = obj['index']
290325
return globals()[obj['klass']](obj['data'],index=index,dtype=dtype,name=obj['name'])
291-
elif typ == 'dataframe':
326+
elif typ == 'block_manager':
292327
axes = obj['axes']
293328

294329
def create_block(b):
@@ -300,6 +335,20 @@ def create_block(b):
300335
elif typ == 'datetime':
301336
import pdb; pdb.set_trace()
302337
return datetime.fromtimestamp(obj['data'])
338+
elif typ == 'sparse_series':
339+
dtype = dtype_for(obj['dtype'])
340+
return globals()[obj['klass']](np.array(obj['sp_values'],dtype=dtype),sparse_index=obj['sp_index'],
341+
index=obj['index'],fill_value=obj['fill_value'],kind=obj['kind'],name=obj['name'])
342+
elif typ == 'sparse_dataframe':
343+
return globals()[obj['klass']](obj['data'],
344+
columns=obj['columns'],default_fill_value=obj['default_fill_value'],default_kind=obj['default_kind'])
345+
elif typ == 'sparse_panel':
346+
return globals()[obj['klass']](obj['data'],
347+
items=obj['items'],default_fill_value=obj['default_fill_value'],default_kind=obj['default_kind'])
348+
elif typ == 'block_index':
349+
return globals()[obj['klass']](obj['length'],obj['blocs'],obj['blengths'])
350+
elif typ == 'int_index':
351+
return globals()[obj['klass']](obj['length'],obj['indices'])
303352
elif typ == 'ndarray':
304353
return np.array(obj['data'],
305354
dtype=np.typeDict[obj['dtype']],

‎pandas/io/tests/test_packers.py

Lines changed: 80 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
from pandas.util.testing import ensure_clean
1515
from pandas.tests.test_series import assert_series_equal
1616
from pandas.tests.test_frame import assert_frame_equal
17+
from pandas.tests.test_panel import assert_panel_equal
18+
19+
import pandas
20+
from pandas.sparse.tests.test_sparse import assert_sp_series_equal, assert_sp_frame_equal
1721
from pandas import concat, Timestamp, tslib
1822

1923
from numpy.testing.decorators import slow
@@ -32,6 +36,8 @@ def check_arbitrary(a, b):
3236
assert(len(a) == len(b))
3337
for a_, b_ in zip(a,b):
3438
check_arbitrary(a_,b_)
39+
elif isinstance(a,Panel):
40+
assert_panel_equal(a,b)
3541
elif isinstance(a,DataFrame):
3642
assert_frame_equal(a,b)
3743
elif isinstance(a,Series):
@@ -225,10 +231,10 @@ def test_basic(self):
225231
i_rec = self.encode_decode(i)
226232
assert_series_equal(i,i_rec)
227233

228-
class TestFrame(Test):
234+
class TestNDFrame(Test):
229235

230236
def setUp(self):
231-
super(TestFrame, self).setUp()
237+
super(TestNDFrame, self).setUp()
232238

233239
data = {
234240
'A': [0., 1., 2., 3., np.nan],
@@ -238,98 +244,107 @@ def setUp(self):
238244
'E' : [0., 1, Timestamp('20100101'),'foo',2.],
239245
}
240246

241-
self.d = { 'float' : DataFrame(dict(A = data['A'], B = Series(data['A']) + 1)),
242-
'int' : DataFrame(dict(A = data['B'], B = Series(data['B']) + 1)),
243-
'mixed' : DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])) }
247+
self.frame = { 'float' : DataFrame(dict(A = data['A'], B = Series(data['A']) + 1)),
248+
'int' : DataFrame(dict(A = data['B'], B = Series(data['B']) + 1)),
249+
'mixed' : DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])) }
250+
251+
self.panel = { 'float' : Panel(dict(ItemA = self.frame['float'], ItemB = self.frame['float']+1)) }
244252

245-
def test_basic(self):
253+
def test_basic_frame(self):
246254

247-
for s, i in self.d.items():
255+
for s, i in self.frame.items():
248256
i_rec = self.encode_decode(i)
249257
assert_frame_equal(i,i_rec)
250258

259+
def test_basic_panel(self):
260+
261+
for s, i in self.panel.items():
262+
i_rec = self.encode_decode(i)
263+
assert_panel_equal(i,i_rec)
264+
251265
def test_multi(self):
252266

253-
i_rec = self.encode_decode(self.d)
254-
for k in self.d.keys():
255-
assert_frame_equal(self.d[k],i_rec[k])
267+
i_rec = self.encode_decode(self.frame)
268+
for k in self.frame.keys():
269+
assert_frame_equal(self.frame[k],i_rec[k])
256270

257-
l = tuple([ self.d['float'], self.d['float'].A, self.d['float'].B, None ])
271+
l = tuple([ self.frame['float'], self.frame['float'].A, self.frame['float'].B, None ])
258272
l_rec = self.encode_decode(l)
259273
check_arbitrary(l,l_rec)
260274

261275
# this is an oddity in that packed lists will be returned as tuples
262-
l = [ self.d['float'], self.d['float'].A, self.d['float'].B, None ]
276+
l = [ self.frame['float'], self.frame['float'].A, self.frame['float'].B, None ]
263277
l_rec = self.encode_decode(l)
264278
self.assert_(isinstance(l_rec,tuple))
265279
check_arbitrary(l,l_rec)
266280

267281
def test_iterator(self):
268282

269-
l = [ self.d['float'], self.d['float'].A, self.d['float'].B, None ]
283+
l = [ self.frame['float'], self.frame['float'].A, self.frame['float'].B, None ]
270284

271285
with ensure_clean(self.path) as path:
272286
to_msgpack(path,*l)
273287
for i, packed in enumerate(read_msgpack(path, iterator=True)):
274288
check_arbitrary(packed,l[i])
275289

276-
def _create_sp_series():
290+
class TestSparse(Test):
277291

278-
# nan-based
279-
arr = np.arange(15, dtype=float)
280-
index = np.arange(15)
281-
arr[7:12] = nan
282-
arr[-1:] = nan
292+
def _check_roundtrip(self, obj, comparator, **kwargs):
283293

284-
date_index = bdate_range('1/1/2011', periods=len(index))
285-
bseries = SparseSeries(arr, index=index, kind='block')
286-
bseries.name = 'bseries'
287-
return bseries
294+
i_rec = self.encode_decode(obj)
295+
comparator(obj,i_rec,**kwargs)
288296

289-
def _create_sp_frame():
297+
def test_sparse_series(self):
290298

291-
data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
292-
'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
293-
'C': np.arange(10),
294-
'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}
295-
296-
dates = bdate_range('1/1/2011', periods=10)
297-
return SparseDataFrame(data, index=dates)
299+
s = tm.makeStringSeries()
300+
s[3:5] = np.nan
301+
ss = s.to_sparse()
302+
self._check_roundtrip(ss, tm.assert_series_equal,
303+
check_series_type=True)
304+
305+
ss2 = s.to_sparse(kind='integer')
306+
self._check_roundtrip(ss2, tm.assert_series_equal,
307+
check_series_type=True)
308+
309+
ss3 = s.to_sparse(fill_value=0)
310+
self._check_roundtrip(ss3, tm.assert_series_equal,
311+
check_series_type=True)
312+
313+
def test_sparse_frame(self):
314+
315+
s = tm.makeDataFrame()
316+
s.ix[3:5, 1:3] = np.nan
317+
s.ix[8:10, -2] = np.nan
318+
ss = s.to_sparse()
319+
320+
self._check_roundtrip(ss, tm.assert_frame_equal,
321+
check_frame_type=True)
322+
323+
ss2 = s.to_sparse(kind='integer')
324+
self._check_roundtrip(ss2, tm.assert_frame_equal,
325+
check_frame_type=True)
326+
327+
ss3 = s.to_sparse(fill_value=0)
328+
self._check_roundtrip(ss3, tm.assert_frame_equal,
329+
check_frame_type=True)
330+
331+
def test_sparse_panel(self):
332+
333+
items = ['x', 'y', 'z']
334+
p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items))
335+
sp = p.to_sparse()
336+
337+
self._check_roundtrip(sp, tm.assert_panel_equal,
338+
check_panel_type=True)
339+
340+
sp2 = p.to_sparse(kind='integer')
341+
self._check_roundtrip(sp2, tm.assert_panel_equal,
342+
check_panel_type=True)
343+
344+
sp3 = p.to_sparse(fill_value=0)
345+
self._check_roundtrip(sp3, tm.assert_panel_equal,
346+
check_panel_type=True)
298347

299-
def create_data():
300-
""" create the pickle data """
301-
302-
data = {
303-
'A': [0., 1., 2., 3., np.nan],
304-
'B': [0, 1, 0, 1, 0],
305-
'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
306-
'D': date_range('1/1/2009', periods=5),
307-
'E' : [0., 1, Timestamp('20100101'),'foo',2.],
308-
}
309-
310-
index = dict(int = Index(np.arange(10)),
311-
date = date_range('20130101',periods=10))
312-
mi = dict(reg = MultiIndex.from_tuples(zip([['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
313-
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]),
314-
names=['first', 'second']))
315-
series = dict(float = Series(data['A']),
316-
int = Series(data['B']),
317-
mixed = Series(data['E']))
318-
frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
319-
int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)),
320-
mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])))
321-
panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)))
322-
323-
324-
325-
return dict( series = series,
326-
frame = frame,
327-
panel = panel,
328-
index = index,
329-
mi = mi,
330-
sp_series = dict(float = _create_sp_series()),
331-
sp_frame = dict(float = _create_sp_frame())
332-
)
333348

334349
if __name__ == '__main__':
335350
import nose

0 commit comments

Comments
 (0)
Please sign in to comment.