Skip to content

Commit 17151ca

Browse files
committed
ENH: first cut at SparseList data structure, #436
1 parent 3e87592 commit 17151ca

File tree

6 files changed

+240
-26
lines changed

6 files changed

+240
-26
lines changed

pandas/sparse/api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from pandas.sparse.array import SparseArray
2+
from pandas.sparse.list import SparseList
23
from pandas.sparse.series import SparseSeries
34
from pandas.sparse.frame import SparseDataFrame
45
from pandas.sparse.panel import SparsePanel

pandas/sparse/array.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ class SparseArray(np.ndarray):
105105
sp_index = None
106106
fill_value = None
107107

108-
def __new__(cls, data, sparse_index=None, kind='block', fill_value=None,
108+
def __new__(cls, data, sparse_index=None, kind='integer', fill_value=None,
109109
copy=False):
110110

111111
is_sparse_array = isinstance(data, SparseArray)

pandas/sparse/list.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import numpy as np
2+
3+
from pandas.sparse.array import SparseArray
4+
import pandas._sparse as splib
5+
6+
class SparseList(object):
    """
    Data structure for accumulating data to be converted into a
    SparseArray. Has similar API to the standard Python list

    Every ``append`` call stores its data as one SparseArray chunk; the
    chunks are merged into a single SparseArray by ``consolidate`` or
    ``to_array``.

    Parameters
    ----------
    data : scalar or array-like, optional
        Initial data, passed to ``append`` when it is not None
    fill_value : scalar, default NaN
        Fill value handed to every SparseArray this list creates
    """

    def __init__(self, data=None, fill_value=np.nan):
        self.fill_value = fill_value
        self._chunks = []

        if data is not None:
            self.append(data)

    def __repr__(self):
        # Default object repr on the first line, then one chunk repr per line
        contents = '\n'.join(repr(c) for c in self._chunks)
        return '%s\n%s' % (object.__repr__(self), contents)

    def __len__(self):
        # Total number of elements across all chunks
        return sum(len(c) for c in self._chunks)

    def __getitem__(self, i):
        """
        Return the element at position i, counting across all chunks

        Parameters
        ----------
        i : int
            Position; negative values count from the end of the list

        Raises
        ------
        ValueError
            If a negative i reaches before the start of the list
        IndexError
            If i is at or past the end of the list
        """
        if i < 0:
            length = len(self)
            if i + length < 0:  # pragma: no cover
                # ValueError (not IndexError) is kept for backward
                # compatibility with existing callers
                raise ValueError('%d out of range' % i)
            i += length

        # Walk the chunks, tracking how many elements precede the current one
        passed = 0
        for chunk in self._chunks:
            chunk_length = len(chunk)
            if i < passed + chunk_length:
                return chunk[i - passed]
            passed += chunk_length

        # Ran off the end: raise IndexError explicitly so that the old-style
        # iteration protocol (and callers) see a meaningful message
        raise IndexError('%d out of range' % i)

    def __setitem__(self, i, value):
        # Item assignment is not supported
        raise NotImplementedError

    @property
    def nchunks(self):
        """Number of chunks currently stored"""
        return len(self._chunks)

    @property
    def is_consolidated(self):
        """True when the data is held in exactly one chunk"""
        return self.nchunks == 1

    def consolidate(self, inplace=True):
        """
        Internally consolidate chunks of data

        Parameters
        ----------
        inplace : boolean, default True
            Modify the calling object instead of constructing a new one

        Returns
        -------
        splist : SparseList
            If inplace=False, new object, otherwise reference to existing
            object
        """
        if not inplace:
            result = self.copy()
        else:
            result = self

        # Zero or one chunk: nothing to merge. An empty list must not reach
        # _consolidate_inplace, because np.concatenate rejects an empty
        # sequence of arrays
        if result.nchunks <= 1:
            return result

        result._consolidate_inplace()
        return result

    def _consolidate_inplace(self):
        # Merge the stored values and the sparse indexes of every chunk into
        # one SparseArray, which then becomes the only chunk
        new_values = np.concatenate([c.sp_values for c in self._chunks])
        new_index = _concat_sparse_indexes([c.sp_index for c in self._chunks])
        new_arr = SparseArray(new_values, sparse_index=new_index,
                              fill_value=self.fill_value)
        self._chunks = [new_arr]

    def copy(self):
        """
        Return copy of the list

        The chunk list is copied; the chunk objects themselves are shared
        with the original.

        Returns
        -------
        new_list : SparseList
        """
        new_splist = SparseList(fill_value=self.fill_value)
        new_splist._chunks = list(self._chunks)
        return new_splist

    def to_array(self):
        """
        Return SparseArray from data stored in the SparseList

        The list is consolidated in place as a side effect.

        Returns
        -------
        sparr : SparseArray
        """
        self.consolidate(inplace=True)
        if not self._chunks:
            # Nothing was ever appended: build the result from an empty list
            return SparseArray([], fill_value=self.fill_value)
        return self._chunks[0]

    def append(self, value):
        """
        Append a scalar or an array-like of values as a new chunk

        Parameters
        ----------
        value : scalar or array-like
            A scalar is wrapped in a one-element list before conversion
        """
        if np.isscalar(value):
            value = [value]

        sparr = SparseArray(value, fill_value=self.fill_value)
        self._chunks.append(sparr)
113+
114+
115+
def _concat_sparse_indexes(indexes):
    """
    Combine several sparse indexes, laid end to end, into one IntIndex

    Parameters
    ----------
    indexes : iterable of sparse index objects
        Each must provide ``to_int_index()`` and ``length``

    Returns
    -------
    index : splib.IntIndex
        Its length is the sum of the input lengths
    """
    offset = 0
    shifted = []

    for sp_index in indexes:
        # positions in this index are shifted by the combined length of all
        # the indexes that came before it
        shifted.append(sp_index.to_int_index().indices + offset)
        offset += sp_index.length

    return splib.IntIndex(offset, np.concatenate(shifted))

pandas/sparse/tests/test_list.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import unittest
2+
3+
from numpy import nan
4+
import numpy as np
5+
6+
from pandas.sparse.api import SparseList, SparseArray
7+
from pandas.util.testing import assert_almost_equal
8+
9+
from test_sparse import assert_sp_array_equal
10+
11+
12+
class TestSparseList(unittest.TestCase):
    # Each test feeds the fixture data to a SparseList in pieces (slices and
    # a single scalar) and checks the result against the fixture itself.
    # Explanations are kept in comments rather than docstrings so that the
    # test names shown by verbose runners stay unchanged.

    def setUp(self):
        self.na_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6])
        self.zero_data = np.array([0, 0, 1, 2, 3, 0, 4, 5, 0, 6])

    def test_append_na(self):
        # default fill value: appended pieces must equal SparseArray(arr)
        arr = self.na_data
        splist = SparseList()
        splist.append(arr[:5])
        splist.append(arr[5])
        splist.append(arr[6:])

        sparr = splist.to_array()
        assert_sp_array_equal(sparr, SparseArray(arr))

    def test_append_zero(self):
        # same as test_append_na, with fill_value=0
        arr = self.zero_data
        splist = SparseList(fill_value=0)
        splist.append(arr[:5])
        splist.append(arr[5])
        splist.append(arr[6:])

        sparr = splist.to_array()
        assert_sp_array_equal(sparr, SparseArray(arr, fill_value=0))

    def test_consolidate(self):
        arr = self.na_data
        exp_sparr = SparseArray(arr)

        splist = SparseList()
        splist.append(arr[:5])
        splist.append(arr[5])
        splist.append(arr[6:])

        # inplace=False must leave the original's three chunks untouched
        consol = splist.consolidate(inplace=False)
        self.assertEqual(consol.nchunks, 1)
        self.assertEqual(splist.nchunks, 3)
        assert_sp_array_equal(consol.to_array(), exp_sparr)

        # default is in place
        splist.consolidate()
        self.assertEqual(splist.nchunks, 1)
        assert_sp_array_equal(splist.to_array(), exp_sparr)

    def test_copy(self):
        arr = self.na_data
        exp_sparr = SparseArray(arr)

        splist = SparseList()
        splist.append(arr[:5])
        splist.append(arr[5])

        # appending to the copy must not add a chunk to the original
        cp = splist.copy()
        cp.append(arr[6:])
        self.assertEqual(splist.nchunks, 2)
        assert_sp_array_equal(cp.to_array(), exp_sparr)

    def test_getitem(self):
        arr = self.na_data
        splist = SparseList()
        splist.append(arr[:5])
        splist.append(arr[5])
        splist.append(arr[6:])

        # element lookup has to cross chunk boundaries
        for i, expected in enumerate(arr):
            assert_almost_equal(splist[i], expected)
78+
79+
if __name__ == '__main__':
    # Executed as a script: hand this module to nose's runner. exit=False is
    # passed so the call is not asked to exit the interpreter itself.
    import nose

    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)

pandas/sparse/tests/test_sparse.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,17 @@ def _test_data2_zero():
6060
return arr, index
6161

6262
def assert_sp_series_equal(a, b):
63-
assert_equal(a.sp_values, b.sp_values)
64-
assert(a.sp_index.equals(b.sp_index))
65-
if np.isnan(a.fill_value):
66-
assert(np.isnan(b.fill_value))
63+
assert(a.index.equals(b.index))
64+
assert_sp_array_equal(a, b)
65+
66+
def assert_sp_array_equal(left, right):
67+
assert_almost_equal(left.sp_values, right.sp_values)
68+
assert(left.sp_index.equals(right.sp_index))
69+
if np.isnan(left.fill_value):
70+
assert(np.isnan(right.fill_value))
6771
else:
68-
assert(a.fill_value == b.fill_value)
72+
assert(left.fill_value == right.fill_value)
73+
6974

7075
def assert_sp_frame_equal(left, right, exact_indices=True):
7176
"""

0 commit comments

Comments
 (0)