Skip to content

Commit 3ff5af0

Browse files
sinhrks authored and jreback committed
PERF: Sparse IntIndex.make_union / Numeric ops
Author: sinhrks <[email protected]>

Closes pandas-dev#13036 from sinhrks/sparse_make_union and squashes the following commits:

b1cf4b5 [sinhrks] PERF: Sparse IntIndex.make_union
1 parent 3e9c320 commit 3ff5af0

File tree

4 files changed

+188
-151
lines changed

4 files changed

+188
-151
lines changed

asv_bench/benchmarks/sparse.py

+40 -1
Original file line number | Diff line number | Diff line change
@@ -52,4 +52,43 @@ def setup(self):
5252
self.ss = self.s.to_sparse()
5353

5454
def time_sparse_series_to_coo(self):
55-
self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)
55+
self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)
56+
57+
58+
class sparse_arithmetic(object):
59+
goal_time = 0.2
60+
61+
def setup(self):
62+
np.random.seed(1)
63+
self.a_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan)
64+
self.b_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan)
65+
66+
self.a_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0)
67+
self.b_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0)
68+
69+
self.a_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan)
70+
self.b_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan)
71+
72+
def make_sparse_array(self, length, dense_size, fill_value):
73+
arr = np.array([fill_value] * length, dtype=np.float64)
74+
indexer = np.unique(np.random.randint(0, length, dense_size))
75+
arr[indexer] = np.random.randint(0, 100, len(indexer))
76+
return pd.SparseArray(arr, fill_value=fill_value)
77+
78+
def time_sparse_addition_10percent(self):
79+
self.a_10percent + self.b_10percent
80+
81+
def time_sparse_addition_10percent_zero(self):
82+
self.a_10percent_zero + self.b_10percent_zero
83+
84+
def time_sparse_addition_1percent(self):
85+
self.a_1percent + self.b_1percent
86+
87+
def time_sparse_division_10percent(self):
88+
self.a_10percent / self.b_10percent
89+
90+
def time_sparse_division_10percent_zero(self):
91+
self.a_10percent_zero / self.b_10percent_zero
92+
93+
def time_sparse_division_1percent(self):
94+
self.a_1percent / self.b_1percent

doc/source/whatsnew/v0.18.1.txt

+1
Original file line number | Diff line number | Diff line change
@@ -507,6 +507,7 @@ Performance Improvements
507507
- Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`).
508508
- Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`)
509509
- Improved performance of ``to_numeric`` if input is numeric dtype (:issue:`12777`)
510+
- Improved performance of sparse arithmetic with ``IntIndex`` (:issue:`13036`)
510511

511512

512513

pandas/sparse/tests/test_libsparse.py

+141 -111
Original file line number | Diff line number | Diff line change
@@ -43,117 +43,147 @@ def _check_case_dict(case):
4343
_check_case([], [], [], [], [], [])
4444

4545

46-
def test_index_make_union():
47-
def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
48-
xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
49-
yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
50-
bresult = xindex.make_union(yindex)
51-
assert (isinstance(bresult, BlockIndex))
52-
assert_equal(bresult.blocs, eloc)
53-
assert_equal(bresult.blengths, elen)
54-
55-
ixindex = xindex.to_int_index()
56-
iyindex = yindex.to_int_index()
57-
iresult = ixindex.make_union(iyindex)
58-
assert (isinstance(iresult, IntIndex))
59-
assert_equal(iresult.indices, bresult.to_int_index().indices)
60-
61-
"""
62-
x: ----
63-
y: ----
64-
r: --------
65-
"""
66-
xloc = [0]
67-
xlen = [5]
68-
yloc = [5]
69-
ylen = [4]
70-
eloc = [0]
71-
elen = [9]
72-
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
73-
"""
74-
x: ----- -----
75-
y: ----- --
76-
"""
77-
xloc = [0, 10]
78-
xlen = [5, 5]
79-
yloc = [2, 17]
80-
ylen = [5, 2]
81-
eloc = [0, 10, 17]
82-
elen = [7, 5, 2]
83-
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
84-
"""
85-
x: ------
86-
y: -------
87-
r: ----------
88-
"""
89-
xloc = [1]
90-
xlen = [5]
91-
yloc = [3]
92-
ylen = [5]
93-
eloc = [1]
94-
elen = [7]
95-
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
96-
"""
97-
x: ------ -----
98-
y: -------
99-
r: -------------
100-
"""
101-
xloc = [2, 10]
102-
xlen = [4, 4]
103-
yloc = [4]
104-
ylen = [8]
105-
eloc = [2]
106-
elen = [12]
107-
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
108-
"""
109-
x: --- -----
110-
y: -------
111-
r: -------------
112-
"""
113-
xloc = [0, 5]
114-
xlen = [3, 5]
115-
yloc = [0]
116-
ylen = [7]
117-
eloc = [0]
118-
elen = [10]
119-
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
120-
"""
121-
x: ------ -----
122-
y: ------- ---
123-
r: -------------
124-
"""
125-
xloc = [2, 10]
126-
xlen = [4, 4]
127-
yloc = [4, 13]
128-
ylen = [8, 4]
129-
eloc = [2]
130-
elen = [15]
131-
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
132-
"""
133-
x: ----------------------
134-
y: ---- ---- ---
135-
r: ----------------------
136-
"""
137-
xloc = [2]
138-
xlen = [15]
139-
yloc = [4, 9, 14]
140-
ylen = [3, 2, 2]
141-
eloc = [2]
142-
elen = [15]
143-
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
144-
"""
145-
x: ---- ---
146-
y: --- ---
147-
"""
148-
xloc = [0, 10]
149-
xlen = [3, 3]
150-
yloc = [5, 15]
151-
ylen = [2, 2]
152-
eloc = [0, 5, 10, 15]
153-
elen = [3, 2, 3, 2]
154-
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
155-
156-
# TODO: different-length index objects
46+
class TestSparseIndexUnion(tm.TestCase):
47+
48+
def test_index_make_union(self):
49+
def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
50+
xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
51+
yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
52+
bresult = xindex.make_union(yindex)
53+
assert (isinstance(bresult, BlockIndex))
54+
assert_equal(bresult.blocs, eloc)
55+
assert_equal(bresult.blengths, elen)
56+
57+
ixindex = xindex.to_int_index()
58+
iyindex = yindex.to_int_index()
59+
iresult = ixindex.make_union(iyindex)
60+
assert (isinstance(iresult, IntIndex))
61+
assert_equal(iresult.indices, bresult.to_int_index().indices)
62+
63+
"""
64+
x: ----
65+
y: ----
66+
r: --------
67+
"""
68+
xloc = [0]
69+
xlen = [5]
70+
yloc = [5]
71+
ylen = [4]
72+
eloc = [0]
73+
elen = [9]
74+
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
75+
"""
76+
x: ----- -----
77+
y: ----- --
78+
"""
79+
xloc = [0, 10]
80+
xlen = [5, 5]
81+
yloc = [2, 17]
82+
ylen = [5, 2]
83+
eloc = [0, 10, 17]
84+
elen = [7, 5, 2]
85+
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
86+
"""
87+
x: ------
88+
y: -------
89+
r: ----------
90+
"""
91+
xloc = [1]
92+
xlen = [5]
93+
yloc = [3]
94+
ylen = [5]
95+
eloc = [1]
96+
elen = [7]
97+
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
98+
"""
99+
x: ------ -----
100+
y: -------
101+
r: -------------
102+
"""
103+
xloc = [2, 10]
104+
xlen = [4, 4]
105+
yloc = [4]
106+
ylen = [8]
107+
eloc = [2]
108+
elen = [12]
109+
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
110+
"""
111+
x: --- -----
112+
y: -------
113+
r: -------------
114+
"""
115+
xloc = [0, 5]
116+
xlen = [3, 5]
117+
yloc = [0]
118+
ylen = [7]
119+
eloc = [0]
120+
elen = [10]
121+
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
122+
"""
123+
x: ------ -----
124+
y: ------- ---
125+
r: -------------
126+
"""
127+
xloc = [2, 10]
128+
xlen = [4, 4]
129+
yloc = [4, 13]
130+
ylen = [8, 4]
131+
eloc = [2]
132+
elen = [15]
133+
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
134+
"""
135+
x: ----------------------
136+
y: ---- ---- ---
137+
r: ----------------------
138+
"""
139+
xloc = [2]
140+
xlen = [15]
141+
yloc = [4, 9, 14]
142+
ylen = [3, 2, 2]
143+
eloc = [2]
144+
elen = [15]
145+
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
146+
"""
147+
x: ---- ---
148+
y: --- ---
149+
"""
150+
xloc = [0, 10]
151+
xlen = [3, 3]
152+
yloc = [5, 15]
153+
ylen = [2, 2]
154+
eloc = [0, 5, 10, 15]
155+
elen = [3, 2, 3, 2]
156+
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
157+
158+
def test_intindex_make_union(self):
159+
a = IntIndex(5, np.array([0, 3, 4], dtype=np.int32))
160+
b = IntIndex(5, np.array([0, 2], dtype=np.int32))
161+
res = a.make_union(b)
162+
exp = IntIndex(5, np.array([0, 2, 3, 4], np.int32))
163+
self.assertTrue(res.equals(exp))
164+
165+
a = IntIndex(5, np.array([], dtype=np.int32))
166+
b = IntIndex(5, np.array([0, 2], dtype=np.int32))
167+
res = a.make_union(b)
168+
exp = IntIndex(5, np.array([0, 2], np.int32))
169+
self.assertTrue(res.equals(exp))
170+
171+
a = IntIndex(5, np.array([], dtype=np.int32))
172+
b = IntIndex(5, np.array([], dtype=np.int32))
173+
res = a.make_union(b)
174+
exp = IntIndex(5, np.array([], np.int32))
175+
self.assertTrue(res.equals(exp))
176+
177+
a = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
178+
b = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
179+
res = a.make_union(b)
180+
exp = IntIndex(5, np.array([0, 1, 2, 3, 4], np.int32))
181+
self.assertTrue(res.equals(exp))
182+
183+
a = IntIndex(5, np.array([0, 1], dtype=np.int32))
184+
b = IntIndex(4, np.array([0, 1], dtype=np.int32))
185+
with tm.assertRaises(ValueError):
186+
a.make_union(b)
157187

158188

159189
class TestSparseIndexCommon(tm.TestCase):

pandas/src/sparse.pyx

+6 -39
Original file line number | Diff line number | Diff line change
@@ -129,52 +129,19 @@ cdef class IntIndex(SparseIndex):
129129
return IntIndex(self.length, new_list)
130130

131131
cpdef IntIndex make_union(self, SparseIndex y_):
132-
cdef:
133-
Py_ssize_t out_length, i, xi, yi
134-
int32_t xind
135-
ndarray[int32_t, ndim=1] xindices, yindices
136-
list new_list = []
137-
IntIndex x, y
138132

139-
x = self
133+
cdef:
134+
ndarray[int32_t, ndim=1] new_indices
135+
IntIndex y
140136

141137
# if is one already, returns self
142138
y = y_.to_int_index()
143139

144140
if self.length != y.length:
145-
raise Exception('Indices must reference same underlying length')
146-
147-
xindices = self.indices
148-
yindices = y.indices
149-
150-
xi = yi = 0
151-
while True:
152-
if xi == x.npoints:
153-
while yi < y.npoints:
154-
new_list.append(yindices[yi])
155-
yi += 1
156-
break
157-
elif yi == y.npoints:
158-
while xi < x.npoints:
159-
new_list.append(xindices[xi])
160-
xi += 1
161-
break
162-
163-
xind = xindices[xi]
164-
yind = yindices[yi]
165-
166-
if xind == yind:
167-
new_list.append(xind)
168-
xi += 1
169-
yi += 1
170-
elif xind < yind:
171-
new_list.append(xind)
172-
xi += 1
173-
else:
174-
new_list.append(yind)
175-
yi += 1
141+
raise ValueError('Indices must reference same underlying length')
176142

177-
return IntIndex(x.length, new_list)
143+
new_indices = np.union1d(self.indices, y.indices)
144+
return IntIndex(self.length, new_indices)
178145

179146
@cython.wraparound(False)
180147
cpdef int lookup(self, Py_ssize_t index):

0 commit comments

Comments
 (0)