From b1cf4b5d2819a043c39feea03f8ec705f6ac990e Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 30 Apr 2016 09:46:23 +0900 Subject: [PATCH] PERF: Sparse IntIndex.make_union --- asv_bench/benchmarks/sparse.py | 41 ++++- doc/source/whatsnew/v0.18.1.txt | 1 + pandas/sparse/tests/test_libsparse.py | 252 ++++++++++++++------------ pandas/src/sparse.pyx | 45 +---- 4 files changed, 188 insertions(+), 151 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index d7ee58fc978ea..c9f979e323445 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -52,4 +52,43 @@ def setup(self): self.ss = self.s.to_sparse() def time_sparse_series_to_coo(self): - self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) \ No newline at end of file + self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) + + +class sparse_arithmetic(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(1) + self.a_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan) + self.b_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan) + + self.a_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0) + self.b_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0) + + self.a_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan) + self.b_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan) + + def make_sparse_array(self, length, dense_size, fill_value): + arr = np.array([fill_value] * length, dtype=np.float64) + indexer = np.unique(np.random.randint(0, length, dense_size)) + arr[indexer] = np.random.randint(0, 100, len(indexer)) + return pd.SparseArray(arr, fill_value=fill_value) + + def time_sparse_addition_10percent(self): + self.a_10percent + self.b_10percent + + def time_sparse_addition_10percent_zero(self): + self.a_10percent_zero + self.b_10percent_zero + + def time_sparse_addition_1percent(self): + self.a_1percent + self.b_1percent + + def time_sparse_division_10percent(self): + self.a_10percent / self.b_10percent + + def time_sparse_division_10percent_zero(self): + self.a_10percent_zero / self.b_10percent_zero + + def time_sparse_division_1percent(self): + self.a_1percent / self.b_1percent diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 2474bf8377d6e..d6f99fbfb44a1 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -496,6 +496,7 @@ Performance Improvements - Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`). - Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`) - Improved performance of ``to_numeric`` if input is numeric dtype (:issue:`12777`) +- Improved performance of sparse arithmetic with ``IntIndex`` (:issue:`13036`) diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index 8d7ae012d0fe9..71a6e7fc043eb 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -43,117 +43,147 @@ def _check_case_dict(case): _check_case([], [], [], [], [], []) -def test_index_make_union(): - def _check_case(xloc, xlen, yloc, ylen, eloc, elen): - xindex = BlockIndex(TEST_LENGTH, xloc, xlen) - yindex = BlockIndex(TEST_LENGTH, yloc, ylen) - bresult = xindex.make_union(yindex) - assert (isinstance(bresult, BlockIndex)) - assert_equal(bresult.blocs, eloc) - assert_equal(bresult.blengths, elen) - - ixindex = xindex.to_int_index() - iyindex = yindex.to_int_index() - iresult = ixindex.make_union(iyindex) - assert (isinstance(iresult, IntIndex)) - assert_equal(iresult.indices, bresult.to_int_index().indices) - - """ - x: ---- - y: ---- - r: -------- - """ - xloc = [0] - xlen = [5] - yloc = [5] - ylen = [4] - eloc = [0] - elen = [9] - _check_case(xloc, xlen, yloc, ylen, eloc, elen) - """ - x: ----- ----- - y: ----- -- - """ - xloc = [0, 10] - xlen = [5, 5] - yloc = [2, 17] - ylen = [5, 2] - eloc = [0, 10, 17] - elen = [7, 5, 2] - _check_case(xloc, xlen, yloc, ylen, eloc, elen) - """ - x: ------ - y: ------- - r: ---------- - """ - xloc = [1] - xlen = [5] - yloc = [3] - ylen = [5] - eloc = [1] - elen = [7] - _check_case(xloc, xlen, yloc, ylen, eloc, elen) - """ - x: ------ ----- - y: ------- - r: ------------- - """ - xloc = [2, 10] - xlen = [4, 4] - yloc = [4] - ylen = [8] - eloc = [2] - elen = [12] - _check_case(xloc, xlen, yloc, ylen, eloc, elen) - """ - x: --- ----- - y: ------- - r: ------------- - """ - xloc = [0, 5] - xlen = [3, 5] - yloc = [0] - ylen = [7] - eloc = [0] - elen = [10] - _check_case(xloc, xlen, yloc, ylen, eloc, elen) - """ - x: ------ ----- - y: ------- --- - r: ------------- - """ - xloc = [2, 10] - xlen = [4, 4] - yloc = [4, 13] - ylen = [8, 4] - eloc = [2] - elen = [15] - _check_case(xloc, xlen, yloc, ylen, eloc, elen) - """ - x: ---------------------- - y: ---- ---- --- - r: ---------------------- - """ - xloc = [2] - xlen = [15] - yloc = [4, 9, 14] - ylen = [3, 2, 2] - eloc = [2] - elen = [15] - _check_case(xloc, xlen, yloc, ylen, eloc, elen) - """ - x: ---- --- - y: --- --- - """ - xloc = [0, 10] - xlen = [3, 3] - yloc = [5, 15] - ylen = [2, 2] - eloc = [0, 5, 10, 15] - elen = [3, 2, 3, 2] - _check_case(xloc, xlen, yloc, ylen, eloc, elen) - - # TODO: different-length index objects +class TestSparseIndexUnion(tm.TestCase): + + def test_index_make_union(self): + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + bresult = xindex.make_union(yindex) + assert (isinstance(bresult, BlockIndex)) + assert_equal(bresult.blocs, eloc) + assert_equal(bresult.blengths, elen) + + ixindex = xindex.to_int_index() + iyindex = yindex.to_int_index() + iresult = ixindex.make_union(iyindex) + assert (isinstance(iresult, IntIndex)) + assert_equal(iresult.indices, bresult.to_int_index().indices) + + """ + x: ---- + y: ---- + r: -------- + """ + xloc = [0] + xlen = [5] + yloc = [5] + ylen = [4] + eloc = [0] + elen = [9] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + """ + x: ----- ----- + y: ----- -- + """ + xloc = [0, 10] + xlen = [5, 5] + yloc = [2, 17] + ylen = [5, 2] + eloc = [0, 10, 17] + elen = [7, 5, 2] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + """ + x: ------ + y: ------- + r: ---------- + """ + xloc = [1] + xlen = [5] + yloc = [3] + ylen = [5] + eloc = [1] + elen = [7] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + """ + x: ------ ----- + y: ------- + r: ------------- + """ + xloc = [2, 10] + xlen = [4, 4] + yloc = [4] + ylen = [8] + eloc = [2] + elen = [12] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + """ + x: --- ----- + y: ------- + r: ------------- + """ + xloc = [0, 5] + xlen = [3, 5] + yloc = [0] + ylen = [7] + eloc = [0] + elen = [10] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + """ + x: ------ ----- + y: ------- --- + r: ------------- + """ + xloc = [2, 10] + xlen = [4, 4] + yloc = [4, 13] + ylen = [8, 4] + eloc = [2] + elen = [15] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + """ + x: ---------------------- + y: ---- ---- --- + r: ---------------------- + """ + xloc = [2] + xlen = [15] + yloc = [4, 9, 14] + ylen = [3, 2, 2] + eloc = [2] + elen = [15] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + """ + x: ---- --- + y: --- --- + """ + xloc = [0, 10] + xlen = [3, 3] + yloc = [5, 15] + ylen = [2, 2] + eloc = [0, 5, 10, 15] + elen = [3, 2, 3, 2] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + def test_intindex_make_union(self): + a = IntIndex(5, np.array([0, 3, 4], dtype=np.int32)) + b = IntIndex(5, np.array([0, 2], dtype=np.int32)) + res = a.make_union(b) + exp = IntIndex(5, np.array([0, 2, 3, 4], np.int32)) + self.assertTrue(res.equals(exp)) + + a = IntIndex(5, np.array([], dtype=np.int32)) + b = IntIndex(5, np.array([0, 2], dtype=np.int32)) + res = a.make_union(b) + exp = IntIndex(5, np.array([0, 2], np.int32)) + self.assertTrue(res.equals(exp)) + + a = IntIndex(5, np.array([], dtype=np.int32)) + b = IntIndex(5, np.array([], dtype=np.int32)) + res = a.make_union(b) + exp = IntIndex(5, np.array([], np.int32)) + self.assertTrue(res.equals(exp)) + + a = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32)) + b = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32)) + res = a.make_union(b) + exp = IntIndex(5, np.array([0, 1, 2, 3, 4], np.int32)) + self.assertTrue(res.equals(exp)) + + a = IntIndex(5, np.array([0, 1], dtype=np.int32)) + b = IntIndex(4, np.array([0, 1], dtype=np.int32)) + with tm.assertRaises(ValueError): + a.make_union(b) class TestSparseIndexCommon(tm.TestCase): diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx index 29f3d61033f6a..474733a1e005c 100644 --- a/pandas/src/sparse.pyx +++ b/pandas/src/sparse.pyx @@ -129,52 +129,19 @@ cdef class IntIndex(SparseIndex): return IntIndex(self.length, new_list) cpdef IntIndex make_union(self, SparseIndex y_): - cdef: - Py_ssize_t out_length, i, xi, yi - int32_t xind - ndarray[int32_t, ndim=1] xindices, yindices - list new_list = [] - IntIndex x, y - x = self + cdef: + ndarray[int32_t, ndim=1] new_indices + IntIndex y # if is one already, returns self y = y_.to_int_index() if self.length != y.length: - raise Exception('Indices must reference same underlying length') - - xindices = self.indices - yindices = y.indices - - xi = yi = 0 - while True: - if xi == x.npoints: - while yi < y.npoints: - new_list.append(yindices[yi]) - yi += 1 - break - elif yi == y.npoints: - while xi < x.npoints: - new_list.append(xindices[xi]) - xi += 1 - break - - xind = xindices[xi] - yind = yindices[yi] - - if xind == yind: - new_list.append(xind) - xi += 1 - yi += 1 - elif xind < yind: - new_list.append(xind) - xi += 1 - else: - new_list.append(yind) - yi += 1 + raise ValueError('Indices must reference same underlying length') - return IntIndex(x.length, new_list) + new_indices = np.union1d(self.indices, y.indices) + return IntIndex(self.length, new_indices) @cython.wraparound(False) cpdef int lookup(self, Py_ssize_t index):