From 7804303df362de7a340590c38037413578cb6b11 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Fri, 15 Sep 2017 08:38:36 +0900 Subject: [PATCH 01/11] BUG: Fix make_sparse mask generation not to cast when dtype is object --- pandas/core/sparse/array.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index f965c91999a03..be1b94cff57c2 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import ( _ensure_platform_int, is_float, is_integer, + is_object_dtype, is_integer_dtype, is_bool_dtype, is_list_like, @@ -789,7 +790,16 @@ def make_sparse(arr, kind='block', fill_value=None): if is_string_dtype(arr): arr = arr.astype(object) - mask = arr != fill_value + if is_object_dtype(arr.dtype): + mask = [] + for e in arr: + if type(e) is type(fill_value): + mask.append(e != fill_value) + else: + mask.append(True) + mask = np.array(mask) + else: + mask = arr != fill_value length = len(arr) if length != mask.size: From b64c123fe508b8d7ef6d2f655630aa4764a971b3 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Tue, 19 Sep 2017 00:37:35 +0900 Subject: [PATCH 02/11] TST: Add test of the make_sparse mask generation --- pandas/tests/sparse/test_array.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index b0a9182a265fe..63050f6ce6fec 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -61,6 +61,12 @@ def test_constructor_object_dtype(self): assert arr.dtype == np.object assert arr.fill_value == 'A' + data = [False, 0, 100.0, 0.0] + arr = SparseArray(data, dtype=np.object, fill_value=False) + assert arr.dtype == np.object + assert arr.fill_value is False + assert (arr == np.array(data, dtype=np.object)).to_dense().all() + def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 
2])) tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan])) From bff0ac0305e106af8ac4674a243f54456e517f79 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Fri, 22 Sep 2017 00:11:24 +0900 Subject: [PATCH 03/11] TST: Add GitHub PR number comment on the test --- pandas/tests/sparse/test_array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index 63050f6ce6fec..fc6d1b7579fdf 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -61,6 +61,7 @@ def test_constructor_object_dtype(self): assert arr.dtype == np.object assert arr.fill_value == 'A' + # GH 17574 data = [False, 0, 100.0, 0.0] arr = SparseArray(data, dtype=np.object, fill_value=False) assert arr.dtype == np.object From 7190704e45b439c6bf0cba5c84407118ec31cfa4 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Fri, 22 Sep 2017 09:48:58 +0900 Subject: [PATCH 04/11] BUG: Fix the element-wise mask generation method in make_sparse --- pandas/core/sparse/array.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index be1b94cff57c2..7cba94f09b01f 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -791,13 +791,12 @@ def make_sparse(arr, kind='block', fill_value=None): arr = arr.astype(object) if is_object_dtype(arr.dtype): - mask = [] - for e in arr: - if type(e) is type(fill_value): - mask.append(e != fill_value) - else: - mask.append(True) - mask = np.array(mask) + mask = np.ones(len(arr), dtype=np.bool) + fv_type = type(fill_value) + + itr = (type(x) is fv_type for x in arr) + cond = np.fromiter(itr, dtype=np.bool) + mask[cond] = arr[cond] != fill_value else: mask = arr != fill_value From c9d674a3b88f43dd26ed3a67723878d92459d2e9 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Fri, 22 Sep 2017 11:29:05 +0900 Subject: [PATCH 05/11] DOC: Add the description of make_sparse bug fix in whatsnew note --- 
doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 23a98d59554e9..448ed279cdf23 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -544,6 +544,7 @@ Sparse - Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`) - Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`) +- Bug in :func:`make_sparse` treating two numeric/boolean values that have the same bits as equal when the array ``dtype`` is ``object`` (:issue:`17574`) Reshaping From 7727c5a12dac328098b65b5effedef9031bb5f47 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Fri, 22 Sep 2017 11:40:43 +0900 Subject: [PATCH 06/11] TST: Fix wrong array comparison in the make_sparse bug-fix test --- pandas/tests/sparse/test_array.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index fc6d1b7579fdf..f653ee50982ad 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -66,7 +66,9 @@ def test_constructor_object_dtype(self): arr = SparseArray(data, dtype=np.object, fill_value=False) assert arr.dtype == np.object assert arr.fill_value is False - assert (arr == np.array(data, dtype=np.object)).to_dense().all() + arr_expected = np.array(data, dtype=np.object) + it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) + assert np.fromiter(it, dtype=np.bool).all() def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) From 26cb4cae711488dc43d0c0239718947342d8d8d1 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 23 Sep 2017 00:47:58 +0900 Subject: [PATCH 07/11] BUG: Simplified the mask generation method in make_sparse --- pandas/core/sparse/array.py | 6 +++--- 1 file changed, 3 insertions(+), 
3 deletions(-) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 7cba94f09b01f..bc8faaf432184 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -794,9 +794,9 @@ def make_sparse(arr, kind='block', fill_value=None): mask = np.ones(len(arr), dtype=np.bool) fv_type = type(fill_value) - itr = (type(x) is fv_type for x in arr) - cond = np.fromiter(itr, dtype=np.bool) - mask[cond] = arr[cond] != fill_value + for i, x in enumerate(arr): + if type(x) is fv_type: + mask[i] = arr[i] != fill_value else: mask = arr != fill_value From 8e01026b4f3e8befb58d950c3d3e10f23d5e0e9c Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 25 Sep 2017 00:40:58 +0900 Subject: [PATCH 08/11] TST: Add the SparseArray constructor performance test --- asv_bench/benchmarks/sparse.py | 54 +++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 7259e8cdb7d61..4fd38fc20d412 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -2,7 +2,7 @@ from .pandas_vb_common import * import scipy.sparse -from pandas import SparseSeries, SparseDataFrame +from pandas import SparseSeries, SparseDataFrame, SparseArray class sparse_series_to_frame(object): @@ -23,6 +23,58 @@ def time_sparse_series_to_frame(self): SparseDataFrame(self.series) +class sparse_array_constructor(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(1) + self.int64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=0, dtype=np.int64) + self.int64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=0, dtype=np.int64) + + self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64) + self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64) + + self.object_10percent = 
self.make_object_array(length=1000000, dense_size=100000, fill_value=0) + self.object_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=0) + + def make_numeric_array(self, length, dense_size, fill_value, dtype): + arr = np.array([fill_value] * length, dtype=dtype) + indexer = np.unique(np.random.randint(0, length, dense_size)) + arr[indexer] = np.random.randint(0, 100, len(indexer)) + return (arr, fill_value, dtype) + + def make_object_array(self, length, dense_size, fill_value): + elems = np.array(['a', 0.0, False, 1, 2], dtype=np.object) + arr = np.array([fill_value] * length, dtype=np.object) + indexer = np.unique(np.random.randint(0, length, dense_size)) + arr[indexer] = np.random.choice(elems, len(indexer)) + return (arr, fill_value, np.object) + + def time_sparse_array_constructor_int64_10percent(self): + arr, fill_value, dtype = self.int64_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_int64_1percent(self): + arr, fill_value, dtype = self.int64_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_float64_10percent(self): + arr, fill_value, dtype = self.float64_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_float64_1percent(self): + arr, fill_value, dtype = self.float64_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_10percent(self): + arr, fill_value, dtype = self.object_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_1percent(self): + arr, fill_value, dtype = self.object_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + class sparse_frame_constructor(object): goal_time = 0.2 From a48f95726faf686e50c3d3725e71c9e60be13425 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 25 Sep 2017 02:05:05 +0900 Subject: [PATCH 09/11] DOC: Add the 
description of make_sparse mask generation routine --- pandas/core/sparse/array.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index bc8faaf432184..72bc3fbb3e8d1 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -791,6 +791,9 @@ def make_sparse(arr, kind='block', fill_value=None): arr = arr.astype(object) if is_object_dtype(arr.dtype): + # element-wise equality check method in numpy doesn't treat + # each element type, eg. 0, 0.0, and False are treated as + # same. So we have to check the both of its type and value. mask = np.ones(len(arr), dtype=np.bool) fv_type = type(fill_value) From 35890726d2ea19758439f8d8448ab3323c7d0493 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Tue, 26 Sep 2017 22:57:03 +0900 Subject: [PATCH 10/11] PERF: Reimplement the SparseArray mask create method for object ndarray by Cython --- pandas/_libs/sparse.pyx | 19 +++++++++++++++++++ pandas/core/sparse/array.py | 7 +------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 1cc7f5ace95ea..fac678e531c8b 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -848,3 +848,22 @@ def reindex_integer(ndarray[float64_t, ndim=1] values, IntIndex sparse_index, ndarray[int32_t, ndim=1] indexer): pass + + +# ----------------------------------------------------------------------------- +# SparseArray mask create operations + +def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value): + cdef object value + cdef Py_ssize_t i + cdef Py_ssize_t new_length = len(arr) + cdef ndarray[int8_t, ndim=1] mask + + mask = np.ones(new_length, dtype=np.int8) + + for i in range(new_length): + value = arr[i] + if value == fill_value and type(value) == type(fill_value): + mask[i] = 0 + + return mask.view(dtype=np.bool) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 72bc3fbb3e8d1..3b45a013734c9 100644 
--- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -794,12 +794,7 @@ def make_sparse(arr, kind='block', fill_value=None): # element-wise equality check method in numpy doesn't treat # each element type, eg. 0, 0.0, and False are treated as # same. So we have to check the both of its type and value. - mask = np.ones(len(arr), dtype=np.bool) - fv_type = type(fill_value) - - for i, x in enumerate(arr): - if type(x) is fv_type: - mask[i] = arr[i] != fill_value + mask = splib.make_mask_object_ndarray(arr, fill_value) else: mask = arr != fill_value From 48687b32710ff0be3fa8753aaffe8dd3f76d94d6 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Wed, 27 Sep 2017 00:41:19 +0900 Subject: [PATCH 11/11] TST: Add asv tests for the object dtype SparseArray with NaN fill value --- asv_bench/benchmarks/sparse.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 4fd38fc20d412..a435882bbca71 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -34,8 +34,11 @@ def setup(self): self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64) self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64) - self.object_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0) - self.object_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=0) + self.object_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=np.nan) + self.object_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=np.nan) + + self.object_non_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0) + self.object_non_nan_fill_value_1percent = self.make_object_array(length=1000000, 
dense_size=10000, fill_value=0) def make_numeric_array(self, length, dense_size, fill_value, dtype): arr = np.array([fill_value] * length, dtype=dtype) @@ -66,12 +69,20 @@ def time_sparse_array_constructor_float64_1percent(self): arr, fill_value, dtype = self.float64_1percent SparseArray(arr, fill_value=fill_value, dtype=dtype) - def time_sparse_array_constructor_object_10percent(self): - arr, fill_value, dtype = self.object_10percent + def time_sparse_array_constructor_object_nan_fill_value_10percent(self): + arr, fill_value, dtype = self.object_nan_fill_value_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_nan_fill_value_1percent(self): + arr, fill_value, dtype = self.object_nan_fill_value_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self): + arr, fill_value, dtype = self.object_non_nan_fill_value_10percent SparseArray(arr, fill_value=fill_value, dtype=dtype) - def time_sparse_array_constructor_object_1percent(self): - arr, fill_value, dtype = self.object_1percent + def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self): + arr, fill_value, dtype = self.object_non_nan_fill_value_1percent SparseArray(arr, fill_value=fill_value, dtype=dtype)