Skip to content

Commit 45bd471

Browse files
Licht-T and jreback
authored and committed
BUG: Fix make_sparse mask generation (#17574)
1 parent db1206a commit 45bd471

File tree

5 files changed

+101
-2
lines changed

5 files changed

+101
-2
lines changed

asv_bench/benchmarks/sparse.py

+64-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from .pandas_vb_common import *
44
import scipy.sparse
5-
from pandas import SparseSeries, SparseDataFrame
5+
from pandas import SparseSeries, SparseDataFrame, SparseArray
66

77

88
class sparse_series_to_frame(object):
@@ -23,6 +23,69 @@ def time_sparse_series_to_frame(self):
2323
SparseDataFrame(self.series)
2424

2525

26+
class sparse_array_constructor(object):
    """asv benchmark suite timing ``SparseArray`` construction.

    ``setup`` prepares dense input arrays of one million elements in which
    roughly 10% or 1% of the positions differ from the fill value, for
    ``int64``, ``float64`` and ``object`` dtypes.  Every ``time_*`` method
    then builds a ``SparseArray`` from one of those prepared inputs.
    """

    goal_time = 0.2

    def setup(self):
        """Create the dense inputs consumed by the ``time_*`` benchmarks.

        The RNG is seeded so every run sees the same data.  Each attribute
        holds an ``(array, fill_value, dtype)`` tuple.
        """
        np.random.seed(1)
        self.int64_10percent = self.make_numeric_array(
            length=1000000, dense_size=100000, fill_value=0, dtype=np.int64)
        self.int64_1percent = self.make_numeric_array(
            length=1000000, dense_size=10000, fill_value=0, dtype=np.int64)

        self.float64_10percent = self.make_numeric_array(
            length=1000000, dense_size=100000, fill_value=np.nan,
            dtype=np.float64)
        self.float64_1percent = self.make_numeric_array(
            length=1000000, dense_size=10000, fill_value=np.nan,
            dtype=np.float64)

        self.object_nan_fill_value_10percent = self.make_object_array(
            length=1000000, dense_size=100000, fill_value=np.nan)
        self.object_nan_fill_value_1percent = self.make_object_array(
            length=1000000, dense_size=10000, fill_value=np.nan)

        self.object_non_nan_fill_value_10percent = self.make_object_array(
            length=1000000, dense_size=100000, fill_value=0)
        self.object_non_nan_fill_value_1percent = self.make_object_array(
            length=1000000, dense_size=10000, fill_value=0)

    def make_numeric_array(self, length, dense_size, fill_value, dtype):
        """Return ``(arr, fill_value, dtype)`` for a numeric benchmark case.

        ``arr`` has ``length`` elements of ``dtype``, all equal to
        ``fill_value`` except at up to ``dense_size`` random positions,
        which receive random integers drawn from ``[0, 100)``.
        """
        # np.full avoids materialising a ``length``-element Python list.
        arr = np.full(length, fill_value, dtype=dtype)
        # Duplicate draws are collapsed, so fewer than ``dense_size``
        # positions may end up being overwritten.
        indexer = np.unique(np.random.randint(0, length, dense_size))
        arr[indexer] = np.random.randint(0, 100, len(indexer))
        return (arr, fill_value, dtype)

    def make_object_array(self, length, dense_size, fill_value):
        """Return ``(arr, fill_value, object)`` for an object-dtype case.

        ``arr`` has ``length`` elements, all ``fill_value`` except at up to
        ``dense_size`` random positions, which receive values of mixed
        Python types (str, float, bool, int).
        """
        # The builtin ``object`` replaces the ``np.object`` alias, which
        # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
        elems = np.array(['a', 0.0, False, 1, 2], dtype=object)
        arr = np.full(length, fill_value, dtype=object)
        indexer = np.unique(np.random.randint(0, length, dense_size))
        arr[indexer] = np.random.choice(elems, len(indexer))
        return (arr, fill_value, object)

    def time_sparse_array_constructor_int64_10percent(self):
        arr, fill_value, dtype = self.int64_10percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_int64_1percent(self):
        arr, fill_value, dtype = self.int64_1percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_float64_10percent(self):
        arr, fill_value, dtype = self.float64_10percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_float64_1percent(self):
        arr, fill_value, dtype = self.float64_1percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_object_nan_fill_value_10percent(self):
        arr, fill_value, dtype = self.object_nan_fill_value_10percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_object_nan_fill_value_1percent(self):
        arr, fill_value, dtype = self.object_nan_fill_value_1percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self):
        arr, fill_value, dtype = self.object_non_nan_fill_value_10percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self):
        arr, fill_value, dtype = self.object_non_nan_fill_value_1percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)
87+
88+
2689
class sparse_frame_constructor(object):
2790
goal_time = 0.2
2891

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,7 @@ Sparse
631631
- Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`)
632632
- Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`)
633633
- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`)
634+
- Bug in :func:`make_sparse` where numeric and boolean values that compare equal (e.g. ``0``, ``0.0`` and ``False``) were treated as the same value when the array ``dtype`` is ``object`` (:issue:`17574`)
634635

635636
Reshaping
636637
^^^^^^^^^

pandas/_libs/sparse.pyx

+19
Original file line numberDiff line numberDiff line change
@@ -848,3 +848,22 @@ def reindex_integer(ndarray[float64_t, ndim=1] values,
848848
IntIndex sparse_index,
849849
ndarray[int32_t, ndim=1] indexer):
850850
pass
851+
852+
853+
# -----------------------------------------------------------------------------
854+
# SparseArray mask create operations
855+
856+
def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value):
    """
    Build the "differs from fill_value" mask for a 1-D object array.

    An element counts as the fill value only when it has exactly the same
    type as ``fill_value`` AND compares equal to it, so ``0``, ``0.0`` and
    ``False`` are kept distinct from one another.

    Parameters
    ----------
    arr : 1-D ndarray of object dtype
    fill_value : object

    Returns
    -------
    mask : boolean ndarray with the same length as ``arr``; True where the
        element is not the fill value, False where it is.
    """
    cdef:
        object value
        # Loop invariant: look up the fill value's type only once.
        object fill_type = type(fill_value)
        Py_ssize_t i
        Py_ssize_t new_length = len(arr)
        ndarray[int8_t, ndim=1] mask

    # Start from "everything differs" and clear the matching positions.
    mask = np.ones(new_length, dtype=np.int8)

    for i in range(new_length):
        value = arr[i]
        # Test the type first: it is cheap and keeps elements of another
        # type from ever invoking their ``__eq__``.
        if type(value) is fill_type and value == fill_value:
            mask[i] = 0

    # int8 and bool are both one byte wide, so the buffer can be
    # reinterpreted without copying.  ``np.bool_`` is used because the
    # ``np.bool`` alias is not available in every NumPy release.
    return mask.view(dtype=np.bool_)

pandas/core/sparse/array.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from pandas.core.dtypes.common import (
2020
_ensure_platform_int,
2121
is_float, is_integer,
22+
is_object_dtype,
2223
is_integer_dtype,
2324
is_bool_dtype,
2425
is_list_like,
@@ -789,7 +790,13 @@ def make_sparse(arr, kind='block', fill_value=None):
789790
if is_string_dtype(arr):
790791
arr = arr.astype(object)
791792

792-
mask = arr != fill_value
793+
if is_object_dtype(arr.dtype):
794+
# NumPy's element-wise comparison ignores the type of each element,
795+
# e.g. 0, 0.0, and False all compare equal. For object arrays we
796+
# therefore have to check both the type and the value of each element.
797+
mask = splib.make_mask_object_ndarray(arr, fill_value)
798+
else:
799+
mask = arr != fill_value
793800

794801
length = len(arr)
795802
if length != mask.size:

pandas/tests/sparse/test_array.py

+9
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,15 @@ def test_constructor_object_dtype(self):
6161
assert arr.dtype == np.object
6262
assert arr.fill_value == 'A'
6363

64+
# GH 17574
65+
data = [False, 0, 100.0, 0.0]
66+
arr = SparseArray(data, dtype=np.object, fill_value=False)
67+
assert arr.dtype == np.object
68+
assert arr.fill_value is False
69+
arr_expected = np.array(data, dtype=np.object)
70+
it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected))
71+
assert np.fromiter(it, dtype=np.bool).all()
72+
6473
def test_constructor_spindex_dtype(self):
6574
arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]))
6675
tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan]))

0 commit comments

Comments
 (0)