diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 72f63a4da0f4d..c9c3c899bce61 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -823,6 +823,7 @@ Sparse - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) - Bug in :class:`SparseSeries.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) - Bug in constructing a ``SparseArray``: if ``data`` is a scalar and ``index`` is defined it will coerce to ``float64`` regardless of scalar's dtype. (:issue:`19163`) +- Bug in :meth:`SparseSeries.unique` which returned only the explicitly stored sparse values and omitted the ``fill_value`` (:issue:`19651`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c754c063fce8e..51355db9f7d8a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -10,7 +10,8 @@ maybe_promote, construct_1d_object_array_from_listlike) from pandas.core.dtypes.generic import ( ABCSeries, ABCIndex, - ABCIndexClass, ABCCategorical) + ABCIndexClass, ABCCategorical, + ABCSparseArray) from pandas.core.dtypes.common import ( is_unsigned_integer_dtype, is_signed_integer_dtype, is_integer_dtype, is_complex_dtype, @@ -362,7 +363,14 @@ def unique(values): htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - uniques = table.unique(values) + + if isinstance(values, ABCSparseArray): + to_unique = values.sp_values + if values.sp_index.ngaps > 0: + to_unique = np.append(to_unique, [values.fill_value]) + uniques = table.unique(to_unique) + else: + uniques = table.unique(values) uniques = _reconstruct_data(uniques, dtype, original) if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype): @@ -469,6 +477,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): table = hash_klass(size_hint or len(values)) uniques = vec_klass() check_nulls = not is_integer_dtype(original) + labels = 
table.get_labels(values, uniques, 0, na_sentinel, check_nulls) labels = _ensure_platform_int(labels) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b1e3177547ac6..bc3eb74502c6d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -8,7 +8,8 @@ from datetime import datetime from itertools import permutations from pandas import (Series, Categorical, CategoricalIndex, - Timestamp, DatetimeIndex, Index, IntervalIndex) + Timestamp, DatetimeIndex, Index, IntervalIndex, + SparseArray) import pandas as pd from pandas import compat @@ -268,6 +269,16 @@ def test_object_refcount_bug(self): for i in range(1000): len(algos.unique(lst)) + @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None]) + def test_sparse(self, fill_value): + # GH 19595 + arr = SparseArray([0, 1, np.nan, None], fill_value=fill_value) + + result = algos.unique(arr) + + assert isinstance(result, np.ndarray) + assert len(result) == 3 + def test_on_index_object(self): mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile(