From b4a635995302cebfb916e05afb4dc78ffe67a4ea Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Thu, 8 Feb 2018 15:51:44 +0700 Subject: [PATCH 01/10] First pass at having unique work for sparse array --- pandas/_libs/hashtable_class_helper.pxi.in | 24 +++++++++++++++++----- pandas/core/algorithms.py | 10 +++++++-- pandas/core/groupby.py | 9 ++++++++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bca4e388f3279..98fae408ae738 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -261,15 +261,29 @@ def get_dispatch(dtypes): cdef: Py_ssize_t i, n = len(values) int ret = 0 - {dtype}_t val + {dtype}_t val, fill_value_val, ngaps_val khiter_t k bint seen_na = 0 {name}Vector uniques = {name}Vector() {name}VectorData *ud ud = uniques.data - + fill_value_val = fill_value + ngaps_val = ngaps + with nogil: + # If this is a sparse structure we need to append + # The fill value as well assuming the ngaps are greater than 0 + + if ngaps_val > 0: + k = kh_get_{dtype}(self.table, fill_value_val) + if k == self.table.n_buckets: + kh_put_{dtype}(self.table, fill_value_val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{dtype}(ud, fill_value_val) + for i in range(n): val = values[i] IF {float_group}: @@ -496,10 +510,10 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques @cython.boundscheck(False) - def unique(self, ndarray[{{dtype}}_t, ndim=1] values): + def unique(self, ndarray[{{dtype}}_t, ndim=1] values, fill_value=np.nan, ngaps=0): if values.flags.writeable: # If the value is writeable (mutable) then use memview - return self.unique_memview(values) + return self.unique_memview(values, fill_value=fill_value, ngaps=ngaps) # We cannot use the memoryview version on readonly-buffers due to # a limitation of Cython's typed memoryviews. 
Instead we can use @@ -508,7 +522,7 @@ cdef class {{name}}HashTable(HashTable): {{unique_template}} @cython.boundscheck(False) - def unique_memview(self, {{dtype}}_t[:] values): + def unique_memview(self, {{dtype}}_t[:] values, fill_value=np.nan, ngaps=0): {{unique_template}} {{endfor}} diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c754c063fce8e..ebdec42393479 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -10,7 +10,8 @@ maybe_promote, construct_1d_object_array_from_listlike) from pandas.core.dtypes.generic import ( ABCSeries, ABCIndex, - ABCIndexClass, ABCCategorical) + ABCIndexClass, ABCCategorical, + ABCSparseArray) from pandas.core.dtypes.common import ( is_unsigned_integer_dtype, is_signed_integer_dtype, is_integer_dtype, is_complex_dtype, @@ -362,7 +363,11 @@ def unique(values): htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - uniques = table.unique(values) + import ipdb; ipdb.set_trace() + if isinstance(values, ABCSparseArray): + uniques = table.unique(values, fill_value=values.fill_value, ngaps=values.sp_index.ngaps) + else: + uniques = table.unique(values) uniques = _reconstruct_data(uniques, dtype, original) if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype): @@ -461,6 +466,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): PeriodIndex """ + import ipdb; ipdb.set_trace() values = _ensure_arraylike(values) original = values values, dtype, _ = _ensure_data(values) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 01241db7c0c42..fa11850bef89d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1035,6 +1035,7 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, return self._wrap_aggregated_output(output, names) def _python_agg_general(self, func, *args, **kwargs): + import ipdb; ipdb.set_trace() func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) @@ -1224,6 +1225,7 @@ def count(self): @Substitution(name='groupby') @Appender(_doc_template) def mean(self, *args, **kwargs): + import ipdb; ipdb.set_trace() """ Compute mean of groups, excluding missing values @@ -1948,6 +1950,7 @@ class BaseGrouper(object): def __init__(self, axis, groupings, sort=True, group_keys=True, mutated=False, indexer=None): + import ipdb; ipdb.set_trace() self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis self.groupings = groupings @@ -2097,6 +2100,7 @@ def is_monotonic(self): @cache_readonly def group_info(self): + import ipdb; ipdb.set_trace() comp_ids, obs_group_ids = self._get_compressed_labels() ngroups = len(obs_group_ids) @@ -2607,6 +2611,7 @@ def indices(self): @cache_readonly def group_info(self): + import ipdb; ipdb.set_trace() ngroups = self.ngroups obs_group_ids = np.arange(ngroups) rep = np.diff(np.r_[0, self.bins]) @@ -2684,6 +2689,7 @@ class Grouping(object): def __init__(self, index, grouper=None, obj=None, name=None, level=None, sort=True, in_axis=False): + import ipdb; ipdb.set_trace() self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) @@ -2816,6 +2822,7 @@ def group_index(self): return self._group_index def _make_labels(self): + import ipdb; ipdb.set_trace() if self._labels is None or self._group_index is None: # we have a list of groupers if isinstance(self.grouper, BaseGrouper): @@ -2993,6 +3000,7 @@ def is_in_obj(gpr): for i, (gpr, level) in enumerate(zip(keys, levels)): + import ipdb; ipdb.set_trace() if 
is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name exclusions.append(name) @@ -3024,6 +3032,7 @@ def is_in_obj(gpr): # create the Grouping # allow us to passing the actual Grouping as the gpr + ping = Grouping(group_axis, gpr, obj=obj, From 90a0b3c52482e24887973e58eb510e027ff6e017 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Thu, 8 Feb 2018 16:22:30 +0700 Subject: [PATCH 02/10] First pass at fixing group by sparse data frames --- pandas/_libs/hashtable_class_helper.pxi.in | 25 +++++++++++++--------- pandas/core/algorithms.py | 14 ++++++++---- pandas/core/groupby.py | 8 ------- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 98fae408ae738..a959b1514580c 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -251,12 +251,12 @@ cdef class HashTable: {{py: # name, dtype, null_condition, float_group -dtypes = [('Float64', 'float64', 'val != val', True), - ('UInt64', 'uint64', 'False', False), - ('Int64', 'int64', 'val == iNaT', False)] +dtypes = [('Float64', 'float64', 'val != val', True, 'NAN'), + ('UInt64', 'uint64', 'False', False, 'NAN'), + ('Int64', 'int64', 'val == iNaT', False, 'iNaT')] def get_dispatch(dtypes): - for (name, dtype, null_condition, float_group) in dtypes: + for (name, dtype, null_condition, float_group, na_value) in dtypes: unique_template = """\ cdef: Py_ssize_t i, n = len(values) @@ -314,11 +314,11 @@ def get_dispatch(dtypes): unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group) - yield (name, dtype, null_condition, float_group, unique_template) + yield (name, dtype, null_condition, float_group, unique_template, na_value) }} -{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}} +{{for name, dtype, null_condition, float_group, unique_template, na_value in get_dispatch(dtypes)}} cdef class {{name}}HashTable(HashTable): @@ -419,22 +419,27 @@ cdef class {{name}}HashTable(HashTable): labels = self.get_labels(values, uniques, 0, 0) return uniques.to_array(), labels + # This seems like duplicate code from def uniques to me... + # Why does this exist? 
@cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): + bint check_null=True, fill_value={{na_value}}, ngaps=0): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 - {{dtype}}_t val + {{dtype}}_t val, fill_value_val, ngaps_val khiter_t k {{name}}VectorData *ud labels = np.empty(n, dtype=np.int64) ud = uniques.data + if ngaps > 0: + print("Hello world") + with nogil: for i in range(n): val = values[i] @@ -510,7 +515,7 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques @cython.boundscheck(False) - def unique(self, ndarray[{{dtype}}_t, ndim=1] values, fill_value=np.nan, ngaps=0): + def unique(self, ndarray[{{dtype}}_t, ndim=1] values, fill_value={{na_value}}, ngaps=0): if values.flags.writeable: # If the value is writeable (mutable) then use memview return self.unique_memview(values, fill_value=fill_value, ngaps=ngaps) @@ -522,7 +527,7 @@ cdef class {{name}}HashTable(HashTable): {{unique_template}} @cython.boundscheck(False) - def unique_memview(self, {{dtype}}_t[:] values, fill_value=np.nan, ngaps=0): + def unique_memview(self, {{dtype}}_t[:] values, fill_value={{na_value}}, ngaps=0): {{unique_template}} {{endfor}} diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ebdec42393479..03bbe4956d0e7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -363,9 +363,10 @@ def unique(values): htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - import ipdb; ipdb.set_trace() + if isinstance(values, ABCSparseArray): - uniques = table.unique(values, fill_value=values.fill_value, ngaps=values.sp_index.ngaps) + uniques = table.unique(values, fill_value=values.fill_value, + ngaps=values.sp_index.ngaps) else: uniques = table.unique(values) uniques = _reconstruct_data(uniques, dtype, original) @@ -466,7 +467,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): PeriodIndex """ - import ipdb; ipdb.set_trace() values = _ensure_arraylike(values) original = values values, dtype, _ = _ensure_data(values) @@ -475,7 +475,13 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): table = hash_klass(size_hint or len(values)) uniques = vec_klass() check_nulls = not is_integer_dtype(original) - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) + + if isinstance(values, ABCSparseArray): + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, + fill_value=values.fill_value, + ngaps=values.sp_index.ngaps) + else: + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) labels = _ensure_platform_int(labels) uniques = uniques.to_array() diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index fa11850bef89d..a21caa972694b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1035,7 +1035,6 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, return self._wrap_aggregated_output(output, names) def _python_agg_general(self, func, *args, **kwargs): - import ipdb; ipdb.set_trace() func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) @@ -1225,7 +1224,6 @@ def count(self): @Substitution(name='groupby') @Appender(_doc_template) def mean(self, *args, **kwargs): - import ipdb; ipdb.set_trace() """ Compute mean of groups, excluding missing values @@ -1950,7 +1948,6 @@ class 
BaseGrouper(object): def __init__(self, axis, groupings, sort=True, group_keys=True, mutated=False, indexer=None): - import ipdb; ipdb.set_trace() self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis self.groupings = groupings @@ -2100,7 +2097,6 @@ def is_monotonic(self): @cache_readonly def group_info(self): - import ipdb; ipdb.set_trace() comp_ids, obs_group_ids = self._get_compressed_labels() ngroups = len(obs_group_ids) @@ -2611,7 +2607,6 @@ def indices(self): @cache_readonly def group_info(self): - import ipdb; ipdb.set_trace() ngroups = self.ngroups obs_group_ids = np.arange(ngroups) rep = np.diff(np.r_[0, self.bins]) @@ -2689,7 +2684,6 @@ class Grouping(object): def __init__(self, index, grouper=None, obj=None, name=None, level=None, sort=True, in_axis=False): - import ipdb; ipdb.set_trace() self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) @@ -2822,7 +2816,6 @@ def group_index(self): return self._group_index def _make_labels(self): - import ipdb; ipdb.set_trace() if self._labels is None or self._group_index is None: # we have a list of groupers if isinstance(self.grouper, BaseGrouper): @@ -3000,7 +2993,6 @@ def is_in_obj(gpr): for i, (gpr, level) in enumerate(zip(keys, levels)): - import ipdb; ipdb.set_trace() if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name exclusions.append(name) From 911c265e0e50f1bcf6019d937e8319d0d1014832 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 12 Feb 2018 09:58:41 +0700 Subject: [PATCH 03/10] Cleanup hashtable_class_helper.pxi.in --- pandas/_libs/hashtable_class_helper.pxi.in | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a959b1514580c..636fdd39b708b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -272,9 +272,6 @@ def get_dispatch(dtypes): ngaps_val = ngaps with nogil: - # If this is a sparse structure we need to append - # The fill value as well assuming the ngaps are greater than 0 - if ngaps_val > 0: k = kh_get_{dtype}(self.table, fill_value_val) if k == self.table.n_buckets: @@ -419,8 +416,6 @@ cdef class {{name}}HashTable(HashTable): labels = self.get_labels(values, uniques, 0, 0) return uniques.to_array(), labels - # This seems like duplicate code from def uniques to me... - # Why does this exist? 
@cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, @@ -437,9 +432,6 @@ cdef class {{name}}HashTable(HashTable): labels = np.empty(n, dtype=np.int64) ud = uniques.data - if ngaps > 0: - print("Hello world") - with nogil: for i in range(n): val = values[i] From d9d643b76f2d540d123e1003a0a0d1b4dc1e1f62 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 12 Feb 2018 10:17:48 +0700 Subject: [PATCH 04/10] uint's NA value is 0, so set it as such --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- pandas/tests/test_algos.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 636fdd39b708b..e6c1372ebee22 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -252,7 +252,7 @@ cdef class HashTable: # name, dtype, null_condition, float_group dtypes = [('Float64', 'float64', 'val != val', True, 'NAN'), - ('UInt64', 'uint64', 'False', False, 'NAN'), + ('UInt64', 'uint64', 'False', False, '0'), ('Int64', 'int64', 'val == iNaT', False, 'iNaT')] def get_dispatch(dtypes): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b1e3177547ac6..92d715f884d7c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -8,7 +8,8 @@ from datetime import datetime from itertools import permutations from pandas import (Series, Categorical, CategoricalIndex, - Timestamp, DatetimeIndex, Index, IntervalIndex) + Timestamp, DatetimeIndex, Index, IntervalIndex, + SparseArray) import pandas as pd from pandas import compat @@ -268,6 +269,16 @@ def test_object_refcount_bug(self): for i in range(1000): len(algos.unique(lst)) + @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None]) + def test_sparse(self, fill_value): + arr = SparseArray([0, 1, np.nan, None], fill_value=fill_value) + + result = algos.unique(arr) + + assert isinstance(result, np.ndarray) + assert len(result) == 3 + + def test_on_index_object(self): mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile( From 6666cd6a80f01d6fc2569daa149773db66c8c99b Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 12 Feb 2018 10:19:53 +0700 Subject: [PATCH 05/10] Add whatsnew entry --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6c4fce35529ad..1651c75b2f586 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -754,6 +754,7 @@ Sparse - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) - Bug in :class:`SparseSeries.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) +- Bug in :func:`SparseSeries.unique` which returned only the sparse elements and omitted the fill value (:issue:`19651`) Reshaping ^^^^^^^^^ From 77e67542860562e65ce06985eb78d20849794c75 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 12 Feb 2018 10:33:09 +0700 Subject: [PATCH 06/10] Reference issue number in tests --- pandas/tests/test_algos.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 92d715f884d7c..bfa60f843e79f 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ 
-271,6 +271,7 @@ def test_object_refcount_bug(self): @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None]) def test_sparse(self, fill_value): + # GH 19595 arr = SparseArray([0, 1, np.nan, None], fill_value=fill_value) result = algos.unique(arr) From 6f242ee4e0e5764f1a9a412eb7c981a3335c68c5 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 12 Feb 2018 10:33:22 +0700 Subject: [PATCH 07/10] Reference issue number in tests --- pandas/tests/test_algos.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index bfa60f843e79f..bc3eb74502c6d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -279,7 +279,6 @@ def test_sparse(self, fill_value): assert isinstance(result, np.ndarray) assert len(result) == 3 - def test_on_index_object(self): mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile( From 5b39aa2f56246006e766ed4278b63a53aea2e7c4 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Tue, 13 Feb 2018 11:41:57 +0700 Subject: [PATCH 08/10] Just pass in appended unique array --- pandas/_libs/hashtable_class_helper.pxi.in | 37 ++++++++-------------- pandas/core/algorithms.py | 14 ++++---- 2 files changed, 19 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e6c1372ebee22..bca4e388f3279 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -251,36 +251,25 @@ cdef class HashTable: {{py: # name, dtype, null_condition, float_group -dtypes = [('Float64', 'float64', 'val != val', True, 'NAN'), - ('UInt64', 'uint64', 'False', False, '0'), - ('Int64', 'int64', 'val == iNaT', False, 'iNaT')] +dtypes = [('Float64', 'float64', 'val != val', True), + ('UInt64', 'uint64', 'False', False), + ('Int64', 'int64', 'val == iNaT', False)] def get_dispatch(dtypes): - for (name, dtype, null_condition, float_group, na_value) in dtypes: + for (name, dtype, null_condition, float_group) in dtypes: unique_template = """\ cdef: Py_ssize_t i, n = len(values) int ret = 0 - {dtype}_t val, fill_value_val, ngaps_val + {dtype}_t val khiter_t k bint seen_na = 0 {name}Vector uniques = {name}Vector() {name}VectorData *ud ud = uniques.data - fill_value_val = fill_value - ngaps_val = ngaps - - with nogil: - if ngaps_val > 0: - k = kh_get_{dtype}(self.table, fill_value_val) - if k == self.table.n_buckets: - kh_put_{dtype}(self.table, fill_value_val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{dtype}(ud, fill_value_val) + with nogil: for i in range(n): val = values[i] IF {float_group}: @@ -311,11 +300,11 @@ def get_dispatch(dtypes): unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group) - yield (name, dtype, null_condition, float_group, unique_template, na_value) + yield (name, dtype, null_condition, float_group, unique_template) }} -{{for name, dtype, null_condition, float_group, unique_template, na_value in get_dispatch(dtypes)}} +{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}} cdef class {{name}}HashTable(HashTable): @@ -419,13 +408,13 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True, fill_value={{na_value}}, ngaps=0): + bint check_null=True): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels 
Py_ssize_t idx, count = count_prior int ret = 0 - {{dtype}}_t val, fill_value_val, ngaps_val + {{dtype}}_t val khiter_t k {{name}}VectorData *ud @@ -507,10 +496,10 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques @cython.boundscheck(False) - def unique(self, ndarray[{{dtype}}_t, ndim=1] values, fill_value={{na_value}}, ngaps=0): + def unique(self, ndarray[{{dtype}}_t, ndim=1] values): if values.flags.writeable: # If the value is writeable (mutable) then use memview - return self.unique_memview(values, fill_value=fill_value, ngaps=ngaps) + return self.unique_memview(values) # We cannot use the memoryview version on readonly-buffers due to # a limitation of Cython's typed memoryviews. Instead we can use @@ -519,7 +508,7 @@ cdef class {{name}}HashTable(HashTable): {{unique_template}} @cython.boundscheck(False) - def unique_memview(self, {{dtype}}_t[:] values, fill_value={{na_value}}, ngaps=0): + def unique_memview(self, {{dtype}}_t[:] values): {{unique_template}} {{endfor}} diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 03bbe4956d0e7..affaa5304f908 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -365,8 +365,11 @@ def unique(values): table = htable(len(values)) if isinstance(values, ABCSparseArray): - uniques = table.unique(values, fill_value=values.fill_value, - ngaps=values.sp_index.ngaps) + import ipdb; ipdb.set_trace() + to_unique = values.sp_values + if values.sp_index.ngaps > 0: + to_unique = np.append(to_unique, [values.fill_value]) + uniques = table.unique(to_unique) else: uniques = table.unique(values) uniques = _reconstruct_data(uniques, dtype, original) @@ -476,12 +479,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = vec_klass() check_nulls = not is_integer_dtype(original) - if isinstance(values, ABCSparseArray): - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, - fill_value=values.fill_value, - ngaps=values.sp_index.ngaps) - else: - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) labels = _ensure_platform_int(labels) uniques = uniques.to_array() From af7a8040ed966025928d0562ede5355b2883417c Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Tue, 13 Feb 2018 11:42:38 +0700 Subject: [PATCH 09/10] Take out ipdb debug statement --- pandas/core/algorithms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index affaa5304f908..51355db9f7d8a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -365,7 +365,6 @@ def unique(values): table = htable(len(values)) if isinstance(values, ABCSparseArray): - import ipdb; ipdb.set_trace() to_unique = values.sp_values if values.sp_index.ngaps > 0: to_unique = np.append(to_unique, [values.fill_value]) From 4a13a750a5a9ad04352141122dce4b1a0032ffa5 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Tue, 13 Feb 2018 11:44:20 +0700 Subject: [PATCH 10/10] Revert change on groupby --- pandas/core/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index e1bb5835a477c..0363bcd02aa16 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3074,7 +3074,6 @@ def is_in_obj(gpr): # create the Grouping # allow us to passing the actual Grouping as the gpr - ping = Grouping(group_axis, gpr, obj=obj,
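
Net effect of the series: patches 08-10 back out the hashtable-template changes (and the stray ipdb/groupby edits) and settle on a much smaller fix inside pandas.core.algorithms.unique: hash only the stored values of the sparse array, plus one copy of the fill value whenever the sparse index reports gaps. A minimal standalone sketch of that logic, assuming a 0.23-era pandas; the helper name sparse_unique is hypothetical, and pd.unique stands in for the hashtable call the patch actually makes:

import numpy as np
import pandas as pd

def sparse_unique(arr):
    # arr is a pandas SparseArray. Only the materialized values need
    # hashing; if any positions are gaps, the fill value also occurs in
    # the array, so append a single copy of it before deduplicating.
    to_unique = arr.sp_values
    if arr.sp_index.ngaps > 0:
        to_unique = np.append(to_unique, [arr.fill_value])
    return pd.unique(to_unique)

This keeps the Cython templates untouched and sidesteps the question patch 04 ran into of which na_value default suits uint64, where every representable value (including 0) is valid data.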
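
As a quick end-to-end check, the loop below mirrors the parametrized test added in patch 04 (GH 19595), assuming a pandas build with the final patch applied: whichever value serves as the fill, unique must report the three distinct values 0, 1, and NaN (None is coerced to NaN on construction).

import numpy as np
import pandas as pd

for fill_value in [0, 1, np.nan, None]:
    arr = pd.SparseArray([0, 1, np.nan, None], fill_value=fill_value)
    result = pd.unique(arr)
    assert isinstance(result, np.ndarray)
    assert len(result) == 3  # {0, 1, NaN}, regardless of the fill choice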