From b4a635995302cebfb916e05afb4dc78ffe67a4ea Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Thu, 8 Feb 2018 15:51:44 +0700 Subject: [PATCH 01/10] First pass at having unique work for sparse array --- pandas/_libs/hashtable_class_helper.pxi.in | 24 +++++++++++++++++----- pandas/core/algorithms.py | 10 +++++++-- pandas/core/groupby.py | 9 ++++++++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bca4e388f3279..98fae408ae738 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -261,15 +261,29 @@ def get_dispatch(dtypes): cdef: Py_ssize_t i, n = len(values) int ret = 0 - {dtype}_t val + {dtype}_t val, fill_value_val, ngaps_val khiter_t k bint seen_na = 0 {name}Vector uniques = {name}Vector() {name}VectorData *ud ud = uniques.data - + fill_value_val = fill_value + ngaps_val = ngaps + with nogil: + # If this is a sparse structure we need to append + # The fill value as well assuming the ngaps are greater than 0 + + if ngaps_val > 0: + k = kh_get_{dtype}(self.table, fill_value_val) + if k == self.table.n_buckets: + kh_put_{dtype}(self.table, fill_value_val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{dtype}(ud, fill_value_val) + for i in range(n): val = values[i] IF {float_group}: @@ -496,10 +510,10 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques @cython.boundscheck(False) - def unique(self, ndarray[{{dtype}}_t, ndim=1] values): + def unique(self, ndarray[{{dtype}}_t, ndim=1] values, fill_value=np.nan, ngaps=0): if values.flags.writeable: # If the value is writeable (mutable) then use memview - return self.unique_memview(values) + return self.unique_memview(values, fill_value=fill_value, ngaps=ngaps) # We cannot use the memoryview version on readonly-buffers due to # a limitation of Cython's typed memoryviews. 
Instead we can use @@ -508,7 +522,7 @@ cdef class {{name}}HashTable(HashTable): {{unique_template}} @cython.boundscheck(False) - def unique_memview(self, {{dtype}}_t[:] values): + def unique_memview(self, {{dtype}}_t[:] values, fill_value=np.nan, ngaps=0): {{unique_template}} {{endfor}} diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c754c063fce8e..ebdec42393479 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -10,7 +10,8 @@ maybe_promote, construct_1d_object_array_from_listlike) from pandas.core.dtypes.generic import ( ABCSeries, ABCIndex, - ABCIndexClass, ABCCategorical) + ABCIndexClass, ABCCategorical, + ABCSparseArray) from pandas.core.dtypes.common import ( is_unsigned_integer_dtype, is_signed_integer_dtype, is_integer_dtype, is_complex_dtype, @@ -362,7 +363,11 @@ def unique(values): htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - uniques = table.unique(values) + import ipdb; ipdb.set_trace() + if isinstance(values, ABCSparseArray): + uniques = table.unique(values, fill_value=values.fill_value, ngaps=values.sp_index.ngaps) + else: + uniques = table.unique(values) uniques = _reconstruct_data(uniques, dtype, original) if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype): @@ -461,6 +466,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): PeriodIndex """ + import ipdb; ipdb.set_trace() values = _ensure_arraylike(values) original = values values, dtype, _ = _ensure_data(values) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 01241db7c0c42..fa11850bef89d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1035,6 +1035,7 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, return self._wrap_aggregated_output(output, names) def _python_agg_general(self, func, *args, **kwargs): + import ipdb; ipdb.set_trace() func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) @@ -1224,6 +1225,7 @@ def count(self): @Substitution(name='groupby') @Appender(_doc_template) def mean(self, *args, **kwargs): + import ipdb; ipdb.set_trace() """ Compute mean of groups, excluding missing values @@ -1948,6 +1950,7 @@ class BaseGrouper(object): def __init__(self, axis, groupings, sort=True, group_keys=True, mutated=False, indexer=None): + import ipdb; ipdb.set_trace() self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis self.groupings = groupings @@ -2097,6 +2100,7 @@ def is_monotonic(self): @cache_readonly def group_info(self): + import ipdb; ipdb.set_trace() comp_ids, obs_group_ids = self._get_compressed_labels() ngroups = len(obs_group_ids) @@ -2607,6 +2611,7 @@ def indices(self): @cache_readonly def group_info(self): + import ipdb; ipdb.set_trace() ngroups = self.ngroups obs_group_ids = np.arange(ngroups) rep = np.diff(np.r_[0, self.bins]) @@ -2684,6 +2689,7 @@ class Grouping(object): def __init__(self, index, grouper=None, obj=None, name=None, level=None, sort=True, in_axis=False): + import ipdb; ipdb.set_trace() self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) @@ -2816,6 +2822,7 @@ def group_index(self): return self._group_index def _make_labels(self): + import ipdb; ipdb.set_trace() if self._labels is None or self._group_index is None: # we have a list of groupers if isinstance(self.grouper, BaseGrouper): @@ -2993,6 +3000,7 @@ def is_in_obj(gpr): for i, (gpr, level) in enumerate(zip(keys, levels)): + import ipdb; ipdb.set_trace() if 
is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name exclusions.append(name) @@ -3024,6 +3032,7 @@ def is_in_obj(gpr): # create the Grouping # allow us to passing the actual Grouping as the gpr + ping = Grouping(group_axis, gpr, obj=obj, From 90a0b3c52482e24887973e58eb510e027ff6e017 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Thu, 8 Feb 2018 16:22:30 +0700 Subject: [PATCH 02/10] First pass at fixing group by sparse data frames --- pandas/_libs/hashtable_class_helper.pxi.in | 25 +++++++++++++--------- pandas/core/algorithms.py | 14 ++++++++---- pandas/core/groupby.py | 8 ------- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 98fae408ae738..a959b1514580c 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -251,12 +251,12 @@ cdef class HashTable: {{py: # name, dtype, null_condition, float_group -dtypes = [('Float64', 'float64', 'val != val', True), - ('UInt64', 'uint64', 'False', False), - ('Int64', 'int64', 'val == iNaT', False)] +dtypes = [('Float64', 'float64', 'val != val', True, 'NAN'), + ('UInt64', 'uint64', 'False', False, 'NAN'), + ('Int64', 'int64', 'val == iNaT', False, 'iNaT')] def get_dispatch(dtypes): - for (name, dtype, null_condition, float_group) in dtypes: + for (name, dtype, null_condition, float_group, na_value) in dtypes: unique_template = """\ cdef: Py_ssize_t i, n = len(values) @@ -314,11 +314,11 @@ def get_dispatch(dtypes): unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group) - yield (name, dtype, null_condition, float_group, unique_template) + yield (name, dtype, null_condition, float_group, unique_template, na_value) }} -{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}} +{{for name, dtype, null_condition, float_group, unique_template, na_value in get_dispatch(dtypes)}} cdef class {{name}}HashTable(HashTable): @@ -419,22 +419,27 @@ cdef class {{name}}HashTable(HashTable): labels = self.get_labels(values, uniques, 0, 0) return uniques.to_array(), labels + # This seems like duplicate code from def uniques to me... + # Why does this exist? 
@cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): + bint check_null=True, fill_value={{na_value}}, ngaps=0): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 - {{dtype}}_t val + {{dtype}}_t val, fill_value_val, ngaps_val khiter_t k {{name}}VectorData *ud labels = np.empty(n, dtype=np.int64) ud = uniques.data + if ngaps > 0: + print("Hello world") + with nogil: for i in range(n): val = values[i] @@ -510,7 +515,7 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques @cython.boundscheck(False) - def unique(self, ndarray[{{dtype}}_t, ndim=1] values, fill_value=np.nan, ngaps=0): + def unique(self, ndarray[{{dtype}}_t, ndim=1] values, fill_value={{na_value}}, ngaps=0): if values.flags.writeable: # If the value is writeable (mutable) then use memview return self.unique_memview(values, fill_value=fill_value, ngaps=ngaps) @@ -522,7 +527,7 @@ cdef class {{name}}HashTable(HashTable): {{unique_template}} @cython.boundscheck(False) - def unique_memview(self, {{dtype}}_t[:] values, fill_value=np.nan, ngaps=0): + def unique_memview(self, {{dtype}}_t[:] values, fill_value={{na_value}}, ngaps=0): {{unique_template}} {{endfor}} diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ebdec42393479..03bbe4956d0e7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -363,9 +363,10 @@ def unique(values): htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - import ipdb; ipdb.set_trace() + if isinstance(values, ABCSparseArray): - uniques = table.unique(values, fill_value=values.fill_value, ngaps=values.sp_index.ngaps) + uniques = table.unique(values, fill_value=values.fill_value, + ngaps=values.sp_index.ngaps) else: uniques = table.unique(values) uniques = _reconstruct_data(uniques, dtype, original) @@ -466,7 +467,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): PeriodIndex """ - import ipdb; ipdb.set_trace() values = _ensure_arraylike(values) original = values values, dtype, _ = _ensure_data(values) @@ -475,7 +475,13 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): table = hash_klass(size_hint or len(values)) uniques = vec_klass() check_nulls = not is_integer_dtype(original) - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) + + if isinstance(values, ABCSparseArray): + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, + fill_value=values.fill_value, + ngaps=values.sp_index.ngaps) + else: + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) labels = _ensure_platform_int(labels) uniques = uniques.to_array() diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index fa11850bef89d..a21caa972694b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1035,7 +1035,6 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, return self._wrap_aggregated_output(output, names) def _python_agg_general(self, func, *args, **kwargs): - import ipdb; ipdb.set_trace() func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) @@ -1225,7 +1224,6 @@ def count(self): @Substitution(name='groupby') @Appender(_doc_template) def mean(self, *args, **kwargs): - import ipdb; ipdb.set_trace() """ Compute mean of groups, excluding missing values @@ -1950,7 +1948,6 @@ class 
BaseGrouper(object): def __init__(self, axis, groupings, sort=True, group_keys=True, mutated=False, indexer=None): - import ipdb; ipdb.set_trace() self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis self.groupings = groupings @@ -2100,7 +2097,6 @@ def is_monotonic(self): @cache_readonly def group_info(self): - import ipdb; ipdb.set_trace() comp_ids, obs_group_ids = self._get_compressed_labels() ngroups = len(obs_group_ids) @@ -2611,7 +2607,6 @@ def indices(self): @cache_readonly def group_info(self): - import ipdb; ipdb.set_trace() ngroups = self.ngroups obs_group_ids = np.arange(ngroups) rep = np.diff(np.r_[0, self.bins]) @@ -2689,7 +2684,6 @@ class Grouping(object): def __init__(self, index, grouper=None, obj=None, name=None, level=None, sort=True, in_axis=False): - import ipdb; ipdb.set_trace() self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) @@ -2822,7 +2816,6 @@ def group_index(self): return self._group_index def _make_labels(self): - import ipdb; ipdb.set_trace() if self._labels is None or self._group_index is None: # we have a list of groupers if isinstance(self.grouper, BaseGrouper): @@ -3000,7 +2993,6 @@ def is_in_obj(gpr): for i, (gpr, level) in enumerate(zip(keys, levels)): - import ipdb; ipdb.set_trace() if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name exclusions.append(name) From 911c265e0e50f1bcf6019d937e8319d0d1014832 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 12 Feb 2018 09:58:41 +0700 Subject: [PATCH 03/10] Cleanup hashtable_class_helper.pxi.in --- pandas/_libs/hashtable_class_helper.pxi.in | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a959b1514580c..636fdd39b708b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -272,9 +272,6 @@ def get_dispatch(dtypes): ngaps_val = ngaps with nogil: - # If this is a sparse structure we need to append - # The fill value as well assuming the ngaps are greater than 0 - if ngaps_val > 0: k = kh_get_{dtype}(self.table, fill_value_val) if k == self.table.n_buckets: @@ -419,8 +416,6 @@ cdef class {{name}}HashTable(HashTable): labels = self.get_labels(values, uniques, 0, 0) return uniques.to_array(), labels - # This seems like duplicate code from def uniques to me... - # Why does this exist? 
@cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, @@ -437,9 +432,6 @@ cdef class {{name}}HashTable(HashTable): labels = np.empty(n, dtype=np.int64) ud = uniques.data - if ngaps > 0: - print("Hello world") - with nogil: for i in range(n): val = values[i] From d9d643b76f2d540d123e1003a0a0d1b4dc1e1f62 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 12 Feb 2018 10:17:48 +0700 Subject: [PATCH 04/10] uint's NA value is 0, so set it as such --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- pandas/tests/test_algos.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 636fdd39b708b..e6c1372ebee22 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -252,7 +252,7 @@ cdef class HashTable: # name, dtype, null_condition, float_group dtypes = [('Float64', 'float64', 'val != val', True, 'NAN'), - ('UInt64', 'uint64', 'False', False, 'NAN'), + ('UInt64', 'uint64', 'False', False, '0'), ('Int64', 'int64', 'val == iNaT', False, 'iNaT')] def get_dispatch(dtypes): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b1e3177547ac6..92d715f884d7c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -8,7 +8,8 @@ from datetime import datetime from itertools import permutations from pandas import (Series, Categorical, CategoricalIndex, - Timestamp, DatetimeIndex, Index, IntervalIndex) + Timestamp, DatetimeIndex, Index, IntervalIndex, + SparseArray) import pandas as pd from pandas import compat @@ -268,6 +269,16 @@ def test_object_refcount_bug(self): for i in range(1000): len(algos.unique(lst)) + @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None]) + def test_sparse(self, fill_value): + arr = SparseArray([0, 1, np.nan, None], fill_value=fill_value) + + result = algos.unique(arr) + + assert isinstance(result, np.ndarray) + assert len(result) == 3 + + def test_on_index_object(self): mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile( From 6666cd6a80f01d6fc2569daa149773db66c8c99b Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 12 Feb 2018 10:19:53 +0700 Subject: [PATCH 05/10] Add whatsnew entry --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6c4fce35529ad..1651c75b2f586 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -754,6 +754,7 @@ Sparse - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) - Bug in :class:`SparseSeries.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) +- Bug in :func:`SparseSeries.unique` which returned only the sparse elements and omitted the fill value (:issue:`19651`) Reshaping ^^^^^^^^^ From 77e67542860562e65ce06985eb78d20849794c75 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 12 Feb 2018 10:33:09 +0700 Subject: [PATCH 06/10] Reference issue number in tests --- pandas/tests/test_algos.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 92d715f884d7c..bfa60f843e79f 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ 
-271,6 +271,7 @@ def test_object_refcount_bug(self): @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None]) def test_sparse(self, fill_value): + # GH 19595 arr = SparseArray([0, 1, np.nan, None], fill_value=fill_value) result = algos.unique(arr) From 6f242ee4e0e5764f1a9a412eb7c981a3335c68c5 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 12 Feb 2018 10:33:22 +0700 Subject: [PATCH 07/10] Reference issue number in tests --- pandas/tests/test_algos.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index bfa60f843e79f..bc3eb74502c6d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -279,7 +279,6 @@ def test_sparse(self, fill_value): assert isinstance(result, np.ndarray) assert len(result) == 3 - def test_on_index_object(self): mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile( From 5b39aa2f56246006e766ed4278b63a53aea2e7c4 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Tue, 13 Feb 2018 11:41:57 +0700 Subject: [PATCH 08/10] Just pass in appended unique array --- pandas/_libs/hashtable_class_helper.pxi.in | 37 ++++++++-------------- pandas/core/algorithms.py | 14 ++++---- 2 files changed, 19 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e6c1372ebee22..bca4e388f3279 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -251,36 +251,25 @@ cdef class HashTable: {{py: # name, dtype, null_condition, float_group -dtypes = [('Float64', 'float64', 'val != val', True, 'NAN'), - ('UInt64', 'uint64', 'False', False, '0'), - ('Int64', 'int64', 'val == iNaT', False, 'iNaT')] +dtypes = [('Float64', 'float64', 'val != val', True), + ('UInt64', 'uint64', 'False', False), + ('Int64', 'int64', 'val == iNaT', False)] def get_dispatch(dtypes): - for (name, dtype, null_condition, float_group, na_value) in dtypes: + for (name, dtype, null_condition, float_group) in dtypes: unique_template = """\ cdef: Py_ssize_t i, n = len(values) int ret = 0 - {dtype}_t val, fill_value_val, ngaps_val + {dtype}_t val khiter_t k bint seen_na = 0 {name}Vector uniques = {name}Vector() {name}VectorData *ud ud = uniques.data - fill_value_val = fill_value - ngaps_val = ngaps - - with nogil: - if ngaps_val > 0: - k = kh_get_{dtype}(self.table, fill_value_val) - if k == self.table.n_buckets: - kh_put_{dtype}(self.table, fill_value_val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{dtype}(ud, fill_value_val) + with nogil: for i in range(n): val = values[i] IF {float_group}: @@ -311,11 +300,11 @@ def get_dispatch(dtypes): unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group) - yield (name, dtype, null_condition, float_group, unique_template, na_value) + yield (name, dtype, null_condition, float_group, unique_template) }} -{{for name, dtype, null_condition, float_group, unique_template, na_value in get_dispatch(dtypes)}} +{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}} cdef class {{name}}HashTable(HashTable): @@ -419,13 +408,13 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True, fill_value={{na_value}}, ngaps=0): + bint check_null=True): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels 
Py_ssize_t idx, count = count_prior int ret = 0 - {{dtype}}_t val, fill_value_val, ngaps_val + {{dtype}}_t val khiter_t k {{name}}VectorData *ud @@ -507,10 +496,10 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques @cython.boundscheck(False) - def unique(self, ndarray[{{dtype}}_t, ndim=1] values, fill_value={{na_value}}, ngaps=0): + def unique(self, ndarray[{{dtype}}_t, ndim=1] values): if values.flags.writeable: # If the value is writeable (mutable) then use memview - return self.unique_memview(values, fill_value=fill_value, ngaps=ngaps) + return self.unique_memview(values) # We cannot use the memoryview version on readonly-buffers due to # a limitation of Cython's typed memoryviews. Instead we can use @@ -519,7 +508,7 @@ cdef class {{name}}HashTable(HashTable): {{unique_template}} @cython.boundscheck(False) - def unique_memview(self, {{dtype}}_t[:] values, fill_value={{na_value}}, ngaps=0): + def unique_memview(self, {{dtype}}_t[:] values): {{unique_template}} {{endfor}} diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 03bbe4956d0e7..affaa5304f908 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -365,8 +365,11 @@ def unique(values): table = htable(len(values)) if isinstance(values, ABCSparseArray): - uniques = table.unique(values, fill_value=values.fill_value, - ngaps=values.sp_index.ngaps) + import ipdb; ipdb.set_trace() + to_unique = values.sp_values + if values.sp_index.ngaps > 0: + to_unique = np.append(to_unique, [values.fill_value]) + uniques = table.unique(to_unique) else: uniques = table.unique(values) uniques = _reconstruct_data(uniques, dtype, original) @@ -476,12 +479,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = vec_klass() check_nulls = not is_integer_dtype(original) - if isinstance(values, ABCSparseArray): - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, - fill_value=values.fill_value, - ngaps=values.sp_index.ngaps) - else: - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) labels = _ensure_platform_int(labels) uniques = uniques.to_array() From af7a8040ed966025928d0562ede5355b2883417c Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Tue, 13 Feb 2018 11:42:38 +0700 Subject: [PATCH 09/10] Take out ipdb debug statement --- pandas/core/algorithms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index affaa5304f908..51355db9f7d8a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -365,7 +365,6 @@ def unique(values): table = htable(len(values)) if isinstance(values, ABCSparseArray): - import ipdb; ipdb.set_trace() to_unique = values.sp_values if values.sp_index.ngaps > 0: to_unique = np.append(to_unique, [values.fill_value]) From 4a13a750a5a9ad04352141122dce4b1a0032ffa5 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Tue, 13 Feb 2018 11:44:20 +0700 Subject: [PATCH 10/10] Revert change on groupby --- pandas/core/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index e1bb5835a477c..0363bcd02aa16 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3074,7 +3074,6 @@ def is_in_obj(gpr): # create the Grouping # allow us to passing the actual Grouping as the gpr - ping = Grouping(group_axis, gpr, obj=obj,
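
Net effect of the series: patches 08-10 back out the hashtable-template changes (and the stray ipdb/groupby edits) and settle on a much smaller fix inside pandas.core.algorithms.unique: hash only the stored values of the sparse array, plus one copy of the fill value whenever the sparse index reports gaps. A minimal standalone sketch of that logic, assuming a 0.23-era pandas; the helper name sparse_unique is hypothetical, and pd.unique stands in for the hashtable call the patch actually makes:

import numpy as np
import pandas as pd

def sparse_unique(arr):
    # arr is a pandas SparseArray. Only the materialized values need
    # hashing; if any positions are gaps, the fill value also occurs in
    # the array, so append a single copy of it before deduplicating.
    to_unique = arr.sp_values
    if arr.sp_index.ngaps > 0:
        to_unique = np.append(to_unique, [arr.fill_value])
    return pd.unique(to_unique)

This keeps the Cython templates untouched and sidesteps the question patch 04 ran into of which na_value default suits uint64, where every representable value (including 0) is valid data.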
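
As a quick end-to-end check, the loop below mirrors the parametrized test added in patch 04 (GH 19595), assuming a pandas build with the final patch applied: whichever value serves as the fill, unique must report the three distinct values 0, 1, and NaN (None is coerced to NaN on construction).

import numpy as np
import pandas as pd

for fill_value in [0, 1, np.nan, None]:
    arr = pd.SparseArray([0, 1, np.nan, None], fill_value=fill_value)
    result = pd.unique(arr)
    assert isinstance(result, np.ndarray)
    assert len(result) == 3  # {0, 1, NaN}, regardless of the fill choice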