
Commit 39efbbc

CLN: move unique1d to algorithms from nanops (#14919)
TST: consolidate hashtable testing to test_algos.py
1 parent dc4b070 commit 39efbbc


7 files changed: +70 -66 lines


pandas/core/algorithms.py

+32
@@ -113,6 +113,38 @@ def _unique_generic(values, table_type, type_caster):
     return type_caster(uniques)
 
 
+def unique1d(values):
+    """
+    Hash table-based unique
+    """
+    if np.issubdtype(values.dtype, np.floating):
+        table = htable.Float64HashTable(len(values))
+        uniques = np.array(table.unique(_ensure_float64(values)),
+                           dtype=np.float64)
+    elif np.issubdtype(values.dtype, np.datetime64):
+        table = htable.Int64HashTable(len(values))
+        uniques = table.unique(_ensure_int64(values))
+        uniques = uniques.view('M8[ns]')
+    elif np.issubdtype(values.dtype, np.timedelta64):
+        table = htable.Int64HashTable(len(values))
+        uniques = table.unique(_ensure_int64(values))
+        uniques = uniques.view('m8[ns]')
+    elif np.issubdtype(values.dtype, np.integer):
+        table = htable.Int64HashTable(len(values))
+        uniques = table.unique(_ensure_int64(values))
+    else:
+
+        # its cheaper to use a String Hash Table than Object
+        if lib.infer_dtype(values) in ['string']:
+            table = htable.StringHashTable(len(values))
+        else:
+            table = htable.PyObjectHashTable(len(values))
+
+        uniques = table.unique(_ensure_object(values))
+
+    return uniques
+
+
 def isin(comps, values):
     """
     Compute the isin boolean array
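
A quick sanity check of the relocated helper, as a hedged sketch: unique1d is an internal helper rather than public API, so the import below assumes a pandas checkout that includes this commit, and the commented outputs are indicative rather than verified against this exact revision.

import numpy as np
from pandas.core.algorithms import unique1d

# Integer input goes through Int64HashTable; first-appearance order is kept,
# no sorting (unlike np.unique).
print(unique1d(np.array([3, 1, 3, 2, 1])))   # -> [3 1 2]

# datetime64 input is viewed as int64, hashed, then viewed back as M8[ns].
stamps = np.array(['2016-01-02', '2016-01-01', '2016-01-02'], dtype='M8[ns]')
print(unique1d(stamps))                      # -> the two distinct timestamps, first-appearance order

# Object arrays of strings take the cheaper StringHashTable path via lib.infer_dtype.
print(unique1d(np.array(['b', 'a', 'b'], dtype=object)))   # -> ['b' 'a']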

pandas/core/base.py

+1 -1

@@ -969,7 +969,7 @@ def unique(self):
         if hasattr(values, 'unique'):
             result = values.unique()
         else:
-            from pandas.core.nanops import unique1d
+            from pandas.core.algorithms import unique1d
             result = unique1d(values)
         return result
 
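
For context on the call site re-pointed here: a plain ndarray-backed Series falls through this else branch, so Series.unique ends up using the hash-table helper. A small illustrative sketch (output order reflects first appearance, per the helper's behaviour; not a verbatim capture from this revision):

import pandas as pd

s = pd.Series([3, 1, 3, 2])
print(s.unique())   # -> [3 1 2]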

pandas/core/categorical.py

+1 -2

@@ -25,7 +25,7 @@
                                 is_scalar)
 from pandas.core.common import is_null_slice
 
-from pandas.core.algorithms import factorize, take_1d
+from pandas.core.algorithms import factorize, take_1d, unique1d
 from pandas.core.base import (PandasObject, PandasDelegate,
                               NoNewAttributesMixin, _shared_docs)
 import pandas.core.common as com

@@ -1834,7 +1834,6 @@ def unique(self):
         unique values : ``Categorical``
         """
 
-        from pandas.core.nanops import unique1d
         # unlike np.unique, unique1d does not sort
         unique_codes = unique1d(self.codes)
         cat = self.copy()
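
The comment retained above is the behavioural point worth keeping in mind: unique1d, and therefore Categorical.unique, preserves first-appearance order instead of sorting. A hedged illustration (indicative output for an unordered categorical):

import pandas as pd

cat = pd.Categorical(['b', 'a', 'c', 'a'])
print(cat.unique())   # -> [b, a, c]; categories also follow appearance order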

pandas/core/nanops.py

+1 -28

@@ -9,10 +9,8 @@
 except ImportError:  # pragma: no cover
     _USE_BOTTLENECK = False
 
-import pandas.hashtable as _hash
 from pandas import compat, lib, algos, tslib
-from pandas.types.common import (_ensure_int64, _ensure_object,
-                                 _ensure_float64, _get_dtype,
+from pandas.types.common import (_get_dtype,
                                  is_float, is_scalar,
                                  is_integer, is_complex, is_float_dtype,
                                  is_complex_dtype, is_integer_dtype,

@@ -784,28 +782,3 @@ def f(x, y):
 nanle = make_nancomp(operator.le)
 naneq = make_nancomp(operator.eq)
 nanne = make_nancomp(operator.ne)
-
-
-def unique1d(values):
-    """
-    Hash table-based unique
-    """
-    if np.issubdtype(values.dtype, np.floating):
-        table = _hash.Float64HashTable(len(values))
-        uniques = np.array(table.unique(_ensure_float64(values)),
-                           dtype=np.float64)
-    elif np.issubdtype(values.dtype, np.datetime64):
-        table = _hash.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-        uniques = uniques.view('M8[ns]')
-    elif np.issubdtype(values.dtype, np.timedelta64):
-        table = _hash.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-        uniques = uniques.view('m8[ns]')
-    elif np.issubdtype(values.dtype, np.integer):
-        table = _hash.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-    else:
-        table = _hash.PyObjectHashTable(len(values))
-        uniques = table.unique(_ensure_object(values))
-    return uniques

pandas/tests/test_algos.py

+33 -22

@@ -277,28 +277,6 @@ def test_factorize_nan(self):
         self.assertTrue(
             np.array_equal(pd.isnull(key), expected == na_sentinel))
 
-    def test_vector_resize(self):
-        # Test for memory errors after internal vector
-        # reallocations (pull request #7157)
-
-        def _test_vector_resize(htable, uniques, dtype, nvals):
-            vals = np.array(np.random.randn(1000), dtype=dtype)
-            # get_labels appends to the vector
-            htable.get_labels(vals[:nvals], uniques, 0, -1)
-            # to_array resizes the vector
-            uniques.to_array()
-            htable.get_labels(vals, uniques, 0, -1)
-
-        test_cases = [
-            (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
-            (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
-            (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
-
-        for (tbl, vect, dtype) in test_cases:
-            # resizing to empty is a special case
-            _test_vector_resize(tbl(), vect(), dtype, 0)
-            _test_vector_resize(tbl(), vect(), dtype, 10)
-
     def test_complex_sorting(self):
         # gh 12666 - check no segfault
         # Test not valid numpy versions older than 1.11

@@ -912,6 +890,39 @@ class TestGroupVarFloat32(tm.TestCase, GroupVarTestMixin):
     rtol = 1e-2
 
 
+class TestHashTable(tm.TestCase):
+
+    def test_lookup_nan(self):
+        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
+        m = hashtable.Float64HashTable()
+        m.map_locations(xs)
+        self.assert_numpy_array_equal(m.lookup(xs),
+                                      np.arange(len(xs), dtype=np.int64))
+
+    def test_vector_resize(self):
+        # Test for memory errors after internal vector
+        # reallocations (pull request #7157)
+
+        def _test_vector_resize(htable, uniques, dtype, nvals):
+            vals = np.array(np.random.randn(1000), dtype=dtype)
+            # get_labels appends to the vector
+            htable.get_labels(vals[:nvals], uniques, 0, -1)
+            # to_array resizes the vector
+            uniques.to_array()
+            htable.get_labels(vals, uniques, 0, -1)
+
+        test_cases = [
+            (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
+            (hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
+            (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
+            (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
+
+        for (tbl, vect, dtype) in test_cases:
+            # resizing to empty is a special case
+            _test_vector_resize(tbl(), vect(), dtype, 0)
+            _test_vector_resize(tbl(), vect(), dtype, 10)
+
+
 def test_quantile():
     s = Series(np.random.randn(100))
 
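
The only functional addition to the relocated test is the StringHashTable row. A standalone sketch of what that row exercises (the pandas.hashtable module path matches the pre-0.20 layout used at this commit; printed values are indicative):

import numpy as np
import pandas.hashtable as hashtable

table = hashtable.StringHashTable()
uniques = hashtable.ObjectVector()
vals = np.array(['a', 'b', 'a', 'c'], dtype=object)

# get_labels appends the distinct values to the uniques vector and returns codes
labels = table.get_labels(vals, uniques, 0, -1)
print(labels)              # -> [0 1 0 2]
print(uniques.to_array())  # -> ['a' 'b' 'c']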

pandas/tests/test_base.py

-11
@@ -1051,17 +1051,6 @@ def test_searchsorted(self):
             self.assertTrue(0 <= index <= len(o))
 
 
-class TestFloat64HashTable(tm.TestCase):
-
-    def test_lookup_nan(self):
-        from pandas.hashtable import Float64HashTable
-        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
-        m = Float64HashTable()
-        m.map_locations(xs)
-        self.assert_numpy_array_equal(m.lookup(xs),
-                                      np.arange(len(xs), dtype=np.int64))
-
-
 class TestTranspose(Ops):
     errmsg = "the 'axes' parameter is not supported"
 

pandas/tseries/util.py

+2 -2

@@ -4,7 +4,7 @@
 import numpy as np
 from pandas.types.common import _ensure_platform_int
 from pandas.core.frame import DataFrame
-import pandas.core.nanops as nanops
+import pandas.core.algorithms as algorithms
 
 
 def pivot_annual(series, freq=None):

@@ -45,7 +45,7 @@ def pivot_annual(series, freq=None):
 
     index = series.index
     year = index.year
-    years = nanops.unique1d(year)
+    years = algorithms.unique1d(year)
 
     if freq is not None:
         freq = freq.upper()
