From 0be73701f7e4fc8ad198abd9187ec32a7606e157 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 19 Dec 2016 06:16:39 -0500
Subject: [PATCH] CLN: move unique1d to algorithms from nanops

TST: consolidate hashtable testing to test_algos.py
---
 pandas/core/algorithms.py  | 32 ++++++++++++++++++++++
 pandas/core/base.py        |  2 +-
 pandas/core/categorical.py |  3 +--
 pandas/core/nanops.py      | 29 +-------------------
 pandas/tests/test_algos.py | 55 +++++++++++++++++++++++---------------
 pandas/tests/test_base.py  | 11 --------
 pandas/tseries/util.py     |  4 +--
 7 files changed, 70 insertions(+), 66 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index b2702ea0acca7..6bcd3776867b6 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -113,6 +113,38 @@ def _unique_generic(values, table_type, type_caster):
     return type_caster(uniques)
 
 
+def unique1d(values):
+    """
+    Hash table-based unique
+    """
+    if np.issubdtype(values.dtype, np.floating):
+        table = htable.Float64HashTable(len(values))
+        uniques = np.array(table.unique(_ensure_float64(values)),
+                           dtype=np.float64)
+    elif np.issubdtype(values.dtype, np.datetime64):
+        table = htable.Int64HashTable(len(values))
+        uniques = table.unique(_ensure_int64(values))
+        uniques = uniques.view('M8[ns]')
+    elif np.issubdtype(values.dtype, np.timedelta64):
+        table = htable.Int64HashTable(len(values))
+        uniques = table.unique(_ensure_int64(values))
+        uniques = uniques.view('m8[ns]')
+    elif np.issubdtype(values.dtype, np.integer):
+        table = htable.Int64HashTable(len(values))
+        uniques = table.unique(_ensure_int64(values))
+    else:
+
+        # it's cheaper to use a StringHashTable than a PyObjectHashTable
+        if lib.infer_dtype(values) in ['string']:
+            table = htable.StringHashTable(len(values))
+        else:
+            table = htable.PyObjectHashTable(len(values))
+
+        uniques = table.unique(_ensure_object(values))
+
+    return uniques
+
+
 def isin(comps, values):
     """
     Compute the isin boolean array
diff --git a/pandas/core/base.py b/pandas/core/base.py
index d412349447794..a0365ce484a5a 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -969,7 +969,7 @@ def unique(self):
         if hasattr(values, 'unique'):
             result = values.unique()
         else:
-            from pandas.core.nanops import unique1d
+            from pandas.core.algorithms import unique1d
             result = unique1d(values)
         return result
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 5124dc44e2fc8..7f2e6093d0f4c 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -25,7 +25,7 @@
     is_scalar)
 from pandas.core.common import is_null_slice
 
-from pandas.core.algorithms import factorize, take_1d
+from pandas.core.algorithms import factorize, take_1d, unique1d
 from pandas.core.base import (PandasObject, PandasDelegate,
                               NoNewAttributesMixin, _shared_docs)
 import pandas.core.common as com
@@ -1834,7 +1834,6 @@ def unique(self):
 
         unique values : ``Categorical``
         """
-        from pandas.core.nanops import unique1d
         # unlike np.unique, unique1d does not sort
         unique_codes = unique1d(self.codes)
         cat = self.copy()
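
The relocated unique1d keeps first-seen order (unlike np.unique, which sorts)
and picks a hash table by dtype, with object arrays that lib.infer_dtype
recognizes as strings routed to the cheaper StringHashTable. A rough sketch of
the dispatch, assuming this branch is checked out (unique1d is internal, not
public API):

    import numpy as np
    from pandas.core.algorithms import unique1d

    unique1d(np.array([3, 1, 3, 2], dtype=np.int64))   # Int64HashTable -> [3 1 2]
    unique1d(np.array([2.5, np.nan, 2.5]))             # Float64HashTable
    unique1d(np.array(['b', 'a', 'b'], dtype=object))  # StringHashTable (inferred strings)
    unique1d(np.array(['2016-01-01', '2016-01-02'],
                      dtype='M8[ns]'))                 # Int64HashTable, viewed back as M8[ns]
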
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index a76e348b7dee2..1f76bc850cee9 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -9,10 +9,8 @@
 except ImportError:  # pragma: no cover
     _USE_BOTTLENECK = False
 
-import pandas.hashtable as _hash
 from pandas import compat, lib, algos, tslib
-from pandas.types.common import (_ensure_int64, _ensure_object,
-                                 _ensure_float64, _get_dtype,
+from pandas.types.common import (_get_dtype,
                                  is_float, is_scalar,
                                  is_integer, is_complex, is_float_dtype,
                                  is_complex_dtype, is_integer_dtype,
@@ -784,28 +782,3 @@ def f(x, y):
 nanle = make_nancomp(operator.le)
 naneq = make_nancomp(operator.eq)
 nanne = make_nancomp(operator.ne)
-
-
-def unique1d(values):
-    """
-    Hash table-based unique
-    """
-    if np.issubdtype(values.dtype, np.floating):
-        table = _hash.Float64HashTable(len(values))
-        uniques = np.array(table.unique(_ensure_float64(values)),
-                           dtype=np.float64)
-    elif np.issubdtype(values.dtype, np.datetime64):
-        table = _hash.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-        uniques = uniques.view('M8[ns]')
-    elif np.issubdtype(values.dtype, np.timedelta64):
-        table = _hash.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-        uniques = uniques.view('m8[ns]')
-    elif np.issubdtype(values.dtype, np.integer):
-        table = _hash.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-    else:
-        table = _hash.PyObjectHashTable(len(values))
-        uniques = table.unique(_ensure_object(values))
-    return uniques
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index d0c909b9c1b30..92a9184ad30fc 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -277,28 +277,6 @@ def test_factorize_nan(self):
         self.assertTrue(
             np.array_equal(pd.isnull(key), expected == na_sentinel))
 
-    def test_vector_resize(self):
-        # Test for memory errors after internal vector
-        # reallocations (pull request #7157)
-
-        def _test_vector_resize(htable, uniques, dtype, nvals):
-            vals = np.array(np.random.randn(1000), dtype=dtype)
-            # get_labels appends to the vector
-            htable.get_labels(vals[:nvals], uniques, 0, -1)
-            # to_array resizes the vector
-            uniques.to_array()
-            htable.get_labels(vals, uniques, 0, -1)
-
-        test_cases = [
-            (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
-            (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
-            (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
-
-        for (tbl, vect, dtype) in test_cases:
-            # resizing to empty is a special case
-            _test_vector_resize(tbl(), vect(), dtype, 0)
-            _test_vector_resize(tbl(), vect(), dtype, 10)
-
     def test_complex_sorting(self):
         # gh 12666 - check no segfault
         # Test not valid numpy versions older than 1.11
@@ -912,6 +890,39 @@ class TestGroupVarFloat32(tm.TestCase, GroupVarTestMixin):
     rtol = 1e-2
 
 
+class TestHashTable(tm.TestCase):
+
+    def test_lookup_nan(self):
+        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
+        m = hashtable.Float64HashTable()
+        m.map_locations(xs)
+        self.assert_numpy_array_equal(m.lookup(xs),
+                                      np.arange(len(xs), dtype=np.int64))
+
+    def test_vector_resize(self):
+        # Test for memory errors after internal vector
+        # reallocations (pull request #7157)
+
+        def _test_vector_resize(htable, uniques, dtype, nvals):
+            vals = np.array(np.random.randn(1000), dtype=dtype)
+            # get_labels appends to the vector
+            htable.get_labels(vals[:nvals], uniques, 0, -1)
+            # to_array resizes the vector
+            uniques.to_array()
+            htable.get_labels(vals, uniques, 0, -1)
+
+        test_cases = [
+            (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
+            (hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
+            (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
+            (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
+
+        for (tbl, vect, dtype) in test_cases:
+            # resizing to empty is a special case
+            _test_vector_resize(tbl(), vect(), dtype, 0)
+            _test_vector_resize(tbl(), vect(), dtype, 10)
+
+
 def test_quantile():
     s = Series(np.random.randn(100))
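
The moved test_vector_resize guards against memory errors when get_labels
appends to a vector whose buffer to_array() has already exported (pull
request #7157); the consolidated version also covers the StringHashTable path
that unique1d now uses. A standalone sketch of the sequence being exercised,
assuming the private pandas.hashtable module at this revision:

    import numpy as np
    import pandas.hashtable as hashtable

    tbl, uniques = hashtable.Int64HashTable(), hashtable.Int64Vector()
    vals = np.arange(1000, dtype=np.int64)
    tbl.get_labels(vals[:10], uniques, 0, -1)  # appends 10 uniques to the vector
    arr = uniques.to_array()                   # exports the buffer as an ndarray
    tbl.get_labels(vals, uniques, 0, -1)       # must reallocate, not write into arr
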
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index a5cd0bbc28369..717eae3e59715 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -1051,17 +1051,6 @@ def test_searchsorted(self):
             self.assertTrue(0 <= index <= len(o))
 
 
-class TestFloat64HashTable(tm.TestCase):
-
-    def test_lookup_nan(self):
-        from pandas.hashtable import Float64HashTable
-        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
-        m = Float64HashTable()
-        m.map_locations(xs)
-        self.assert_numpy_array_equal(m.lookup(xs),
-                                      np.arange(len(xs), dtype=np.int64))
-
-
 class TestTranspose(Ops):
     errmsg = "the 'axes' parameter is not supported"
 
diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py
index 59daa8d7780b4..dc460dee8415b 100644
--- a/pandas/tseries/util.py
+++ b/pandas/tseries/util.py
@@ -4,7 +4,7 @@
 import numpy as np
 from pandas.types.common import _ensure_platform_int
 from pandas.core.frame import DataFrame
-import pandas.core.nanops as nanops
+import pandas.core.algorithms as algorithms
 
 
 def pivot_annual(series, freq=None):
@@ -45,7 +45,7 @@ def pivot_annual(series, freq=None):
 
     index = series.index
     year = index.year
-    years = nanops.unique1d(year)
+    years = algorithms.unique1d(year)
 
     if freq is not None:
         freq = freq.upper()
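
After this patch, pivot_annual reaches unique1d through pandas.core.algorithms
rather than pandas.core.nanops. A small sketch of the rewired call path,
assuming this branch (unique1d stays internal, so applications should prefer
pd.unique/Series.unique):

    import numpy as np
    import pandas as pd
    import pandas.core.algorithms as algorithms

    idx = pd.date_range('2014-01-01', periods=800, freq='D')
    years = algorithms.unique1d(np.asarray(idx.year))  # first-seen order, not sorted
    assert set(years) == set(np.unique(np.asarray(idx.year)))
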