
Commit 39efbbc

CLN: move unique1d to algorithms from nanops (#14919)
TST: consolidate hashtable testing to test_algos.py
1 parent dc4b070 commit 39efbbc


7 files changed: +70 -66 lines


pandas/core/algorithms.py

+32
@@ -113,6 +113,38 @@ def _unique_generic(values, table_type, type_caster):
     return type_caster(uniques)
 
 
+def unique1d(values):
+    """
+    Hash table-based unique
+    """
+    if np.issubdtype(values.dtype, np.floating):
+        table = htable.Float64HashTable(len(values))
+        uniques = np.array(table.unique(_ensure_float64(values)),
+                           dtype=np.float64)
+    elif np.issubdtype(values.dtype, np.datetime64):
+        table = htable.Int64HashTable(len(values))
+        uniques = table.unique(_ensure_int64(values))
+        uniques = uniques.view('M8[ns]')
+    elif np.issubdtype(values.dtype, np.timedelta64):
+        table = htable.Int64HashTable(len(values))
+        uniques = table.unique(_ensure_int64(values))
+        uniques = uniques.view('m8[ns]')
+    elif np.issubdtype(values.dtype, np.integer):
+        table = htable.Int64HashTable(len(values))
+        uniques = table.unique(_ensure_int64(values))
+    else:
+
+        # its cheaper to use a String Hash Table than Object
+        if lib.infer_dtype(values) in ['string']:
+            table = htable.StringHashTable(len(values))
+        else:
+            table = htable.PyObjectHashTable(len(values))
+
+        uniques = table.unique(_ensure_object(values))
+
+    return uniques
+
+
 def isin(comps, values):
     """
     Compute the isin boolean array
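
A quick sanity check of the relocated helper, as a hedged sketch: unique1d is an internal helper rather than public API, so the import below assumes a pandas checkout that includes this commit, and the commented outputs are indicative rather than verified against this exact revision.

import numpy as np
from pandas.core.algorithms import unique1d

# Integer input goes through Int64HashTable; first-appearance order is kept,
# no sorting (unlike np.unique).
print(unique1d(np.array([3, 1, 3, 2, 1])))   # -> [3 1 2]

# datetime64 input is viewed as int64, hashed, then viewed back as M8[ns].
stamps = np.array(['2016-01-02', '2016-01-01', '2016-01-02'], dtype='M8[ns]')
print(unique1d(stamps))                      # -> the two distinct timestamps, first-appearance order

# Object arrays of strings take the cheaper StringHashTable path via lib.infer_dtype.
print(unique1d(np.array(['b', 'a', 'b'], dtype=object)))   # -> ['b' 'a']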

pandas/core/base.py

+1 -1

@@ -969,7 +969,7 @@ def unique(self):
         if hasattr(values, 'unique'):
             result = values.unique()
         else:
-            from pandas.core.nanops import unique1d
+            from pandas.core.algorithms import unique1d
             result = unique1d(values)
         return result
 
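
For context on the call site re-pointed here: a plain ndarray-backed Series falls through this else branch, so Series.unique ends up using the hash-table helper. A small illustrative sketch (output order reflects first appearance, per the helper's behaviour; not a verbatim capture from this revision):

import pandas as pd

s = pd.Series([3, 1, 3, 2])
print(s.unique())   # -> [3 1 2]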

pandas/core/categorical.py

+1 -2

@@ -25,7 +25,7 @@
                                 is_scalar)
 from pandas.core.common import is_null_slice
 
-from pandas.core.algorithms import factorize, take_1d
+from pandas.core.algorithms import factorize, take_1d, unique1d
 from pandas.core.base import (PandasObject, PandasDelegate,
                               NoNewAttributesMixin, _shared_docs)
 import pandas.core.common as com

@@ -1834,7 +1834,6 @@ def unique(self):
         unique values : ``Categorical``
         """
 
-        from pandas.core.nanops import unique1d
         # unlike np.unique, unique1d does not sort
         unique_codes = unique1d(self.codes)
         cat = self.copy()
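
The comment retained above is the behavioural point worth keeping in mind: unique1d, and therefore Categorical.unique, preserves first-appearance order instead of sorting. A hedged illustration (indicative output for an unordered categorical):

import pandas as pd

cat = pd.Categorical(['b', 'a', 'c', 'a'])
print(cat.unique())   # -> [b, a, c]; categories also follow appearance order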

pandas/core/nanops.py

+1 -28

@@ -9,10 +9,8 @@
 except ImportError:  # pragma: no cover
     _USE_BOTTLENECK = False
 
-import pandas.hashtable as _hash
 from pandas import compat, lib, algos, tslib
-from pandas.types.common import (_ensure_int64, _ensure_object,
-                                 _ensure_float64, _get_dtype,
+from pandas.types.common import (_get_dtype,
                                  is_float, is_scalar,
                                  is_integer, is_complex, is_float_dtype,
                                  is_complex_dtype, is_integer_dtype,

@@ -784,28 +782,3 @@ def f(x, y):
 nanle = make_nancomp(operator.le)
 naneq = make_nancomp(operator.eq)
 nanne = make_nancomp(operator.ne)
-
-
-def unique1d(values):
-    """
-    Hash table-based unique
-    """
-    if np.issubdtype(values.dtype, np.floating):
-        table = _hash.Float64HashTable(len(values))
-        uniques = np.array(table.unique(_ensure_float64(values)),
-                           dtype=np.float64)
-    elif np.issubdtype(values.dtype, np.datetime64):
-        table = _hash.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-        uniques = uniques.view('M8[ns]')
-    elif np.issubdtype(values.dtype, np.timedelta64):
-        table = _hash.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-        uniques = uniques.view('m8[ns]')
-    elif np.issubdtype(values.dtype, np.integer):
-        table = _hash.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-    else:
-        table = _hash.PyObjectHashTable(len(values))
-        uniques = table.unique(_ensure_object(values))
-    return uniques

pandas/tests/test_algos.py

+33 -22

@@ -277,28 +277,6 @@ def test_factorize_nan(self):
         self.assertTrue(
             np.array_equal(pd.isnull(key), expected == na_sentinel))
 
-    def test_vector_resize(self):
-        # Test for memory errors after internal vector
-        # reallocations (pull request #7157)
-
-        def _test_vector_resize(htable, uniques, dtype, nvals):
-            vals = np.array(np.random.randn(1000), dtype=dtype)
-            # get_labels appends to the vector
-            htable.get_labels(vals[:nvals], uniques, 0, -1)
-            # to_array resizes the vector
-            uniques.to_array()
-            htable.get_labels(vals, uniques, 0, -1)
-
-        test_cases = [
-            (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
-            (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
-            (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
-
-        for (tbl, vect, dtype) in test_cases:
-            # resizing to empty is a special case
-            _test_vector_resize(tbl(), vect(), dtype, 0)
-            _test_vector_resize(tbl(), vect(), dtype, 10)
-
     def test_complex_sorting(self):
         # gh 12666 - check no segfault
         # Test not valid numpy versions older than 1.11

@@ -912,6 +890,39 @@ class TestGroupVarFloat32(tm.TestCase, GroupVarTestMixin):
     rtol = 1e-2
 
 
+class TestHashTable(tm.TestCase):
+
+    def test_lookup_nan(self):
+        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
+        m = hashtable.Float64HashTable()
+        m.map_locations(xs)
+        self.assert_numpy_array_equal(m.lookup(xs),
+                                      np.arange(len(xs), dtype=np.int64))
+
+    def test_vector_resize(self):
+        # Test for memory errors after internal vector
+        # reallocations (pull request #7157)
+
+        def _test_vector_resize(htable, uniques, dtype, nvals):
+            vals = np.array(np.random.randn(1000), dtype=dtype)
+            # get_labels appends to the vector
+            htable.get_labels(vals[:nvals], uniques, 0, -1)
+            # to_array resizes the vector
+            uniques.to_array()
+            htable.get_labels(vals, uniques, 0, -1)
+
+        test_cases = [
+            (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
+            (hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
+            (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
+            (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
+
+        for (tbl, vect, dtype) in test_cases:
+            # resizing to empty is a special case
+            _test_vector_resize(tbl(), vect(), dtype, 0)
+            _test_vector_resize(tbl(), vect(), dtype, 10)
+
+
 def test_quantile():
     s = Series(np.random.randn(100))
 
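
The only functional addition to the relocated test is the StringHashTable row. A standalone sketch of what that row exercises (the pandas.hashtable module path matches the pre-0.20 layout used at this commit; printed values are indicative):

import numpy as np
import pandas.hashtable as hashtable

table = hashtable.StringHashTable()
uniques = hashtable.ObjectVector()
vals = np.array(['a', 'b', 'a', 'c'], dtype=object)

# get_labels appends the distinct values to the uniques vector and returns codes
labels = table.get_labels(vals, uniques, 0, -1)
print(labels)              # -> [0 1 0 2]
print(uniques.to_array())  # -> ['a' 'b' 'c']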

pandas/tests/test_base.py

-11
@@ -1051,17 +1051,6 @@ def test_searchsorted(self):
             self.assertTrue(0 <= index <= len(o))
 
 
-class TestFloat64HashTable(tm.TestCase):
-
-    def test_lookup_nan(self):
-        from pandas.hashtable import Float64HashTable
-        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
-        m = Float64HashTable()
-        m.map_locations(xs)
-        self.assert_numpy_array_equal(m.lookup(xs),
-                                      np.arange(len(xs), dtype=np.int64))
-
-
 class TestTranspose(Ops):
     errmsg = "the 'axes' parameter is not supported"
 

pandas/tseries/util.py

+2 -2

@@ -4,7 +4,7 @@
 import numpy as np
 from pandas.types.common import _ensure_platform_int
 from pandas.core.frame import DataFrame
-import pandas.core.nanops as nanops
+import pandas.core.algorithms as algorithms
 
 
 def pivot_annual(series, freq=None):

@@ -45,7 +45,7 @@ def pivot_annual(series, freq=None):
 
     index = series.index
     year = index.year
-    years = nanops.unique1d(year)
+    years = algorithms.unique1d(year)
 
     if freq is not None:
         freq = freq.upper()
