Skip to content

Commit 2247783

Browse files
committed
Merge pull request #6231 from jreback/test_factorize
TST: add tests for algos.factorize (GH6212)
2 parents d890549 + 366653f commit 2247783

File tree

2 files changed

+76
-0
lines changed

2 files changed

+76
-0
lines changed

pandas/core/algorithms.py

+4
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
129129
130130
Returns
131131
-------
132+
labels : the indexer to the original array
133+
uniques : the unique values
134+
135+
note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex
132136
"""
133137
from pandas.tseries.period import PeriodIndex
134138
vals = np.asarray(values)

pandas/tests/test_algos.py

+72
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,78 @@ def test_strings(self):
4444
expected = Series(np.array([1, 0, np.nan, 0, 1, 2, np.nan]))
4545
tm.assert_series_equal(result,expected)
4646

47+
class TestFactorize(tm.TestCase):
    """Tests for ``algos.factorize`` (GH6212).

    Each case checks both return values: ``labels`` (the indexer into the
    original values) and ``uniques`` (the distinct values).
    """
    _multiprocess_can_split_ = True

    # NOTE: np.array_equal compares shape and element values only, never
    # dtype.  The dtypes spelled out in the expected arrays below therefore
    # document the intended result type but are not enforced by the check.

    def test_basic(self):

        # strings, in order of first appearance
        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'])
        self.assertTrue(np.array_equal(
            labels, np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array(['a', 'b', 'c'], dtype=object)))

        # already in sorted order, so sort=True gives the same answer
        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'], sort=True)
        self.assertTrue(np.array_equal(
            labels, np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array(['a', 'b', 'c'], dtype=object)))

        # integers, descending input
        labels, uniques = algos.factorize(list(reversed(range(5))))
        self.assertTrue(np.array_equal(
            labels, np.array([0, 1, 2, 3, 4], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array([4, 3, 2, 1, 0], dtype=np.int64)))

        labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
        self.assertTrue(np.array_equal(
            labels, np.array([4, 3, 2, 1, 0], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64)))

        # floats, descending input: labels are integer codes and uniques
        # keep the float values (matching the sort=True case below)
        labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
        self.assertTrue(np.array_equal(
            labels, np.array([0, 1, 2, 3, 4], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array([4., 3., 2., 1., 0.], dtype=np.float64)))

        labels, uniques = algos.factorize(list(reversed(np.arange(5.))),
                                          sort=True)
        self.assertTrue(np.array_equal(
            labels, np.array([4, 3, 2, 1, 0], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array([0., 1., 2., 3., 4.], dtype=np.float64)))

    def test_mixed(self):

        # doc example reshaping.rst
        x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
        labels, uniques = algos.factorize(x)

        # the NaN entry is expected to be labelled -1 and left out of uniques
        self.assertTrue(np.array_equal(
            labels, np.array([0, 0, -1, 1, 2, 3], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array(['A', 'B', 3.14, np.inf], dtype=object)))

        labels, uniques = algos.factorize(x, sort=True)
        self.assertTrue(np.array_equal(
            labels, np.array([2, 2, -1, 3, 0, 1], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array([3.14, np.inf, 'A', 'B'], dtype=object)))

    def test_datelike(self):

        # M8
        v1 = pd.Timestamp('20130101 09:00:00.00004')
        v2 = pd.Timestamp('20130101')
        x = Series([v1, v1, v1, v2, v2, v1])
        labels, uniques = algos.factorize(x)
        self.assertTrue(np.array_equal(
            labels, np.array([0, 0, 0, 1, 1, 0], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array([v1.value, v2.value], dtype='M8[ns]')))

        # v2 is earlier than v1, so sorting swaps the two labels
        labels, uniques = algos.factorize(x, sort=True)
        self.assertTrue(np.array_equal(
            labels, np.array([1, 1, 1, 0, 0, 1], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array([v2.value, v1.value], dtype='M8[ns]')))

        # period
        v1 = pd.Period('201302', freq='M')
        v2 = pd.Period('201303', freq='M')
        x = Series([v1, v1, v1, v2, v2, v1])

        # periods are not 'sorted' as they are converted back into an index
        labels, uniques = algos.factorize(x)
        self.assertTrue(np.array_equal(
            labels, np.array([0, 0, 0, 1, 1, 0], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array([v1, v2], dtype=object)))

        # same expectation with sort=True
        labels, uniques = algos.factorize(x, sort=True)
        self.assertTrue(np.array_equal(
            labels, np.array([0, 0, 0, 1, 1, 0], dtype=np.int64)))
        self.assertTrue(np.array_equal(
            uniques, np.array([v1, v2], dtype=object)))
118+
47119
class TestUnique(tm.TestCase):
48120
_multiprocess_can_split_ = True
49121

0 commit comments

Comments
 (0)