|
7 | 7 |
|
8 | 8 | import pandas.core.algorithms as algos
|
9 | 9 | import pandas.util.testing as tm
|
| 10 | +import pandas.hashtable as hashtable |
10 | 11 |
|
11 | 12 | class TestMatch(tm.TestCase):
|
12 | 13 | _multiprocess_can_split_ = True
|
@@ -122,6 +123,49 @@ def test_datelike(self):
|
122 | 123 | self.assert_numpy_array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64))
|
123 | 124 | self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2]))
|
124 | 125 |
|
def test_factorize_nan(self):
    """Check that ``Factorizer.factorize`` maps NaN to ``na_sentinel``.

    The labels returned by ``factorize`` are compared against the
    expected labels, so the test fails if NaN is mapped to anything
    other than ``na_sentinel`` (and not only if the call raises).
    """
    # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
    # rizer.factorize should not raise an exception if na_sentinel indexes
    # outside of reverse_indexer
    key = np.array([1, 2, 1, np.nan], dtype='O')
    rizer = hashtable.Factorizer(len(key))
    for na_sentinel in (-1, 20):
        ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
        expected = np.array([0, 1, 0, na_sentinel], dtype='int32')
        self.assertEqual(len(set(key)), len(set(expected)))
        self.assertTrue(
            np.array_equal(pd.isnull(key), expected == na_sentinel))
        # Verify the labels actually produced; np.array_equal compares
        # values, so the integer width of ``ids`` does not matter here.
        self.assertTrue(np.array_equal(ids, expected))

    # nan still maps to na_sentinel when sort=False
    # NOTE(review): the expected label 2 for key 0 presumably relies on
    # ``rizer`` having already seen 1 and 2 above -- confirm against
    # Factorizer if this ever changes.
    key = np.array([0, np.nan, 1], dtype='O')
    na_sentinel = -1
    ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel)
    expected = np.array([2, -1, 0], dtype='int32')
    self.assertEqual(len(set(key)), len(set(expected)))
    self.assertTrue(
        np.array_equal(pd.isnull(key), expected == na_sentinel))
    self.assertTrue(np.array_equal(ids, expected))
| 145 | + |
def test_vector_resize(self):
    """Smoke-test hashtable vectors after an internal reallocation.

    Guards against memory errors after internal vector reallocations
    (pull request #7157).  There are no assertions: the test passes as
    long as the sequence of calls completes.
    """

    def _exercise(table, vector, dtype, n_initial):
        # 1000 random values, cast to the dtype under test
        data = np.array(np.random.randn(1000), dtype=dtype)
        # get_labels appends to the vector
        table.get_labels(data[:n_initial], vector, 0, -1)
        # to_array resizes the vector
        vector.to_array()
        # append again now that the vector has been resized
        table.get_labels(data, vector, 0, -1)

    cases = [
        ('object', hashtable.PyObjectHashTable, hashtable.ObjectVector),
        ('float64', hashtable.Float64HashTable, hashtable.Float64Vector),
        ('int64', hashtable.Int64HashTable, hashtable.Int64Vector),
    ]

    for dtype, make_table, make_vector in cases:
        # n_initial == 0: resizing to empty is a special case
        for n_initial in (0, 10):
            _exercise(make_table(), make_vector(), dtype, n_initial)
| 167 | + |
| 168 | + |
125 | 169 | class TestUnique(tm.TestCase):
|
126 | 170 | _multiprocess_can_split_ = True
|
127 | 171 |
|
|
0 commit comments