Skip to content

Commit d82adf0

Browse files
authored
ENH: Support mask in Int64Factorizer (#49549)
* ENH: Avoid object path for merge and masked arrays
* ENH: Support mask in Int64Factorizer
* Fix mypy
* Add gh ref
* Fix tests
* Fix tests
* Fix tests
1 parent eea9e75 commit d82adf0

File tree

4 files changed

+27
-4
lines changed

4 files changed

+27
-4
lines changed

pandas/_libs/hashtable.pyi

+2
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ class Int64Factorizer(Factorizer):
3636
sort: bool = ...,
3737
na_sentinel=...,
3838
na_value=...,
39+
mask=...,
3940
) -> npt.NDArray[np.intp]: ...
4041

4142
class Int64Vector:
@@ -137,6 +138,7 @@ class HashTable:
137138
count_prior: int = ...,
138139
na_sentinel: int = ...,
139140
na_value: object = ...,
141+
mask=...,
140142
) -> npt.NDArray[np.intp]: ...
141143
def unique(
142144
self,

pandas/_libs/hashtable.pyx

+2 -2
Original file line number | Diff line number | Diff line change
@@ -129,7 +129,7 @@ cdef class Int64Factorizer(Factorizer):
129129
self.uniques = Int64Vector()
130130

131131
def factorize(self, const int64_t[:] values,
132-
na_sentinel=-1, na_value=None) -> np.ndarray:
132+
na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
133133
"""
134134
Returns
135135
-------
@@ -152,6 +152,6 @@ cdef class Int64Factorizer(Factorizer):
152152
self.uniques = uniques
153153
labels = self.table.get_labels(values, self.uniques,
154154
self.count, na_sentinel,
155-
na_value=na_value)
155+
na_value=na_value, mask=mask)
156156
self.count = len(self.uniques)
157157
return labels

pandas/_libs/hashtable_class_helper.pxi.in

+2 -2
Original file line number | Diff line number | Diff line change
@@ -821,11 +821,11 @@ cdef class {{name}}HashTable(HashTable):
821821

822822
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
823823
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
824-
object na_value=None):
824+
object na_value=None, object mask=None):
825825
# -> np.ndarray[np.intp]
826826
_, labels = self._unique(values, uniques, count_prior=count_prior,
827827
na_sentinel=na_sentinel, na_value=na_value,
828-
ignore_na=True, return_inverse=True)
828+
ignore_na=True, return_inverse=True, mask=mask)
829829
return labels
830830

831831
{{if dtype == 'int64'}}

pandas/tests/test_algos.py

+21
Original file line number | Diff line number | Diff line change
@@ -222,6 +222,27 @@ def test_factorize_nan(self):
222222
tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)
223223
tm.assert_numpy_array_equal(ids, expected)
224224

225+
def test_factorizer_with_mask(self):
226+
# GH#49549
227+
data = np.array([1, 2, 3, 1, 1, 0], dtype="int64")
228+
mask = np.array([False, False, False, False, False, True])
229+
rizer = ht.Int64Factorizer(len(data))
230+
result = rizer.factorize(data, mask=mask)
231+
expected = np.array([0, 1, 2, 0, 0, -1], dtype=np.intp)
232+
tm.assert_numpy_array_equal(result, expected)
233+
expected_uniques = np.array([1, 2, 3], dtype="int64")
234+
tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)
235+
236+
def test_factorizer_object_with_nan(self):
237+
# GH#49549
238+
data = np.array([1, 2, 3, 1, np.nan])
239+
rizer = ht.ObjectFactorizer(len(data))
240+
result = rizer.factorize(data.astype(object))
241+
expected = np.array([0, 1, 2, 0, -1], dtype=np.intp)
242+
tm.assert_numpy_array_equal(result, expected)
243+
expected_uniques = np.array([1, 2, 3], dtype=object)
244+
tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)
245+
225246
@pytest.mark.parametrize(
226247
"data, expected_codes, expected_uniques",
227248
[

0 commit comments

Comments (0)