Skip to content

Commit 7e14996

Browse files
jbrockmendel authored and JulianWgs committed
TYP: _libs.hashtable (pandas-dev#41246)
1 parent 050ce8e commit 7e14996

File tree

5 files changed

+253
-7
lines changed

5 files changed

+253
-7
lines changed

pandas/_libs/hashtable.pyi

+242
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
from typing import (
2+
Any,
3+
Hashable,
4+
Literal,
5+
)
6+
7+
import numpy as np
8+
9+
def unique_label_indices(
10+
labels: np.ndarray, # const int64_t[:]
11+
) -> np.ndarray: ...
12+
13+
14+
class Factorizer:
15+
table: PyObjectHashTable
16+
uniques: ObjectVector
17+
count: int
18+
19+
def __init__(self, size_hint: int): ...
20+
def get_count(self) -> int: ...
21+
22+
def factorize(
23+
self,
24+
values: np.ndarray, # np.ndarray[object]
25+
sort: bool = ...,
26+
na_sentinel=...,
27+
na_value=...,
28+
) -> np.ndarray: ... # np.ndarray[intp]
29+
30+
def unique(
31+
self,
32+
values: np.ndarray, # np.ndarray[object]
33+
) -> np.ndarray: ... # np.ndarray[object]
34+
35+
36+
class Int64Factorizer:
37+
table: Int64HashTable
38+
uniques: Int64Vector
39+
count: int
40+
41+
def __init__(self, size_hint: int): ...
42+
def get_count(self) -> int: ...
43+
44+
def factorize(
45+
self,
46+
values: np.ndarray, # const int64_t[:]
47+
sort: bool = ...,
48+
na_sentinel=...,
49+
na_value=...,
50+
) -> np.ndarray: ... # np.ndarray[intp]
51+
52+
53+
class Int64Vector:
54+
def __init__(self): ...
55+
def __len__(self) -> int: ...
56+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.int64]
57+
58+
class Int32Vector:
59+
def __init__(self): ...
60+
def __len__(self) -> int: ...
61+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.int32]
62+
63+
class Int16Vector:
64+
def __init__(self): ...
65+
def __len__(self) -> int: ...
66+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.int16]
67+
68+
class Int8Vector:
69+
def __init__(self): ...
70+
def __len__(self) -> int: ...
71+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.int8]
72+
73+
class UInt64Vector:
74+
def __init__(self): ...
75+
def __len__(self) -> int: ...
76+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint64]
77+
78+
class UInt32Vector:
79+
def __init__(self): ...
80+
def __len__(self) -> int: ...
81+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint32]
82+
83+
class UInt16Vector:
84+
def __init__(self): ...
85+
def __len__(self) -> int: ...
86+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint16]
87+
88+
class UInt8Vector:
89+
def __init__(self): ...
90+
def __len__(self) -> int: ...
91+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint8]
92+
93+
class Float64Vector:
94+
def __init__(self): ...
95+
def __len__(self) -> int: ...
96+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.float64]
97+
98+
class Float32Vector:
99+
def __init__(self): ...
100+
def __len__(self) -> int: ...
101+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.float32]
102+
103+
class Complex128Vector:
104+
def __init__(self): ...
105+
def __len__(self) -> int: ...
106+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex128]
107+
108+
class Complex64Vector:
109+
def __init__(self): ...
110+
def __len__(self) -> int: ...
111+
def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex64]
112+
113+
class StringVector:
114+
def __init__(self): ...
115+
def __len__(self) -> int: ...
116+
def to_array(self) -> np.ndarray: ... # np.ndarray[object]
117+
118+
class ObjectVector:
119+
def __init__(self): ...
120+
def __len__(self) -> int: ...
121+
def to_array(self) -> np.ndarray: ... # np.ndarray[object]
122+
123+
124+
class HashTable:
125+
# NB: The base HashTable class does _not_ actually have these methods;
126+
# we are putting them here for the sake of mypy to avoid
127+
# reproducing them in each subclass below.
128+
def __init__(self, size_hint: int = ...): ...
129+
def __len__(self) -> int: ...
130+
def __contains__(self, key: Hashable) -> bool: ...
131+
def sizeof(self, deep: bool = ...) -> int: ...
132+
def get_state(self) -> dict[str, int]: ...
133+
134+
# TODO: `item` type is subclass-specific
135+
def get_item(self, item): ... # TODO: return type?
136+
def set_item(self, item) -> None: ...
137+
138+
# FIXME: we don't actually have this for StringHashTable or PyObjectHashTable?
139+
def map(
140+
self,
141+
keys: np.ndarray, # np.ndarray[subclass-specific]
142+
values: np.ndarray, # const int64_t[:] values
143+
) -> None: ...
144+
145+
def map_locations(
146+
self,
147+
values: np.ndarray, # np.ndarray[subclass-specific]
148+
) -> None: ...
149+
150+
def lookup(
151+
self,
152+
values: np.ndarray, # np.ndarray[subclass-specific]
153+
) -> np.ndarray: ... # np.ndarray[np.intp]
154+
155+
def get_labels(
156+
self,
157+
values: np.ndarray, # np.ndarray[subclass-specific]
158+
uniques, # SubclassTypeVector
159+
count_prior: int = ...,
160+
na_sentinel: int = ...,
161+
na_value: object = ...,
162+
) -> np.ndarray: ... # np.ndarray[np.intp]
163+
164+
def unique(
165+
self,
166+
values: np.ndarray, # np.ndarray[subclass-specific]
167+
return_inverse: bool = ...,
168+
) -> tuple[
169+
np.ndarray, # np.ndarray[subclass-specific]
170+
np.ndarray, # np.ndarray[np.intp],
171+
] | np.ndarray: ... # np.ndarray[subclass-specific]
172+
173+
def _unique(
174+
self,
175+
values: np.ndarray, # np.ndarray[subclass-specific]
176+
uniques, # FooVector
177+
count_prior: int = ...,
178+
na_sentinel: int = ...,
179+
na_value: object = ...,
180+
ignore_na: bool = ...,
181+
return_inverse: bool = ...,
182+
) -> tuple[
183+
np.ndarray, # np.ndarray[subclass-specific]
184+
np.ndarray, # np.ndarray[np.intp],
185+
] | np.ndarray: ... # np.ndarray[subclass-specific]
186+
187+
def factorize(
188+
self,
189+
values: np.ndarray, # np.ndarray[subclass-specific]
190+
na_sentinel: int = ...,
191+
na_value: object = ...,
192+
mask=...,
193+
) -> tuple[
194+
np.ndarray, # np.ndarray[subclass-specific]
195+
np.ndarray, # np.ndarray[np.intp],
196+
]: ...
197+
198+
class Complex128HashTable(HashTable): ...
199+
class Complex64HashTable(HashTable): ...
200+
class Float64HashTable(HashTable): ...
201+
class Float32HashTable(HashTable): ...
202+
203+
class Int64HashTable(HashTable):
204+
# Only Int64HashTable has get_labels_groupby
205+
def get_labels_groupby(
206+
self,
207+
values: np.ndarray, # const int64_t[:]
208+
) -> tuple[
209+
np.ndarray, # np.ndarray[np.intp]
210+
np.ndarray, # np.ndarray[np.int64]
211+
]: ...
212+
213+
class Int32HashTable(HashTable): ...
214+
class Int16HashTable(HashTable): ...
215+
class Int8HashTable(HashTable): ...
216+
class UInt64HashTable(HashTable): ...
217+
class UInt32HashTable(HashTable): ...
218+
class UInt16HashTable(HashTable): ...
219+
class UInt8HashTable(HashTable): ...
220+
221+
class StringHashTable(HashTable): ...
222+
class PyObjectHashTable(HashTable): ...
223+
224+
225+
def duplicated_int64(
226+
values: np.ndarray, # const int64_t[:] values
227+
keep: Literal["last", "first", False] = ...,
228+
) -> np.ndarray: ... # np.ndarray[bool]
229+
# TODO: Is it actually bool or is it uint8?
230+
231+
def mode_int64(
232+
values: np.ndarray, # const int64_t[:] values
233+
dropna: bool,
234+
) -> np.ndarray: ... # np.ndarray[np.int64]
235+
236+
def value_count_int64(
237+
values: np.ndarray, # const int64_t[:]
238+
dropna: bool,
239+
) -> tuple[
240+
np.ndarray, # np.ndarray[np.int64]
241+
np.ndarray, # np.ndarray[np.int64]
242+
]: ...

pandas/_libs/hashtable_class_helper.pxi.in

+5-2
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,7 @@ cdef class {{name}}HashTable(HashTable):
680680
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
681681
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
682682
object na_value=None):
683+
# -> np.ndarray[np.intp]
683684
_, labels = self._unique(values, uniques, count_prior=count_prior,
684685
na_sentinel=na_sentinel, na_value=na_value,
685686
ignore_na=True, return_inverse=True)
@@ -1012,7 +1013,7 @@ cdef class StringHashTable(HashTable):
10121013
-------
10131014
uniques : ndarray[object]
10141015
Unique values of input, not sorted
1015-
labels : ndarray[int64] (if return_inverse)
1016+
labels : ndarray[intp_t] (if return_inverse)
10161017
The labels from values to uniques
10171018
"""
10181019
uniques = ObjectVector()
@@ -1045,7 +1046,7 @@ cdef class StringHashTable(HashTable):
10451046
-------
10461047
uniques : ndarray[object]
10471048
Unique values of input, not sorted
1048-
labels : ndarray[int64]
1049+
labels : ndarray[intp]
10491050
The labels from values to uniques
10501051
"""
10511052
uniques_vector = ObjectVector()
@@ -1056,6 +1057,7 @@ cdef class StringHashTable(HashTable):
10561057
def get_labels(self, ndarray[object] values, ObjectVector uniques,
10571058
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
10581059
object na_value=None):
1060+
# -> np.ndarray[np.intp]
10591061
_, labels = self._unique(values, uniques, count_prior=count_prior,
10601062
na_sentinel=na_sentinel, na_value=na_value,
10611063
ignore_na=True, return_inverse=True)
@@ -1310,6 +1312,7 @@ cdef class PyObjectHashTable(HashTable):
13101312
def get_labels(self, ndarray[object] values, ObjectVector uniques,
13111313
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
13121314
object na_value=None):
1315+
# -> np.ndarray[np.intp]
13131316
_, labels = self._unique(values, uniques, count_prior=count_prior,
13141317
na_sentinel=na_sentinel, na_value=na_value,
13151318
ignore_na=True, return_inverse=True)

pandas/core/algorithms.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,7 @@ def factorize_array(
554554
555555
Returns
556556
-------
557-
codes : ndarray
557+
codes : ndarray[np.intp]
558558
uniques : ndarray
559559
"""
560560
hash_klass, values = get_data_algo(values)
@@ -906,9 +906,9 @@ def value_counts_arraylike(values, dropna: bool):
906906
f = getattr(htable, f"value_count_{ndtype}")
907907
keys, counts = f(values, dropna)
908908

909-
keys = _reconstruct_data(keys, original.dtype, original)
909+
res_keys = _reconstruct_data(keys, original.dtype, original)
910910

911-
return keys, counts
911+
return res_keys, counts
912912

913913

914914
def duplicated(values: ArrayLike, keep: str | bool = "first") -> np.ndarray:

pandas/core/frame.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -5954,7 +5954,7 @@ def dropna(
59545954
def drop_duplicates(
59555955
self,
59565956
subset: Hashable | Sequence[Hashable] | None = None,
5957-
keep: str | bool = "first",
5957+
keep: Literal["first"] | Literal["last"] | Literal[False] = "first",
59585958
inplace: bool = False,
59595959
ignore_index: bool = False,
59605960
) -> DataFrame | None:
@@ -6051,7 +6051,7 @@ def drop_duplicates(
60516051
def duplicated(
60526052
self,
60536053
subset: Hashable | Sequence[Hashable] | None = None,
6054-
keep: str | bool = "first",
6054+
keep: Literal["first"] | Literal["last"] | Literal[False] = "first",
60556055
) -> Series:
60566056
"""
60576057
Return boolean Series denoting duplicate rows.

pandas/core/reshape/merge.py

+1
Original file line numberDiff line numberDiff line change
@@ -2138,6 +2138,7 @@ def _factorize_keys(
21382138
# "_values_for_factorize"
21392139
rk, _ = rk._values_for_factorize() # type: ignore[union-attr,assignment]
21402140

2141+
klass: type[libhashtable.Factorizer] | type[libhashtable.Int64Factorizer]
21412142
if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
21422143
# GH#23917 TODO: needs tests for case where lk is integer-dtype
21432144
# and rk is datetime-dtype

0 commit comments

Comments
 (0)