Skip to content

Commit 71edcca

Browse files
committed
introducing stable value_counts function
1 parent b5707d6 commit 71edcca

File tree

1 file changed

+55
-30
lines changed

1 file changed

+55
-30
lines changed

pandas/_libs/hashtable_func_helper.pxi.in

+55-30
Original file line numberDiff line numberDiff line change
@@ -77,54 +77,79 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values,
7777
@cython.wraparound(False)
7878
@cython.boundscheck(False)
7979
{{if dtype == 'object'}}
80-
cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
80+
cpdef stable_value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
8181
{{else}}
82-
cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
82+
cpdef stable_value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
8383
{{endif}}
8484
cdef:
8585
Py_ssize_t i = 0
86+
Py_ssize_t n = len(values)
87+
size_t unique_key_index = 0
88+
size_t unique_key_count = 0
8689
kh_{{ttype}}_t *table
8790

88-
{{if dtype != 'object'}}
89-
{{dtype}}_t[:] result_keys
90-
int64_t[:] result_counts
91-
{{endif}}
92-
9391
# Don't use Py_ssize_t, since table.n_buckets is unsigned
9492
khiter_t k
9593

96-
table = kh_init_{{ttype}}()
97-
{{if dtype == 'object'}}
98-
build_count_table_{{dtype}}(values, table, 1)
94+
{{c_type}} val
95+
96+
int ret = 0
97+
98+
{{if dtype[0]!='u'}}
99+
result_keys = {{dtype.title()}}Vector()
99100
{{else}}
100-
build_count_table_{{dtype}}(values, table, dropna)
101+
result_keys = {{'U'+dtype[1::].title()}}Vector()
101102
{{endif}}
102-
103-
result_keys = np.empty(table.n_occupied, '{{dtype}}')
104-
result_counts = np.zeros(table.n_occupied, dtype=np.int64)
103+
result_counts = Int64Vector()
104+
table = kh_init_{{ttype}}()
105105

106106
{{if dtype == 'object'}}
107-
for k in range(table.n_buckets):
108-
if kh_exist_{{ttype}}(table, k):
109-
result_keys[i] = <{{dtype}}>table.keys[k]
110-
result_counts[i] = table.vals[k]
111-
i += 1
107+
kh_resize_{{ttype}}(table, n // 10)
108+
109+
for i in range(n):
110+
val = values[i]
111+
if not checknull(val) or not dropna:
112+
k = kh_get_{{ttype}}(table, <PyObject*>val)
113+
if k != table.n_buckets:
114+
unique_key_index = table.vals[k]
115+
result_counts.data.data[unique_key_index] += 1
116+
else:
117+
k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
118+
table.vals[k] = unique_key_count
119+
result_keys.append(val)
120+
result_counts.append(1)
121+
unique_key_count+=1
112122
{{else}}
113-
with nogil:
114-
for k in range(table.n_buckets):
115-
if kh_exist_{{ttype}}(table, k):
116-
result_keys[i] = {{to_dtype}}(table.keys[k])
117-
result_counts[i] = table.vals[k]
118-
i += 1
123+
kh_resize_{{ttype}}(table, n)
124+
125+
for i in range(n):
126+
val = {{to_c_type}}(values[i])
127+
128+
if not is_nan_{{c_type}}(val) or not dropna:
129+
k = kh_get_{{ttype}}(table, val)
130+
if k != table.n_buckets:
131+
unique_key_index = table.vals[k]
132+
result_counts.data.data[unique_key_index] += 1
133+
else:
134+
k = kh_put_{{ttype}}(table, val, &ret)
135+
table.vals[k] = unique_key_count
136+
result_keys.append(val)
137+
result_counts.append(1)
138+
unique_key_count+=1
119139
{{endif}}
120140

121141
kh_destroy_{{ttype}}(table)
122142

123-
{{if dtype == 'object'}}
124-
return result_keys, result_counts
125-
{{else}}
126-
return np.asarray(result_keys), np.asarray(result_counts)
127-
{{endif}}
143+
return result_keys.to_array(), result_counts.to_array()
144+
145+
146+
{{if dtype == 'object'}}
147+
cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
148+
return stable_value_count_{{dtype}}(values, 1)
149+
{{else}}
150+
cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
151+
return stable_value_count_{{dtype}}(values, dropna)
152+
{{endif}}
128153

129154

130155
@cython.wraparound(False)

0 commit comments

Comments
 (0)