Skip to content

Commit bbdb77d

Browse files
committed
DOC, TST, BUG: Improve uint64 core/algos behavior
1) duplicated() Updates documentation to describe the "values" parameter in the signature, adds tests for uint64, and refactors to use duplicated_uint64. 2) mode() Updates documentation to describe the "values" parameter in the signature, adds tests for uint64, and refactors to use mode_uint64. 3) unique() Uses UInt64HashTable to patch a uint64 overflow bug analogous to that seen in Series.unique (patched in pandas-dev gh-14915). 4) Types API Introduces "is_signed_integer_dtype" and "is_unsigned_integer_dtype" to the public API. Used in refactoring/patching of 1-3.
1 parent 8b497e4 commit bbdb77d

File tree

9 files changed

+346
-121
lines changed

9 files changed

+346
-121
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -298,5 +298,6 @@ Bug Fixes
298298

299299

300300
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
301+
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
301302
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
302303
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)

pandas/api/tests/test_api.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -153,10 +153,10 @@ class TestTypes(Base, tm.TestCase):
153153
'is_floating_dtype', 'is_int64_dtype', 'is_integer',
154154
'is_integer_dtype', 'is_number', 'is_numeric_dtype',
155155
'is_object_dtype', 'is_scalar', 'is_sparse',
156-
'is_string_dtype',
156+
'is_string_dtype', 'is_signed_integer_dtype',
157157
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
158-
'is_period', 'is_period_dtype',
159-
'is_re', 'is_re_compilable',
158+
'is_unsigned_integer_dtype', 'is_period',
159+
'is_period_dtype', 'is_re', 'is_re_compilable',
160160
'is_dict_like', 'is_iterator',
161161
'is_list_like', 'is_hashable',
162162
'is_named_tuple', 'is_sequence',

pandas/core/algorithms.py

+33-11
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from pandas import compat, lib, tslib, _np_version_under1p8
1010
from pandas.types.cast import _maybe_promote
1111
from pandas.types.generic import ABCSeries, ABCIndex
12-
from pandas.types.common import (is_integer_dtype,
12+
from pandas.types.common import (is_unsigned_integer_dtype,
13+
is_signed_integer_dtype,
14+
is_integer_dtype,
1315
is_int64_dtype,
1416
is_categorical_dtype,
1517
is_extension_type,
@@ -490,12 +492,14 @@ def _value_counts_arraylike(values, dropna=True):
490492

491493
def duplicated(values, keep='first'):
492494
"""
493-
Return boolean ndarray denoting duplicate values
495+
Return boolean ndarray denoting duplicate values.
494496
495497
.. versionadded:: 0.19.0
496498
497499
Parameters
498500
----------
501+
values : ndarray-like
502+
Array over which to check for duplicate values.
499503
keep : {'first', 'last', False}, default 'first'
500504
- ``first`` : Mark duplicates as ``True`` except for the first
501505
occurrence.
@@ -521,9 +525,12 @@ def duplicated(values, keep='first'):
521525
elif isinstance(values, (ABCSeries, ABCIndex)):
522526
values = values.values
523527

524-
if is_integer_dtype(dtype):
528+
if is_signed_integer_dtype(dtype):
525529
values = _ensure_int64(values)
526530
duplicated = htable.duplicated_int64(values, keep=keep)
531+
elif is_unsigned_integer_dtype(dtype):
532+
values = _ensure_uint64(values)
533+
duplicated = htable.duplicated_uint64(values, keep=keep)
527534
elif is_float_dtype(dtype):
528535
values = _ensure_float64(values)
529536
duplicated = htable.duplicated_float64(values, keep=keep)
@@ -535,7 +542,19 @@ def duplicated(values, keep='first'):
535542

536543

537544
def mode(values):
538-
"""Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
545+
"""
546+
Returns the mode(s) of an array.
547+
548+
Parameters
549+
----------
550+
values : array-like
551+
Array over which to compute the mode(s).
552+
553+
Returns
554+
-------
555+
mode : Series
556+
"""
557+
539558
# must sort because hash order isn't necessarily defined.
540559
from pandas.core.series import Series
541560

@@ -547,23 +566,24 @@ def mode(values):
547566
constructor = Series
548567

549568
dtype = values.dtype
550-
if is_integer_dtype(values):
569+
if is_signed_integer_dtype(values):
551570
values = _ensure_int64(values)
552-
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
553-
571+
result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
572+
elif is_unsigned_integer_dtype(values):
573+
values = _ensure_uint64(values)
574+
result = constructor(np.sort(htable.mode_uint64(values)), dtype=dtype)
554575
elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
555576
dtype = values.dtype
556577
values = values.view(np.int64)
557-
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
558-
578+
result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
559579
elif is_categorical_dtype(values):
560580
result = constructor(values.mode())
561581
else:
562582
mask = isnull(values)
563583
values = _ensure_object(values)
564584
res = htable.mode_object(values, mask)
565585
try:
566-
res = sorted(res)
586+
res = np.sort(res)
567587
except TypeError as e:
568588
warn("Unable to sort modes: %s" % e)
569589
result = constructor(res, dtype=dtype)
@@ -893,8 +913,10 @@ def _hashtable_algo(f, values, return_dtype=None):
893913
dtype = values.dtype
894914
if is_float_dtype(dtype):
895915
return f(htable.Float64HashTable, _ensure_float64)
896-
elif is_integer_dtype(dtype):
916+
elif is_signed_integer_dtype(dtype):
897917
return f(htable.Int64HashTable, _ensure_int64)
918+
elif is_unsigned_integer_dtype(dtype):
919+
return f(htable.UInt64HashTable, _ensure_uint64)
898920
elif is_datetime64_dtype(dtype):
899921
return_dtype = return_dtype or 'M8[ns]'
900922
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)

pandas/hashtable.pyx

-67
Original file line numberDiff line numberDiff line change
@@ -167,73 +167,6 @@ cpdef value_count_object(ndarray[object] values,
167167
return result_keys, result_counts
168168

169169

170-
@cython.wraparound(False)
171-
@cython.boundscheck(False)
172-
def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
173-
cdef:
174-
int count, max_count = 2
175-
int j = -1 # so you can do +=
176-
int k
177-
ndarray[object] modes
178-
kh_pymap_t *table
179-
180-
table = kh_init_pymap()
181-
build_count_table_object(values, mask, table)
182-
183-
modes = np.empty(table.n_buckets, dtype=np.object_)
184-
for k in range(table.n_buckets):
185-
if kh_exist_pymap(table, k):
186-
count = table.vals[k]
187-
188-
if count == max_count:
189-
j += 1
190-
elif count > max_count:
191-
max_count = count
192-
j = 0
193-
else:
194-
continue
195-
modes[j] = <object> table.keys[k]
196-
197-
kh_destroy_pymap(table)
198-
199-
return modes[:j + 1]
200-
201-
202-
@cython.wraparound(False)
203-
@cython.boundscheck(False)
204-
def mode_int64(int64_t[:] values):
205-
cdef:
206-
int count, max_count = 2
207-
int j = -1 # so you can do +=
208-
int k
209-
kh_int64_t *table
210-
ndarray[int64_t] modes
211-
212-
table = kh_init_int64()
213-
214-
build_count_table_int64(values, table, 0)
215-
216-
modes = np.empty(table.n_buckets, dtype=np.int64)
217-
218-
with nogil:
219-
for k in range(table.n_buckets):
220-
if kh_exist_int64(table, k):
221-
count = table.vals[k]
222-
223-
if count == max_count:
224-
j += 1
225-
elif count > max_count:
226-
max_count = count
227-
j = 0
228-
else:
229-
continue
230-
modes[j] = table.keys[k]
231-
232-
kh_destroy_int64(table)
233-
234-
return modes[:j + 1]
235-
236-
237170
@cython.wraparound(False)
238171
@cython.boundscheck(False)
239172
def duplicated_object(ndarray[object] values, object keep='first'):

pandas/src/hashtable_func_helper.pxi.in

+82
Original file line numberDiff line numberDiff line change
@@ -112,3 +112,85 @@ def duplicated_{{dtype}}({{dtype}}_t[:] values,
112112
return out
113113

114114
{{endfor}}
115+
116+
#----------------------------------------------------------------------
117+
# Mode Computations
118+
#----------------------------------------------------------------------
119+
120+
{{py:
121+
122+
# dtype, ctype, table_type, npy_dtype
123+
dtypes = [('int64', 'int64_t', 'int64', 'int64'),
124+
('uint64', 'uint64_t', 'uint64', 'uint64'),
125+
('object', 'object', 'pymap', 'object_')]
126+
}}
127+
128+
{{for dtype, ctype, table_type, npy_dtype in dtypes}}
129+
130+
131+
@cython.wraparound(False)
132+
@cython.boundscheck(False)
133+
134+
{{if dtype == 'object'}}
135+
136+
137+
def mode_{{dtype}}(ndarray[{{ctype}}] values,
138+
ndarray[uint8_t, cast=True] mask):
139+
{{else}}
140+
141+
142+
def mode_{{dtype}}({{ctype}}[:] values):
143+
{{endif}}
144+
cdef:
145+
int count, max_count = 2
146+
int j = -1 # so you can do +=
147+
int k
148+
kh_{{table_type}}_t *table
149+
ndarray[{{ctype}}] modes
150+
151+
table = kh_init_{{table_type}}()
152+
153+
{{if dtype == 'object'}}
154+
build_count_table_{{dtype}}(values, mask, table)
155+
{{else}}
156+
build_count_table_{{dtype}}(values, table, 0)
157+
{{endif}}
158+
159+
modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
160+
161+
{{if dtype != 'object'}}
162+
with nogil:
163+
for k in range(table.n_buckets):
164+
if kh_exist_{{table_type}}(table, k):
165+
count = table.vals[k]
166+
167+
if count == max_count:
168+
j += 1
169+
elif count > max_count:
170+
max_count = count
171+
j = 0
172+
else:
173+
continue
174+
175+
modes[j] = table.keys[k]
176+
{{else}}
177+
for k in range(table.n_buckets):
178+
if kh_exist_{{table_type}}(table, k):
179+
count = table.vals[k]
180+
181+
if count == max_count:
182+
j += 1
183+
elif count > max_count:
184+
max_count = count
185+
j = 0
186+
else:
187+
continue
188+
189+
modes[j] = <object> table.keys[k]
190+
{{endif}}
191+
192+
kh_destroy_{{table_type}}(table)
193+
194+
return modes[:j + 1]
195+
196+
{{endfor}}

0 commit comments

Comments
 (0)