Skip to content

Commit 6d31057

Browse files
committed
DOC, TST, BUG: Improve uint64 core/algos behavior
1) duplicated(): Updates documentation to describe the "values" parameter in the signature, adds tests for uint64, and refactors to use duplicated_uint64. 2) mode(): Updates documentation to describe the "values" parameter in the signature, adds tests for uint64, and refactors to use mode_uint64. 3) unique(): Uses UInt64HashTable to patch a uint64 overflow bug analogous to that seen in Series.unique (patched in pandas-dev gh-14915). 4) Types API: Introduces "is_signed_integer_dtype" and "is_unsigned_integer_dtype" to the public API. Used in refactoring/patching of 1-3.
1 parent 74de478 commit 6d31057

File tree

9 files changed

+457
-237
lines changed

9 files changed

+457
-237
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -298,5 +298,6 @@ Bug Fixes
298298

299299

300300
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
301+
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
301302
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
302303
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)

pandas/api/tests/test_api.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -153,10 +153,10 @@ class TestTypes(Base, tm.TestCase):
153153
'is_floating_dtype', 'is_int64_dtype', 'is_integer',
154154
'is_integer_dtype', 'is_number', 'is_numeric_dtype',
155155
'is_object_dtype', 'is_scalar', 'is_sparse',
156-
'is_string_dtype',
156+
'is_string_dtype', 'is_signed_integer_dtype',
157157
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
158-
'is_period', 'is_period_dtype',
159-
'is_re', 'is_re_compilable',
158+
'is_unsigned_integer_dtype', 'is_period',
159+
'is_period_dtype', 'is_re', 'is_re_compilable',
160160
'is_dict_like', 'is_iterator',
161161
'is_list_like', 'is_hashable',
162162
'is_named_tuple', 'is_sequence',

pandas/core/algorithms.py

+36-14
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from pandas import compat, lib, tslib, _np_version_under1p8
1010
from pandas.types.cast import _maybe_promote
1111
from pandas.types.generic import ABCSeries, ABCIndex
12-
from pandas.types.common import (is_integer_dtype,
12+
from pandas.types.common import (is_unsigned_integer_dtype,
13+
is_signed_integer_dtype,
14+
is_integer_dtype,
1315
is_int64_dtype,
1416
is_categorical_dtype,
1517
is_extension_type,
@@ -479,8 +481,9 @@ def _value_counts_arraylike(values, dropna=True):
479481
keys, counts = htable.value_count_float64(values, dropna)
480482
else:
481483
values = _ensure_object(values)
484+
keys, counts = htable.value_count_object(values, dropna)
485+
482486
mask = isnull(values)
483-
keys, counts = htable.value_count_object(values, mask)
484487
if not dropna and mask.any():
485488
keys = np.insert(keys, 0, np.NaN)
486489
counts = np.insert(counts, 0, mask.sum())
@@ -490,12 +493,14 @@ def _value_counts_arraylike(values, dropna=True):
490493

491494
def duplicated(values, keep='first'):
492495
"""
493-
Return boolean ndarray denoting duplicate values
496+
Return boolean ndarray denoting duplicate values.
494497
495498
.. versionadded:: 0.19.0
496499
497500
Parameters
498501
----------
502+
values : ndarray-like
503+
Array over which to check for duplicate values.
499504
keep : {'first', 'last', False}, default 'first'
500505
- ``first`` : Mark duplicates as ``True`` except for the first
501506
occurrence.
@@ -521,9 +526,12 @@ def duplicated(values, keep='first'):
521526
elif isinstance(values, (ABCSeries, ABCIndex)):
522527
values = values.values
523528

524-
if is_integer_dtype(dtype):
529+
if is_signed_integer_dtype(dtype):
525530
values = _ensure_int64(values)
526531
duplicated = htable.duplicated_int64(values, keep=keep)
532+
elif is_unsigned_integer_dtype(dtype):
533+
values = _ensure_uint64(values)
534+
duplicated = htable.duplicated_uint64(values, keep=keep)
527535
elif is_float_dtype(dtype):
528536
values = _ensure_float64(values)
529537
duplicated = htable.duplicated_float64(values, keep=keep)
@@ -535,7 +543,19 @@ def duplicated(values, keep='first'):
535543

536544

537545
def mode(values):
538-
"""Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
546+
"""
547+
Returns the mode(s) of an array.
548+
549+
Parameters
550+
----------
551+
values : array-like
552+
Array over which to compute the mode(s).
553+
554+
Returns
555+
-------
556+
mode : Series
557+
"""
558+
539559
# must sort because hash order isn't necessarily defined.
540560
from pandas.core.series import Series
541561

@@ -547,23 +567,23 @@ def mode(values):
547567
constructor = Series
548568

549569
dtype = values.dtype
550-
if is_integer_dtype(values):
570+
if is_signed_integer_dtype(values):
551571
values = _ensure_int64(values)
552-
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
553-
572+
result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
573+
elif is_unsigned_integer_dtype(values):
574+
values = _ensure_uint64(values)
575+
result = constructor(np.sort(htable.mode_uint64(values)), dtype=dtype)
554576
elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
555577
dtype = values.dtype
556578
values = values.view(np.int64)
557-
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
558-
579+
result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
559580
elif is_categorical_dtype(values):
560581
result = constructor(values.mode())
561582
else:
562-
mask = isnull(values)
563583
values = _ensure_object(values)
564-
res = htable.mode_object(values, mask)
584+
res = htable.mode_object(values)
565585
try:
566-
res = sorted(res)
586+
res = np.sort(res)
567587
except TypeError as e:
568588
warn("Unable to sort modes: %s" % e)
569589
result = constructor(res, dtype=dtype)
@@ -893,8 +913,10 @@ def _hashtable_algo(f, values, return_dtype=None):
893913
dtype = values.dtype
894914
if is_float_dtype(dtype):
895915
return f(htable.Float64HashTable, _ensure_float64)
896-
elif is_integer_dtype(dtype):
916+
elif is_signed_integer_dtype(dtype):
897917
return f(htable.Int64HashTable, _ensure_int64)
918+
elif is_unsigned_integer_dtype(dtype):
919+
return f(htable.UInt64HashTable, _ensure_uint64)
898920
elif is_datetime64_dtype(dtype):
899921
return_dtype = return_dtype or 'M8[ns]'
900922
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)

pandas/hashtable.pyx

+2-159
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ cdef extern from "numpy/npy_math.h":
2222
cimport cython
2323
cimport numpy as cnp
2424

25+
from pandas.lib import checknull
26+
2527
cnp.import_array()
2628
cnp.import_ufunc()
2729

@@ -117,165 +119,6 @@ cdef class Int64Factorizer:
117119
return labels
118120

119121

120-
@cython.wraparound(False)
121-
@cython.boundscheck(False)
122-
cdef build_count_table_object(ndarray[object] values,
123-
ndarray[uint8_t, cast=True] mask,
124-
kh_pymap_t *table):
125-
cdef:
126-
khiter_t k
127-
Py_ssize_t i, n = len(values)
128-
int ret = 0
129-
130-
kh_resize_pymap(table, n // 10)
131-
132-
for i in range(n):
133-
if mask[i]:
134-
continue
135-
136-
val = values[i]
137-
k = kh_get_pymap(table, <PyObject*> val)
138-
if k != table.n_buckets:
139-
table.vals[k] += 1
140-
else:
141-
k = kh_put_pymap(table, <PyObject*> val, &ret)
142-
table.vals[k] = 1
143-
144-
145-
@cython.wraparound(False)
146-
@cython.boundscheck(False)
147-
cpdef value_count_object(ndarray[object] values,
148-
ndarray[uint8_t, cast=True] mask):
149-
cdef:
150-
Py_ssize_t i
151-
kh_pymap_t *table
152-
int k
153-
154-
table = kh_init_pymap()
155-
build_count_table_object(values, mask, table)
156-
157-
i = 0
158-
result_keys = np.empty(table.n_occupied, dtype=object)
159-
result_counts = np.zeros(table.n_occupied, dtype=np.int64)
160-
for k in range(table.n_buckets):
161-
if kh_exist_pymap(table, k):
162-
result_keys[i] = <object> table.keys[k]
163-
result_counts[i] = table.vals[k]
164-
i += 1
165-
kh_destroy_pymap(table)
166-
167-
return result_keys, result_counts
168-
169-
170-
@cython.wraparound(False)
171-
@cython.boundscheck(False)
172-
def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
173-
cdef:
174-
int count, max_count = 2
175-
int j = -1 # so you can do +=
176-
int k
177-
ndarray[object] modes
178-
kh_pymap_t *table
179-
180-
table = kh_init_pymap()
181-
build_count_table_object(values, mask, table)
182-
183-
modes = np.empty(table.n_buckets, dtype=np.object_)
184-
for k in range(table.n_buckets):
185-
if kh_exist_pymap(table, k):
186-
count = table.vals[k]
187-
188-
if count == max_count:
189-
j += 1
190-
elif count > max_count:
191-
max_count = count
192-
j = 0
193-
else:
194-
continue
195-
modes[j] = <object> table.keys[k]
196-
197-
kh_destroy_pymap(table)
198-
199-
return modes[:j + 1]
200-
201-
202-
@cython.wraparound(False)
203-
@cython.boundscheck(False)
204-
def mode_int64(int64_t[:] values):
205-
cdef:
206-
int count, max_count = 2
207-
int j = -1 # so you can do +=
208-
int k
209-
kh_int64_t *table
210-
ndarray[int64_t] modes
211-
212-
table = kh_init_int64()
213-
214-
build_count_table_int64(values, table, 0)
215-
216-
modes = np.empty(table.n_buckets, dtype=np.int64)
217-
218-
with nogil:
219-
for k in range(table.n_buckets):
220-
if kh_exist_int64(table, k):
221-
count = table.vals[k]
222-
223-
if count == max_count:
224-
j += 1
225-
elif count > max_count:
226-
max_count = count
227-
j = 0
228-
else:
229-
continue
230-
modes[j] = table.keys[k]
231-
232-
kh_destroy_int64(table)
233-
234-
return modes[:j + 1]
235-
236-
237-
@cython.wraparound(False)
238-
@cython.boundscheck(False)
239-
def duplicated_object(ndarray[object] values, object keep='first'):
240-
cdef:
241-
Py_ssize_t i, n
242-
dict seen = dict()
243-
object row
244-
245-
n = len(values)
246-
cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
247-
248-
if keep == 'last':
249-
for i from n > i >= 0:
250-
row = values[i]
251-
if row in seen:
252-
result[i] = 1
253-
else:
254-
seen[row] = i
255-
result[i] = 0
256-
elif keep == 'first':
257-
for i from 0 <= i < n:
258-
row = values[i]
259-
if row in seen:
260-
result[i] = 1
261-
else:
262-
seen[row] = i
263-
result[i] = 0
264-
elif keep is False:
265-
for i from 0 <= i < n:
266-
row = values[i]
267-
if row in seen:
268-
result[i] = 1
269-
result[seen[row]] = 1
270-
else:
271-
seen[row] = i
272-
result[i] = 0
273-
else:
274-
raise ValueError('keep must be either "first", "last" or False')
275-
276-
return result.view(np.bool_)
277-
278-
279122
@cython.wraparound(False)
280123
@cython.boundscheck(False)
281124
def unique_label_indices(ndarray[int64_t, ndim=1] labels):

0 commit comments

Comments
 (0)