Skip to content

Commit b47353a

Browse files
committed
DOC, TST, BUG: Improve uint64 core/algos behavior
1) duplicated() Updates documentation to describe the "values" parameter in the signature, adds tests for uint64, and refactors to use duplicated_uint64. 2) mode() Updates documentation to describe the "values" parameter in the signature, adds tests for uint64, and refactors to use mode_uint64. 3) unique() Uses UInt64HashTable to patch a uint64 overflow bug analogous to that seen in Series.unique (patched in pandas-dev gh-14915). 4) Types API Introduces "is_signed_integer_dtype" and "is_unsigned_integer_dtype" to the public API. Used in refactoring/patching of 1-3.
1 parent 73e2829 commit b47353a

File tree

8 files changed

+162
-46
lines changed

8 files changed

+162
-46
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -298,5 +298,6 @@ Bug Fixes
298298

299299

300300
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
301+
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
301302
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
302303
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)

pandas/api/tests/test_api.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -153,10 +153,10 @@ class TestTypes(Base, tm.TestCase):
153153
'is_floating_dtype', 'is_int64_dtype', 'is_integer',
154154
'is_integer_dtype', 'is_number', 'is_numeric_dtype',
155155
'is_object_dtype', 'is_scalar', 'is_sparse',
156-
'is_string_dtype',
156+
'is_string_dtype', 'is_signed_integer_dtype',
157157
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
158-
'is_period', 'is_period_dtype',
159-
'is_re', 'is_re_compilable',
158+
'is_unsigned_integer_dtype', 'is_period',
159+
'is_period_dtype', 'is_re', 'is_re_compilable',
160160
'is_dict_like', 'is_iterator',
161161
'is_list_like', 'is_hashable',
162162
'is_named_tuple', 'is_sequence',

pandas/core/algorithms.py

+30-7
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from pandas import compat, lib, tslib, _np_version_under1p8
1010
from pandas.types.cast import _maybe_promote
1111
from pandas.types.generic import ABCSeries, ABCIndex
12-
from pandas.types.common import (is_integer_dtype,
12+
from pandas.types.common import (is_unsigned_integer_dtype,
13+
is_signed_integer_dtype,
14+
is_integer_dtype,
1315
is_int64_dtype,
1416
is_categorical_dtype,
1517
is_extension_type,
@@ -490,12 +492,14 @@ def _value_counts_arraylike(values, dropna=True):
490492

491493
def duplicated(values, keep='first'):
492494
"""
493-
Return boolean ndarray denoting duplicate values
495+
Return boolean ndarray denoting duplicate values.
494496
495497
.. versionadded:: 0.19.0
496498
497499
Parameters
498500
----------
501+
values : ndarray-like
502+
Array over which to check for duplicate values.
499503
keep : {'first', 'last', False}, default 'first'
500504
- ``first`` : Mark duplicates as ``True`` except for the first
501505
occurrence.
@@ -521,9 +525,12 @@ def duplicated(values, keep='first'):
521525
elif isinstance(values, (ABCSeries, ABCIndex)):
522526
values = values.values
523527

524-
if is_integer_dtype(dtype):
528+
if is_signed_integer_dtype(dtype):
525529
values = _ensure_int64(values)
526530
duplicated = htable.duplicated_int64(values, keep=keep)
531+
elif is_unsigned_integer_dtype(dtype):
532+
values = _ensure_uint64(values)
533+
duplicated = htable.duplicated_uint64(values, keep=keep)
527534
elif is_float_dtype(dtype):
528535
values = _ensure_float64(values)
529536
duplicated = htable.duplicated_float64(values, keep=keep)
@@ -535,7 +542,19 @@ def duplicated(values, keep='first'):
535542

536543

537544
def mode(values):
538-
"""Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
545+
"""
546+
Returns the mode(s) of an array.
547+
548+
Parameters
549+
----------
550+
values : array-like
551+
Array over which to compute the mode(s).
552+
553+
Returns
554+
-------
555+
mode : Series
556+
"""
557+
539558
# must sort because hash order isn't necessarily defined.
540559
from pandas.core.series import Series
541560

@@ -547,10 +566,12 @@ def mode(values):
547566
constructor = Series
548567

549568
dtype = values.dtype
550-
if is_integer_dtype(values):
569+
if is_signed_integer_dtype(values):
551570
values = _ensure_int64(values)
552571
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
553-
572+
elif is_unsigned_integer_dtype(values):
573+
values = _ensure_uint64(values)
574+
result = constructor(sorted(htable.mode_uint64(values)), dtype=dtype)
554575
elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
555576
dtype = values.dtype
556577
values = values.view(np.int64)
@@ -893,8 +914,10 @@ def _hashtable_algo(f, values, return_dtype=None):
893914
dtype = values.dtype
894915
if is_float_dtype(dtype):
895916
return f(htable.Float64HashTable, _ensure_float64)
896-
elif is_integer_dtype(dtype):
917+
elif is_signed_integer_dtype(dtype):
897918
return f(htable.Int64HashTable, _ensure_int64)
919+
elif is_unsigned_integer_dtype(dtype):
920+
return f(htable.UInt64HashTable, _ensure_uint64)
898921
elif is_datetime64_dtype(dtype):
899922
return_dtype = return_dtype or 'M8[ns]'
900923
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)

pandas/hashtable.pyx

-35
Original file line numberDiff line numberDiff line change
@@ -199,41 +199,6 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
199199
return modes[:j + 1]
200200

201201

202-
@cython.wraparound(False)
203-
@cython.boundscheck(False)
204-
def mode_int64(int64_t[:] values):
205-
cdef:
206-
int count, max_count = 2
207-
int j = -1 # so you can do +=
208-
int k
209-
kh_int64_t *table
210-
ndarray[int64_t] modes
211-
212-
table = kh_init_int64()
213-
214-
build_count_table_int64(values, table, 0)
215-
216-
modes = np.empty(table.n_buckets, dtype=np.int64)
217-
218-
with nogil:
219-
for k in range(table.n_buckets):
220-
if kh_exist_int64(table, k):
221-
count = table.vals[k]
222-
223-
if count == max_count:
224-
j += 1
225-
elif count > max_count:
226-
max_count = count
227-
j = 0
228-
else:
229-
continue
230-
modes[j] = table.keys[k]
231-
232-
kh_destroy_int64(table)
233-
234-
return modes[:j + 1]
235-
236-
237202
@cython.wraparound(False)
238203
@cython.boundscheck(False)
239204
def duplicated_object(ndarray[object] values, object keep='first'):

pandas/src/hashtable_func_helper.pxi.in

+52
Original file line numberDiff line numberDiff line change
@@ -112,3 +112,55 @@ def duplicated_{{dtype}}({{dtype}}_t[:] values,
112112
return out
113113

114114
{{endfor}}
115+
116+
#----------------------------------------------------------------------
117+
# Mode Computations
118+
#----------------------------------------------------------------------
119+
120+
{{py:
121+
122+
# Note that mode is also implemented for object,
123+
# but it takes different parameters.
124+
125+
# dtype, ctype
126+
dtypes = [('int64', 'int64_t'),
127+
('uint64', 'uint64_t')]
128+
}}
129+
130+
{{for dtype, ctype in dtypes}}
131+
132+
@cython.wraparound(False)
133+
@cython.boundscheck(False)
134+
def mode_{{dtype}}({{ctype}}[:] values):
135+
cdef:
136+
int count, max_count = 2
137+
int j = -1 # so you can do +=
138+
int k
139+
kh_{{ctype}} *table
140+
ndarray[{{ctype}}] modes
141+
142+
table = kh_init_{{dtype}}()
143+
144+
build_count_table_{{dtype}}(values, table, 0)
145+
146+
modes = np.empty(table.n_buckets, dtype=np.{{dtype}})
147+
148+
with nogil:
149+
for k in range(table.n_buckets):
150+
if kh_exist_{{dtype}}(table, k):
151+
count = table.vals[k]
152+
153+
if count == max_count:
154+
j += 1
155+
elif count > max_count:
156+
max_count = count
157+
j = 0
158+
else:
159+
continue
160+
modes[j] = table.keys[k]
161+
162+
kh_destroy_{{dtype}}(table)
163+
164+
return modes[:j + 1]
165+
166+
{{endfor}}

pandas/tests/test_algos.py

+62-1
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,11 @@ def test_timedelta64_dtype_array_returned(self):
365365
tm.assert_numpy_array_equal(result, expected)
366366
self.assertEqual(result.dtype, expected.dtype)
367367

368+
def test_uint64_overflow(self):
369+
s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
370+
exp = np.array([1, 2, 2**63], dtype=np.uint64)
371+
tm.assert_numpy_array_equal(algos.unique(s), exp)
372+
368373

369374
class TestIsin(tm.TestCase):
370375
_multiprocess_can_split_ = True
@@ -672,7 +677,9 @@ def test_numeric_object_likes(self):
672677
np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
673678
2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]),
674679
np.array(['a', 'b', 'a', 'e', 'c',
675-
'b', 'd', 'a', 'e', 'f'], dtype=object)]
680+
'b', 'd', 'a', 'e', 'f'], dtype=object),
681+
np.array([1, 2**63, 1, 3**5, 10,
682+
2**63, 39, 1, 3**5, 7], dtype=np.uint64)]
676683

677684
exp_first = np.array([False, False, True, False, False,
678685
True, False, True, True, False])
@@ -1202,6 +1209,60 @@ def test_int64_add_overflow():
12021209
b_mask=np.array([False, True]))
12031210

12041211

1212+
class TestMode(tm.TestCase):
1213+
1214+
def test_basic(self):
1215+
s = Series([1, 2], dtype=np.intp)
1216+
exp = Series([], dtype=np.intp)
1217+
tm.assert_series_equal(algos.mode(s), exp)
1218+
1219+
s = Series([1, 2, 2], dtype=np.intp)
1220+
exp = Series([2], dtype=np.intp)
1221+
tm.assert_series_equal(algos.mode(s), exp)
1222+
1223+
s = Series([1, 1, 2, 3, 3], dtype=np.intp)
1224+
exp = Series([1, 3], dtype=np.intp)
1225+
tm.assert_series_equal(algos.mode(s), exp)
1226+
1227+
def test_categorical(self):
1228+
c = Categorical([1, 2])
1229+
exp = Series([], dtype=np.intp)
1230+
tm.assert_series_equal(algos.mode(c), exp)
1231+
1232+
c = Categorical([1, 2, 2])
1233+
exp = Series([2], dtype=np.intp)
1234+
tm.assert_series_equal(algos.mode(c), exp)
1235+
1236+
c = Categorical([1, 1, 2, 3, 3])
1237+
exp = Series([1, 3], dtype=np.intp)
1238+
tm.assert_series_equal(algos.mode(c), exp)
1239+
1240+
def test_nonnumeric(self):
1241+
s = Series([1, 'foo', 'foo'])
1242+
exp = Series(['foo'])
1243+
tm.assert_series_equal(algos.mode(s), exp)
1244+
1245+
s = Series([1, 2, 'foo', 'foo', 2])
1246+
exp = Series(['foo', 2])
1247+
1248+
# Cannot sort "int" and "str" together
1249+
with tm.assert_produces_warning(UserWarning):
1250+
tm.assert_series_equal(algos.mode(s), exp)
1251+
1252+
def test_uint64(self):
1253+
s = Series([1, 2**63, 2**63], dtype=np.uint64)
1254+
exp = Series([2**63], dtype=np.uint64)
1255+
1256+
tm.assert_series_equal(algos.mode(s), exp)
1257+
tm.assert_series_equal(s.mode(), exp)
1258+
1259+
s = Series([1, 2**63], dtype=np.uint64)
1260+
exp = Series([], dtype=np.uint64)
1261+
1262+
tm.assert_series_equal(algos.mode(s), exp)
1263+
tm.assert_series_equal(s.mode(), exp)
1264+
1265+
12051266
if __name__ == '__main__':
12061267
import nose
12071268
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/types/api.py

+2
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
is_floating_dtype,
4545
is_bool_dtype,
4646
is_complex_dtype,
47+
is_signed_integer_dtype,
48+
is_unsigned_integer_dtype,
4749

4850
# like
4951
is_re,

pandas/types/common.py

+12
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,18 @@ def is_integer_dtype(arr_or_dtype):
155155
not issubclass(tipo, (np.datetime64, np.timedelta64)))
156156

157157

158+
def is_signed_integer_dtype(arr_or_dtype):
159+
tipo = _get_dtype_type(arr_or_dtype)
160+
return (issubclass(tipo, np.signedinteger) and
161+
not issubclass(tipo, (np.datetime64, np.timedelta64)))
162+
163+
164+
def is_unsigned_integer_dtype(arr_or_dtype):
165+
tipo = _get_dtype_type(arr_or_dtype)
166+
return (issubclass(tipo, np.unsignedinteger) and
167+
not issubclass(tipo, (np.datetime64, np.timedelta64)))
168+
169+
158170
def is_int64_dtype(arr_or_dtype):
159171
tipo = _get_dtype_type(arr_or_dtype)
160172
return issubclass(tipo, np.int64)

0 commit comments

Comments
 (0)