Skip to content

BUG, TST: Check uint64 behaviour in algorithms.py #14934

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -298,5 +298,6 @@ Bug Fixes


- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
6 changes: 3 additions & 3 deletions pandas/api/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,10 @@ class TestTypes(Base, tm.TestCase):
'is_floating_dtype', 'is_int64_dtype', 'is_integer',
'is_integer_dtype', 'is_number', 'is_numeric_dtype',
'is_object_dtype', 'is_scalar', 'is_sparse',
'is_string_dtype',
'is_string_dtype', 'is_signed_integer_dtype',
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
'is_period', 'is_period_dtype',
'is_re', 'is_re_compilable',
'is_unsigned_integer_dtype', 'is_period',
'is_period_dtype', 'is_re', 'is_re_compilable',
'is_dict_like', 'is_iterator',
'is_list_like', 'is_hashable',
'is_named_tuple', 'is_sequence',
Expand Down
50 changes: 36 additions & 14 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
from pandas import compat, lib, tslib, _np_version_under1p8
from pandas.types.cast import _maybe_promote
from pandas.types.generic import ABCSeries, ABCIndex
from pandas.types.common import (is_integer_dtype,
from pandas.types.common import (is_unsigned_integer_dtype,
is_signed_integer_dtype,
is_integer_dtype,
is_int64_dtype,
is_categorical_dtype,
is_extension_type,
Expand Down Expand Up @@ -479,8 +481,9 @@ def _value_counts_arraylike(values, dropna=True):
keys, counts = htable.value_count_float64(values, dropna)
else:
values = _ensure_object(values)
keys, counts = htable.value_count_object(values, dropna)

mask = isnull(values)
keys, counts = htable.value_count_object(values, mask)
if not dropna and mask.any():
keys = np.insert(keys, 0, np.NaN)
counts = np.insert(counts, 0, mask.sum())
Expand All @@ -490,12 +493,14 @@ def _value_counts_arraylike(values, dropna=True):

def duplicated(values, keep='first'):
"""
Return boolean ndarray denoting duplicate values
Return boolean ndarray denoting duplicate values.
.. versionadded:: 0.19.0
Parameters
----------
values : ndarray-like
Array over which to check for duplicate values.
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first
occurrence.
Expand All @@ -521,9 +526,12 @@ def duplicated(values, keep='first'):
elif isinstance(values, (ABCSeries, ABCIndex)):
values = values.values

if is_integer_dtype(dtype):
if is_signed_integer_dtype(dtype):
values = _ensure_int64(values)
duplicated = htable.duplicated_int64(values, keep=keep)
elif is_unsigned_integer_dtype(dtype):
values = _ensure_uint64(values)
duplicated = htable.duplicated_uint64(values, keep=keep)
elif is_float_dtype(dtype):
values = _ensure_float64(values)
duplicated = htable.duplicated_float64(values, keep=keep)
Expand All @@ -535,7 +543,19 @@ def duplicated(values, keep='first'):


def mode(values):
"""Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
"""
Returns the mode(s) of an array.
Parameters
----------
values : array-like
Array over which to determine the mode(s).
Returns
-------
mode : Series
"""

# must sort because hash order isn't necessarily defined.
from pandas.core.series import Series

Expand All @@ -547,23 +567,23 @@ def mode(values):
constructor = Series

dtype = values.dtype
if is_integer_dtype(values):
if is_signed_integer_dtype(values):
values = _ensure_int64(values)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rather make this less magical and just select based on dtype. A reader will be confused by this; better to do it on dtype.

result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
elif is_unsigned_integer_dtype(values):
values = _ensure_uint64(values)
result = constructor(np.sort(htable.mode_uint64(values)), dtype=dtype)
elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
dtype = values.dtype
values = values.view(np.int64)
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
elif is_categorical_dtype(values):
result = constructor(values.mode())
else:
mask = isnull(values)
values = _ensure_object(values)
res = htable.mode_object(values, mask)
res = htable.mode_object(values)
try:
res = sorted(res)
res = np.sort(res)
except TypeError as e:
warn("Unable to sort modes: %s" % e)
result = constructor(res, dtype=dtype)
Expand Down Expand Up @@ -893,8 +913,10 @@ def _hashtable_algo(f, values, return_dtype=None):
dtype = values.dtype
if is_float_dtype(dtype):
return f(htable.Float64HashTable, _ensure_float64)
elif is_integer_dtype(dtype):
elif is_signed_integer_dtype(dtype):
return f(htable.Int64HashTable, _ensure_int64)
elif is_unsigned_integer_dtype(dtype):
return f(htable.UInt64HashTable, _ensure_uint64)
elif is_datetime64_dtype(dtype):
return_dtype = return_dtype or 'M8[ns]'
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
Expand Down
161 changes: 2 additions & 159 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ cdef extern from "numpy/npy_math.h":
cimport cython
cimport numpy as cnp

from pandas.lib import checknull

cnp.import_array()
cnp.import_ufunc()

Expand Down Expand Up @@ -117,165 +119,6 @@ cdef class Int64Factorizer:
return labels


@cython.wraparound(False)
@cython.boundscheck(False)
cdef build_count_table_object(ndarray[object] values,
                              ndarray[uint8_t, cast=True] mask,
                              kh_pymap_t *table):
    # Fill `table` (a khash PyObject* -> int map) with the occurrence count
    # of each entry in `values`, skipping positions where `mask` is True.
    # The caller owns `table` and must call kh_destroy_pymap on it.
    cdef:
        khiter_t k
        Py_ssize_t i, n = len(values)
        int ret = 0

    # Pre-size the table; n // 10 is just a starting-capacity heuristic.
    kh_resize_pymap(table, n // 10)

    for i in range(n):
        if mask[i]:
            # masked positions (e.g. nulls) are excluded from the counts
            continue

        val = values[i]
        k = kh_get_pymap(table, <PyObject*> val)
        if k != table.n_buckets:
            # key already present: bump its count
            table.vals[k] += 1
        else:
            # new key: insert it with an initial count of 1
            k = kh_put_pymap(table, <PyObject*> val, &ret)
            table.vals[k] = 1


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef value_count_object(ndarray[object] values,
                         ndarray[uint8_t, cast=True] mask):
    """
    Count occurrences of each distinct unmasked value.

    Parameters
    ----------
    values : ndarray[object]
        Values to count.
    mask : ndarray[uint8_t]
        True for positions to exclude from the counts (e.g. nulls).

    Returns
    -------
    result_keys, result_counts : ndarray[object], ndarray[int64]
        Distinct values and their counts; ordering follows hash-table
        bucket order and is therefore unspecified.
    """
    cdef:
        Py_ssize_t i
        kh_pymap_t *table
        int k

    table = kh_init_pymap()
    build_count_table_object(values, mask, table)

    # Walk every bucket and copy the occupied entries into flat arrays.
    i = 0
    result_keys = np.empty(table.n_occupied, dtype=object)
    result_counts = np.zeros(table.n_occupied, dtype=np.int64)
    for k in range(table.n_buckets):
        if kh_exist_pymap(table, k):
            result_keys[i] = <object> table.keys[k]
            result_counts[i] = table.vals[k]
            i += 1
    kh_destroy_pymap(table)

    return result_keys, result_counts


@cython.wraparound(False)
@cython.boundscheck(False)
def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
    """
    Return the modal value(s) of `values`, ignoring masked positions.

    max_count starts at 2, so only values occurring at least twice
    qualify; an all-unique input yields an empty result. Ordering of
    the returned modes follows hash-table iteration and is unspecified.
    """
    cdef:
        int count, max_count = 2
        int j = -1  # so you can do +=
        int k
        ndarray[object] modes
        kh_pymap_t *table

    table = kh_init_pymap()
    build_count_table_object(values, mask, table)

    # n_buckets is an upper bound on the number of distinct values.
    modes = np.empty(table.n_buckets, dtype=np.object_)
    for k in range(table.n_buckets):
        if kh_exist_pymap(table, k):
            count = table.vals[k]

            if count == max_count:
                # tie with the current maximum: append another mode
                j += 1
            elif count > max_count:
                # new maximum: restart the list of modes
                max_count = count
                j = 0
            else:
                continue
            modes[j] = <object> table.keys[k]

    kh_destroy_pymap(table)

    return modes[:j + 1]


@cython.wraparound(False)
@cython.boundscheck(False)
def mode_int64(int64_t[:] values):
    """
    Return the modal value(s) of an int64 array.

    max_count starts at 2, so only values occurring at least twice
    qualify; an all-unique input yields an empty result. Ordering of
    the returned modes follows hash-table iteration and is unspecified.
    """
    cdef:
        int count, max_count = 2
        int j = -1  # so you can do +=
        int k
        kh_int64_t *table
        ndarray[int64_t] modes

    table = kh_init_int64()

    # NOTE(review): third argument presumably a dropna/skip flag set to
    # "keep everything" — confirm against build_count_table_int64.
    build_count_table_int64(values, table, 0)

    # n_buckets is an upper bound on the number of distinct values.
    modes = np.empty(table.n_buckets, dtype=np.int64)

    # Counting is done; the bucket scan touches only C-level data, so the
    # GIL can be released here.
    with nogil:
        for k in range(table.n_buckets):
            if kh_exist_int64(table, k):
                count = table.vals[k]

                if count == max_count:
                    # tie with the current maximum: append another mode
                    j += 1
                elif count > max_count:
                    # new maximum: restart the list of modes
                    max_count = count
                    j = 0
                else:
                    continue
                modes[j] = table.keys[k]

    kh_destroy_int64(table)

    return modes[:j + 1]


@cython.wraparound(False)
@cython.boundscheck(False)
def duplicated_object(ndarray[object] values, object keep='first'):
    """
    Flag duplicate entries in an object array.

    Parameters
    ----------
    values : ndarray[object]
        Values to scan for duplicates.
    keep : {'first', 'last', False}, default 'first'
        Which occurrence, if any, is NOT flagged: the first seen, the
        last seen, or none (``False`` flags every member of a
        duplicate group).

    Returns
    -------
    ndarray[bool]
        True where the entry is considered a duplicate.

    Raises
    ------
    ValueError
        If `keep` is not one of 'first', 'last' or False.
    """
    cdef:
        Py_ssize_t idx, length
        dict first_seen_at = {}
        object item

    length = len(values)
    cdef ndarray[uint8_t] flags = np.zeros(length, dtype=np.uint8)

    if keep == 'last':
        # Scan backwards so the final occurrence is the one kept.
        for idx in range(length - 1, -1, -1):
            item = values[idx]
            if item in first_seen_at:
                flags[idx] = 1
            else:
                first_seen_at[item] = idx
                flags[idx] = 0
    elif keep == 'first':
        for idx in range(length):
            item = values[idx]
            if item in first_seen_at:
                flags[idx] = 1
            else:
                first_seen_at[item] = idx
                flags[idx] = 0
    elif keep is False:
        # Flag every member of a duplicate group, including its first
        # occurrence (marked retroactively via the remembered index).
        for idx in range(length):
            item = values[idx]
            if item in first_seen_at:
                flags[idx] = 1
                flags[first_seen_at[item]] = 1
            else:
                first_seen_at[item] = idx
                flags[idx] = 0
    else:
        raise ValueError('keep must be either "first", "last" or False')

    return flags.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
def unique_label_indices(ndarray[int64_t, ndim=1] labels):
Expand Down
Loading