From 6d31057d0b1b9bb6f2840771a9aff3860f2b9b00 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 20 Dec 2016 12:39:09 -0500 Subject: [PATCH] DOC, TST, BUG: Improve uint64 core/algos behavior 1) duplicated() Updates documentation to describe the "values" parameter in the signature, adds tests for uint64, and refactors to use duplicated_uint64. 2) mode() Updates documentation to describe the "values" parameter in the signature, adds tests for uint64, and refactors to use mode_uint64. 3) unique() Uses UInt64HashTable to patch a uint64 overflow bug analogous to that seen in Series.unique (patched in gh-14915). 4) Types API Introduces "is_signed_integer_dtype" and "is_unsigned_integer_dtype" to the public API. Used in refactoring/patching of 1-3. --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/api/tests/test_api.py | 6 +- pandas/core/algorithms.py | 50 ++++-- pandas/hashtable.pyx | 161 +----------------- pandas/src/hashtable_func_helper.pxi.in | 209 +++++++++++++++++++++--- pandas/tests/series/test_analytics.py | 132 ++++++++++----- pandas/tests/test_algos.py | 121 +++++++++++++- pandas/types/api.py | 2 + pandas/types/common.py | 12 ++ 9 files changed, 457 insertions(+), 237 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index b32b9fbbab04e..ffd2fa90dc9e6 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -298,5 +298,6 @@ Bug Fixes - Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`) +- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`) - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) - Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`) diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index 49aa31c375e25..bc126447213ca 100644 --- a/pandas/api/tests/test_api.py +++ 
b/pandas/api/tests/test_api.py @@ -153,10 +153,10 @@ class TestTypes(Base, tm.TestCase): 'is_floating_dtype', 'is_int64_dtype', 'is_integer', 'is_integer_dtype', 'is_number', 'is_numeric_dtype', 'is_object_dtype', 'is_scalar', 'is_sparse', - 'is_string_dtype', + 'is_string_dtype', 'is_signed_integer_dtype', 'is_timedelta64_dtype', 'is_timedelta64_ns_dtype', - 'is_period', 'is_period_dtype', - 'is_re', 'is_re_compilable', + 'is_unsigned_integer_dtype', 'is_period', + 'is_period_dtype', 'is_re', 'is_re_compilable', 'is_dict_like', 'is_iterator', 'is_list_like', 'is_hashable', 'is_named_tuple', 'is_sequence', diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e51774ce4d9b4..1a967bdd7a1a3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -9,7 +9,9 @@ from pandas import compat, lib, tslib, _np_version_under1p8 from pandas.types.cast import _maybe_promote from pandas.types.generic import ABCSeries, ABCIndex -from pandas.types.common import (is_integer_dtype, +from pandas.types.common import (is_unsigned_integer_dtype, + is_signed_integer_dtype, + is_integer_dtype, is_int64_dtype, is_categorical_dtype, is_extension_type, @@ -479,8 +481,9 @@ def _value_counts_arraylike(values, dropna=True): keys, counts = htable.value_count_float64(values, dropna) else: values = _ensure_object(values) + keys, counts = htable.value_count_object(values, dropna) + mask = isnull(values) - keys, counts = htable.value_count_object(values, mask) if not dropna and mask.any(): keys = np.insert(keys, 0, np.NaN) counts = np.insert(counts, 0, mask.sum()) @@ -490,12 +493,14 @@ def _value_counts_arraylike(values, dropna=True): def duplicated(values, keep='first'): """ - Return boolean ndarray denoting duplicate values + Return boolean ndarray denoting duplicate values. .. versionadded:: 0.19.0 Parameters ---------- + values : ndarray-like + Array over which to check for duplicate values. 
keep : {'first', 'last', False}, default 'first' - ``first`` : Mark duplicates as ``True`` except for the first occurrence. @@ -521,9 +526,12 @@ def duplicated(values, keep='first'): elif isinstance(values, (ABCSeries, ABCIndex)): values = values.values - if is_integer_dtype(dtype): + if is_signed_integer_dtype(dtype): values = _ensure_int64(values) duplicated = htable.duplicated_int64(values, keep=keep) + elif is_unsigned_integer_dtype(dtype): + values = _ensure_uint64(values) + duplicated = htable.duplicated_uint64(values, keep=keep) elif is_float_dtype(dtype): values = _ensure_float64(values) duplicated = htable.duplicated_float64(values, keep=keep) @@ -535,7 +543,19 @@ def duplicated(values, keep='first'): def mode(values): - """Returns the mode or mode(s) of the passed Series or ndarray (sorted)""" + """ + Returns the mode(s) of an array. + + Parameters + ---------- + values : array-like + Array over which to check for duplicate values. + + Returns + ------- + mode : Series + """ + # must sort because hash order isn't necessarily defined. 
from pandas.core.series import Series @@ -547,23 +567,23 @@ def mode(values): constructor = Series dtype = values.dtype - if is_integer_dtype(values): + if is_signed_integer_dtype(values): values = _ensure_int64(values) - result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) - + result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype) + elif is_unsigned_integer_dtype(values): + values = _ensure_uint64(values) + result = constructor(np.sort(htable.mode_uint64(values)), dtype=dtype) elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): dtype = values.dtype values = values.view(np.int64) - result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) - + result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype) elif is_categorical_dtype(values): result = constructor(values.mode()) else: - mask = isnull(values) values = _ensure_object(values) - res = htable.mode_object(values, mask) + res = htable.mode_object(values) try: - res = sorted(res) + res = np.sort(res) except TypeError as e: warn("Unable to sort modes: %s" % e) result = constructor(res, dtype=dtype) @@ -893,8 +913,10 @@ def _hashtable_algo(f, values, return_dtype=None): dtype = values.dtype if is_float_dtype(dtype): return f(htable.Float64HashTable, _ensure_float64) - elif is_integer_dtype(dtype): + elif is_signed_integer_dtype(dtype): return f(htable.Int64HashTable, _ensure_int64) + elif is_unsigned_integer_dtype(dtype): + return f(htable.UInt64HashTable, _ensure_uint64) elif is_datetime64_dtype(dtype): return_dtype = return_dtype or 'M8[ns]' return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index ce760b49fabc0..276b0679070dc 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -22,6 +22,8 @@ cdef extern from "numpy/npy_math.h": cimport cython cimport numpy as cnp +from pandas.lib import checknull + cnp.import_array() cnp.import_ufunc() @@ -117,165 +119,6 @@ cdef 
class Int64Factorizer: return labels -@cython.wraparound(False) -@cython.boundscheck(False) -cdef build_count_table_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - kh_pymap_t *table): - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - int ret = 0 - - kh_resize_pymap(table, n // 10) - - for i in range(n): - if mask[i]: - continue - - val = values[i] - k = kh_get_pymap(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_pymap(table, val, &ret) - table.vals[k] = 1 - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef value_count_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask): - cdef: - Py_ssize_t i - kh_pymap_t *table - int k - - table = kh_init_pymap() - build_count_table_object(values, mask, table) - - i = 0 - result_keys = np.empty(table.n_occupied, dtype=object) - result_counts = np.zeros(table.n_occupied, dtype=np.int64) - for k in range(table.n_buckets): - if kh_exist_pymap(table, k): - result_keys[i] = table.keys[k] - result_counts[i] = table.vals[k] - i += 1 - kh_destroy_pymap(table) - - return result_keys, result_counts - - -@cython.wraparound(False) -@cython.boundscheck(False) -def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): - cdef: - int count, max_count = 2 - int j = -1 # so you can do += - int k - ndarray[object] modes - kh_pymap_t *table - - table = kh_init_pymap() - build_count_table_object(values, mask, table) - - modes = np.empty(table.n_buckets, dtype=np.object_) - for k in range(table.n_buckets): - if kh_exist_pymap(table, k): - count = table.vals[k] - - if count == max_count: - j += 1 - elif count > max_count: - max_count = count - j = 0 - else: - continue - modes[j] = table.keys[k] - - kh_destroy_pymap(table) - - return modes[:j + 1] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def mode_int64(int64_t[:] values): - cdef: - int count, max_count = 2 - int j = -1 # so you can do += - int k - kh_int64_t *table - 
ndarray[int64_t] modes - - table = kh_init_int64() - - build_count_table_int64(values, table, 0) - - modes = np.empty(table.n_buckets, dtype=np.int64) - - with nogil: - for k in range(table.n_buckets): - if kh_exist_int64(table, k): - count = table.vals[k] - - if count == max_count: - j += 1 - elif count > max_count: - max_count = count - j = 0 - else: - continue - modes[j] = table.keys[k] - - kh_destroy_int64(table) - - return modes[:j + 1] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def duplicated_object(ndarray[object] values, object keep='first'): - cdef: - Py_ssize_t i, n - dict seen = dict() - object row - - n = len(values) - cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) - - if keep == 'last': - for i from n > i >= 0: - row = values[i] - if row in seen: - result[i] = 1 - else: - seen[row] = i - result[i] = 0 - elif keep == 'first': - for i from 0 <= i < n: - row = values[i] - if row in seen: - result[i] = 1 - else: - seen[row] = i - result[i] = 0 - elif keep is False: - for i from 0 <= i < n: - row = values[i] - if row in seen: - result[i] = 1 - result[seen[row]] = 1 - else: - seen[row] = i - result[i] = 0 - else: - raise ValueError('keep must be either "first", "last" or False') - - return result.view(np.bool_) - - @cython.wraparound(False) @cython.boundscheck(False) def unique_label_indices(ndarray[int64_t, ndim=1] labels): diff --git a/pandas/src/hashtable_func_helper.pxi.in b/pandas/src/hashtable_func_helper.pxi.in index f3e16cfd32963..c292256767315 100644 --- a/pandas/src/hashtable_func_helper.pxi.in +++ b/pandas/src/hashtable_func_helper.pxi.in @@ -10,105 +10,272 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name -dtypes = ['float64', 'int64', 'uint64'] +# dtype, ttype +dtypes = [('float64', 'float64'), + ('uint64', 'uint64'), + ('object', 'pymap'), + ('int64', 'int64')] }} -{{for dtype in dtypes}} +{{for dtype, ttype in dtypes}} @cython.wraparound(False) @cython.boundscheck(False) +{{if 
dtype == 'object'}} +cdef build_count_table_{{dtype}}(ndarray[{{dtype}}] values, + kh_{{ttype}}_t *table, bint dropna): +{{else}} cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, - kh_{{dtype}}_t *table, bint dropna): + kh_{{ttype}}_t *table, bint dropna): +{{endif}} cdef: khiter_t k Py_ssize_t i, n = len(values) + + {{if dtype != 'object'}} {{dtype}}_t val + {{endif}} + int ret = 0 + {{if dtype == 'object'}} + kh_resize_{{ttype}}(table, n // 10) + + for i in range(n): + val = values[i] + + if not checknull(val) or not dropna: + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = 1 + {{else}} with nogil: - kh_resize_{{dtype}}(table, n) + kh_resize_{{ttype}}(table, n) for i in range(n): val = values[i] if val == val or not dropna: - k = kh_get_{{dtype}}(table, val) + k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: table.vals[k] += 1 else: - k = kh_put_{{dtype}}(table, val, &ret) + k = kh_put_{{ttype}}(table, val, &ret) table.vals[k] = 1 + {{endif}} @cython.wraparound(False) @cython.boundscheck(False) +{{if dtype == 'object'}} +cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): +{{else}} cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna): +{{endif}} cdef: Py_ssize_t i=0 - kh_{{dtype}}_t *table + kh_{{ttype}}_t *table + + {{if dtype != 'object'}} {{dtype}}_t[:] result_keys int64_t[:] result_counts + {{endif}} + int k - table = kh_init_{{dtype}}() + table = kh_init_{{ttype}}() + {{if dtype == 'object'}} + build_count_table_{{dtype}}(values, table, 1) + {{else}} build_count_table_{{dtype}}(values, table, dropna) + {{endif}} result_keys = np.empty(table.n_occupied, dtype=np.{{dtype}}) result_counts = np.zeros(table.n_occupied, dtype=np.int64) + {{if dtype == 'object'}} + for k in range(table.n_buckets): + if kh_exist_{{ttype}}(table, k): + result_keys[i] = <{{dtype}}> table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + 
{{else}} with nogil: for k in range(table.n_buckets): - if kh_exist_{{dtype}}(table, k): + if kh_exist_{{ttype}}(table, k): result_keys[i] = table.keys[k] result_counts[i] = table.vals[k] i += 1 - kh_destroy_{{dtype}}(table) + {{endif}} + kh_destroy_{{ttype}}(table) + + {{if dtype == 'object'}} + return result_keys, result_counts + {{else}} return np.asarray(result_keys), np.asarray(result_counts) + {{endif}} @cython.wraparound(False) @cython.boundscheck(False) -def duplicated_{{dtype}}({{dtype}}_t[:] values, - object keep='first'): +{{if dtype == 'object'}} + + +def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): +{{else}} + + +def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'): +{{endif}} cdef: int ret = 0, k + {{if dtype != 'object'}} {{dtype}}_t value + {{endif}} Py_ssize_t i, n = len(values) - kh_{{dtype}}_t * table = kh_init_{{dtype}}() + kh_{{ttype}}_t * table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - kh_resize_{{dtype}}(table, min(n, _SIZE_HINT_LIMIT)) + kh_resize_{{ttype}}(table, min(n, _SIZE_HINT_LIMIT)) if keep not in ('last', 'first', False): raise ValueError('keep must be either "first", "last" or False') if keep == 'last': + {{if dtype == 'object'}} + for i from n > i >= 0: + kh_put_{{ttype}}(table, values[i], &ret) + out[i] = ret == 0 + {{else}} with nogil: - for i from n > i >=0: - kh_put_{{dtype}}(table, values[i], &ret) + for i from n > i >= 0: + kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 + {{endif}} elif keep == 'first': + {{if dtype == 'object'}} + for i from 0 <= i < n: + kh_put_{{ttype}}(table, values[i], &ret) + out[i] = ret == 0 + {{else}} with nogil: for i from 0 <= i < n: - kh_put_{{dtype}}(table, values[i], &ret) + kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 + {{endif}} else: + {{if dtype == 'object'}} + for i from 0 <= i < n: + value = values[i] + k = kh_get_{{ttype}}(table, value) + if k != table.n_buckets: + 
out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_{{ttype}}(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + {{else}} with nogil: for i from 0 <= i < n: value = values[i] - k = kh_get_{{dtype}}(table, value) + k = kh_get_{{ttype}}(table, value) if k != table.n_buckets: out[table.vals[k]] = 1 out[i] = 1 else: - k = kh_put_{{dtype}}(table, value, &ret) + k = kh_put_{{ttype}}(table, value, &ret) table.keys[k] = value table.vals[k] = i out[i] = 0 - kh_destroy_{{dtype}}(table) + {{endif}} + kh_destroy_{{ttype}}(table) return out {{endfor}} + +#---------------------------------------------------------------------- +# Mode Computations +#---------------------------------------------------------------------- + +{{py: + +# dtype, ctype, table_type, npy_dtype +dtypes = [('int64', 'int64_t', 'int64', 'int64'), + ('uint64', 'uint64_t', 'uint64', 'uint64'), + ('object', 'object', 'pymap', 'object_')] +}} + +{{for dtype, ctype, table_type, npy_dtype in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) + +{{if dtype == 'object'}} + + +def mode_{{dtype}}(ndarray[{{ctype}}] values): +{{else}} + + +def mode_{{dtype}}({{ctype}}[:] values): +{{endif}} + cdef: + int count, max_count = 2 + int j = -1 # so you can do += + int k + kh_{{table_type}}_t *table + ndarray[{{ctype}}] modes + + table = kh_init_{{table_type}}() + {{if dtype == 'object'}} + build_count_table_{{dtype}}(values, table, 1) + {{else}} + build_count_table_{{dtype}}(values, table, 0) + {{endif}} + + modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}}) + + {{if dtype != 'object'}} + with nogil: + for k in range(table.n_buckets): + if kh_exist_{{table_type}}(table, k): + count = table.vals[k] + + if count == max_count: + j += 1 + elif count > max_count: + max_count = count + j = 0 + else: + continue + + modes[j] = table.keys[k] + {{else}} + for k in range(table.n_buckets): + if kh_exist_{{table_type}}(table, k): + count = table.vals[k] + + if count == max_count: + 
j += 1 + elif count > max_count: + max_count = count + j = 0 + else: + continue + + modes[j] = table.keys[k] + {{endif}} + + kh_destroy_{{table_type}}(table) + + return modes[:j + 1] + +{{endfor}} diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index d4c209d4532e4..3896e255f0c2f 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -10,8 +10,8 @@ import numpy as np import pandas as pd -from pandas import (Series, DataFrame, isnull, notnull, bdate_range, - date_range, _np_version_under1p10) +from pandas import (Series, Categorical, DataFrame, isnull, notnull, + bdate_range, date_range, _np_version_under1p10) from pandas.core.index import MultiIndex from pandas.tseries.index import Timestamp from pandas.tseries.tdi import Timedelta @@ -128,45 +128,99 @@ def test_median(self): self.assertAlmostEqual(np.median(int_ts), int_ts.median()) def test_mode(self): - s = Series([12, 12, 11, 10, 19, 11]) - exp = Series([11, 12]) - assert_series_equal(s.mode(), exp) - - assert_series_equal( - Series([1, 2, 3]).mode(), Series( - [], dtype='int64')) - - lst = [5] * 20 + [1] * 10 + [6] * 25 - np.random.shuffle(lst) - s = Series(lst) - assert_series_equal(s.mode(), Series([6])) - - s = Series([5] * 10) - assert_series_equal(s.mode(), Series([5])) - - s = Series(lst) - s[0] = np.nan - assert_series_equal(s.mode(), Series([6.])) - - s = Series(list('adfasbasfwewefwefweeeeasdfasnbam')) - assert_series_equal(s.mode(), Series(['e'])) - - s = Series(['2011-01-03', '2013-01-02', '1900-05-03'], dtype='M8[ns]') - assert_series_equal(s.mode(), Series([], dtype="M8[ns]")) - s = Series(['2011-01-03', '2013-01-02', '1900-05-03', '2011-01-03', - '2013-01-02'], dtype='M8[ns]') - assert_series_equal(s.mode(), Series(['2011-01-03', '2013-01-02'], - dtype='M8[ns]')) - - # GH 5986 - s = Series(['1 days', '-1 days', '0 days'], dtype='timedelta64[ns]') - assert_series_equal(s.mode(), Series([], 
dtype='timedelta64[ns]')) + # No mode should be found. + exp = Series([], dtype=np.float64) + tm.assert_series_equal(Series([]).mode(), exp) + + exp = Series([], dtype=np.int64) + tm.assert_series_equal(Series([1]).mode(), exp) + + exp = Series([], dtype=np.object) + tm.assert_series_equal(Series(['a', 'b', 'c']).mode(), exp) + + # Test numerical data types. + exp_single = [1] + data_single = [1] * 5 + [2] * 3 + + exp_multi = [1, 3] + data_multi = [1] * 5 + [2] * 3 + [3] * 5 + + for dt in np.typecodes['AllInteger'] + np.typecodes['Float']: + s = Series(data_single, dtype=dt) + exp = Series(exp_single, dtype=dt) + tm.assert_series_equal(s.mode(), exp) + + s = Series(data_multi, dtype=dt) + exp = Series(exp_multi, dtype=dt) + tm.assert_series_equal(s.mode(), exp) + + # Test string and object types. + exp = ['b'] + data = ['a'] * 2 + ['b'] * 3 + + s = Series(data, dtype='c') + exp = Series(exp, dtype='c') + tm.assert_series_equal(s.mode(), exp) + + exp = ['bar'] + data = ['foo'] * 2 + ['bar'] * 3 + + for dt in [str, object]: + s = Series(data, dtype=dt) + exp = Series(exp, dtype=dt) + tm.assert_series_equal(s.mode(), exp) + + # Test datetime types. + exp = Series([], dtype="M8[ns]") + s = Series(['2011-01-03', '2013-01-02', + '1900-05-03'], dtype='M8[ns]') + tm.assert_series_equal(s.mode(), exp) + + exp = Series(['2011-01-03', '2013-01-02'], dtype='M8[ns]') + s = Series(['2011-01-03', '2013-01-02', '1900-05-03', + '2011-01-03', '2013-01-02'], dtype='M8[ns]') + tm.assert_series_equal(s.mode(), exp) + + # gh-5986: Test timedelta types. 
+ exp = Series([], dtype='timedelta64[ns]') + s = Series(['1 days', '-1 days', '0 days'], + dtype='timedelta64[ns]') + tm.assert_series_equal(s.mode(), exp) + exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]') s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min'], - dtype='timedelta64[ns]') - assert_series_equal(s.mode(), Series(['2 min', '1 day'], - dtype='timedelta64[ns]')) + '2 min', '2 min'], dtype='timedelta64[ns]') + tm.assert_series_equal(s.mode(), exp) + + # Test mixed dtype. + exp = Series(['foo']) + s = Series([1, 'foo', 'foo']) + tm.assert_series_equal(s.mode(), exp) + + # Test for uint64 overflow. + exp = Series([2**63], dtype=np.uint64) + s = Series([1, 2**63, 2**63], dtype=np.uint64) + tm.assert_series_equal(s.mode(), exp) + + exp = Series([], dtype=np.uint64) + s = Series([1, 2**63], dtype=np.uint64) + tm.assert_series_equal(s.mode(), exp) + + # Test category dtype. + c = Categorical([1, 2]) + exp = Categorical([], categories=[1, 2]) + exp = Series(exp, dtype='category') + tm.assert_series_equal(Series(c).mode(), exp) + + c = Categorical([1, 'a', 'a']) + exp = Categorical(['a'], categories=[1, 'a']) + exp = Series(exp, dtype='category') + tm.assert_series_equal(Series(c).mode(), exp) + + c = Categorical([1, 1, 2, 3, 3]) + exp = Categorical([1, 3], categories=[1, 2, 3]) + exp = Series(exp, dtype='category') + tm.assert_series_equal(Series(c).mode(), exp) def test_prod(self): self._check_stat_op('prod', np.prod) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 7f1745edbb816..e360089928000 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -365,6 +365,11 @@ def test_timedelta64_dtype_array_returned(self): tm.assert_numpy_array_equal(result, expected) self.assertEqual(result.dtype, expected.dtype) + def test_uint64_overflow(self): + s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64) + exp = np.array([1, 2, 2**63], dtype=np.uint64) + 
tm.assert_numpy_array_equal(algos.unique(s), exp) + class TestIsin(tm.TestCase): _multiprocess_can_split_ = True @@ -672,7 +677,9 @@ def test_numeric_object_likes(self): np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j, 2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]), np.array(['a', 'b', 'a', 'e', 'c', - 'b', 'd', 'a', 'e', 'f'], dtype=object)] + 'b', 'd', 'a', 'e', 'f'], dtype=object), + np.array([1, 2**63, 1, 3**5, 10, + 2**63, 39, 1, 3**5, 7], dtype=np.uint64)] exp_first = np.array([False, False, True, False, False, True, False, True, True, False]) @@ -1202,6 +1209,118 @@ def test_int64_add_overflow(): b_mask=np.array([False, True])) +class TestMode(tm.TestCase): + + def test_no_mode(self): + exp = Series([], dtype=np.float64) + tm.assert_series_equal(algos.mode([]), exp) + + exp = Series([], dtype=np.int) + tm.assert_series_equal(algos.mode([1]), exp) + + exp = Series([], dtype=np.object) + tm.assert_series_equal(algos.mode(['a', 'b', 'c']), exp) + + def test_number_mode(self): + exp_single = [1] + data_single = [1] * 5 + [2] * 3 + + exp_multi = [1, 3] + data_multi = [1] * 5 + [2] * 3 + [3] * 5 + + for dt in np.typecodes['AllInteger'] + np.typecodes['Float']: + s = Series(data_single, dtype=dt) + exp = Series(exp_single, dtype=dt) + tm.assert_series_equal(algos.mode(s), exp) + + s = Series(data_multi, dtype=dt) + exp = Series(exp_multi, dtype=dt) + tm.assert_series_equal(algos.mode(s), exp) + + def test_strobj_mode(self): + exp = ['b'] + data = ['a'] * 2 + ['b'] * 3 + + s = Series(data, dtype='c') + exp = Series(exp, dtype='c') + tm.assert_series_equal(algos.mode(s), exp) + + exp = ['bar'] + data = ['foo'] * 2 + ['bar'] * 3 + + for dt in [str, object]: + s = Series(data, dtype=dt) + exp = Series(exp, dtype=dt) + tm.assert_series_equal(algos.mode(s), exp) + + def test_datelike_mode(self): + exp = Series([], dtype="M8[ns]") + s = Series(['2011-01-03', '2013-01-02', + '1900-05-03'], dtype='M8[ns]') + tm.assert_series_equal(algos.mode(s), exp) + + exp = 
Series(['2011-01-03', '2013-01-02'], dtype='M8[ns]') + s = Series(['2011-01-03', '2013-01-02', '1900-05-03', + '2011-01-03', '2013-01-02'], dtype='M8[ns]') + tm.assert_series_equal(algos.mode(s), exp) + + def test_timedelta_mode(self): + exp = Series([], dtype='timedelta64[ns]') + s = Series(['1 days', '-1 days', '0 days'], + dtype='timedelta64[ns]') + tm.assert_series_equal(algos.mode(s), exp) + + exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]') + s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', + '2 min', '2 min'], dtype='timedelta64[ns]') + tm.assert_series_equal(algos.mode(s), exp) + + def test_mixed_dtype(self): + exp = Series(['foo']) + s = Series([1, 'foo', 'foo']) + tm.assert_series_equal(algos.mode(s), exp) + + def test_uint64_overflow(self): + exp = Series([2**63], dtype=np.uint64) + s = Series([1, 2**63, 2**63], dtype=np.uint64) + tm.assert_series_equal(algos.mode(s), exp) + + exp = Series([], dtype=np.uint64) + s = Series([1, 2**63], dtype=np.uint64) + tm.assert_series_equal(algos.mode(s), exp) + + def test_categorical(self): + c = Categorical([1, 2]) + exp = Series([], dtype=np.int64) + tm.assert_series_equal(algos.mode(c), exp) + + c = Categorical([1, 'a', 'a']) + exp = Series(['a'], dtype=object) + tm.assert_series_equal(algos.mode(c), exp) + + c = Categorical([1, 1, 2, 3, 3]) + exp = Series([1, 3], dtype=np.int64) + tm.assert_series_equal(algos.mode(c), exp) + + def test_index(self): + idx = Index([1, 2, 3]) + exp = Series([], dtype=np.int64) + tm.assert_series_equal(algos.mode(idx), exp) + + idx = Index([1, 'a', 'a']) + exp = Series(['a'], dtype=object) + tm.assert_series_equal(algos.mode(idx), exp) + + idx = Index([1, 1, 2, 3, 3]) + exp = Series([1, 3], dtype=np.int64) + tm.assert_series_equal(algos.mode(idx), exp) + + exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]') + idx = Index(['1 day', '1 day', '-1 day', '-1 day 2 min', + '2 min', '2 min'], dtype='timedelta64[ns]') + tm.assert_series_equal(algos.mode(idx), exp) + + if 
__name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/types/api.py b/pandas/types/api.py index 096dc2f84aa67..c809cb3614a8c 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -44,6 +44,8 @@ is_floating_dtype, is_bool_dtype, is_complex_dtype, + is_signed_integer_dtype, + is_unsigned_integer_dtype, # like is_re, diff --git a/pandas/types/common.py b/pandas/types/common.py index 06c8ef6e35cd7..96eb6d6968bfb 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -155,6 +155,18 @@ def is_integer_dtype(arr_or_dtype): not issubclass(tipo, (np.datetime64, np.timedelta64))) +def is_signed_integer_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.signedinteger) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_unsigned_integer_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.unsignedinteger) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) + + def is_int64_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.int64)