Skip to content

BUG, TST: Patch handling of uint64 in algorithms #15162

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`)
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)

.. _whatsnew_0200.enhancements.other:

Expand Down
8 changes: 4 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,9 +366,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
if isinstance(values, Index):
uniques = values._shallow_copy(uniques, name=None)
elif isinstance(values, Series):
# TODO: This constructor is bugged for uint's, especially
# np.uint64 due to overflow. Test this for uint behavior
# once constructor has been fixed.
uniques = Index(uniques)
return labels, uniques

Expand Down Expand Up @@ -477,9 +474,12 @@ def _value_counts_arraylike(values, dropna=True):
if is_period_type:
keys = PeriodIndex._simple_new(keys, freq=freq)

elif is_integer_dtype(dtype):
elif is_signed_integer_dtype(dtype):
values = _ensure_int64(values)
keys, counts = htable.value_count_int64(values, dropna)
elif is_unsigned_integer_dtype(dtype):
values = _ensure_uint64(values)
keys, counts = htable.value_count_uint64(values, dropna)
elif is_float_dtype(dtype):
values = _ensure_float64(values)
keys, counts = htable.value_count_float64(values, dropna)
Expand Down
77 changes: 50 additions & 27 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,28 +202,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
elif inferred in ['floating', 'mixed-integer-float']:

# If we are actually all equal to integers,
# then coerce to integer
from .numeric import (Int64Index, UInt64Index,
Float64Index)
try:
res = data.astype('i8', copy=False)
if (res == data).all():
return Int64Index(res, copy=copy,
name=name)
except (OverflowError, TypeError, ValueError):
pass
# then coerce to integer.
out = cls._convert_to_int_index(data, copy, name)

# Conversion to int64 failed (possibly due to
# overflow), so let's try now with uint64.
try:
res = data.astype('u8', copy=False)
if (res == data).all():
return UInt64Index(res, copy=copy,
name=name)
except (TypeError, ValueError):
pass
# Conversion was successful.
if out is not None:
return out

# return an actual float index
# Return an actual float index.
from .numeric import Float64Index
return Float64Index(data, copy=copy, dtype=dtype,
name=name)

Expand Down Expand Up @@ -270,13 +257,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
if dtype is None:
inferred = lib.infer_dtype(subarr)
if inferred == 'integer':
from .numeric import Int64Index, UInt64Index
try:
return Int64Index(subarr.astype('i8'), copy=copy,
name=name)
except OverflowError:
return UInt64Index(subarr.astype('u8'), copy=copy,
name=name)
out = cls._convert_to_int_index(subarr, copy, name)

if out is not None:
return out

return Index(subarr, copy=copy,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this a change here? IOW, above we will then try to convert to a Float64Index.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, there is no change here. Just a refactoring.

dtype=object, name=name)
elif inferred in ['floating', 'mixed-integer-float']:
from .numeric import Float64Index
return Float64Index(subarr, copy=copy, name=name)
Expand Down Expand Up @@ -351,6 +338,42 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
See each method's docstring.
"""

@classmethod
def _convert_to_int_index(cls, data, copy, name):
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there any case where the dtype is specified when you are calling this? (IOW should there be)

Copy link
Member Author

@gfyoung gfyoung Jan 21, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, there should not be. This was a refactoring because I had duplicate logic.

Attempt to convert an array of data into an integer index.

Parameters
----------
data : The data to convert.
copy : Whether to copy the data or not.
name : The name of the index returned.

Returns
-------
int_index : data converted to either an Int64Index or a
UInt64Index, or None, if the conversion was
not successful.
"""
from .numeric import Int64Index, UInt64Index
try:
res = data.astype('i8', copy=False)
if (res == data).all():
return Int64Index(res, copy=copy, name=name)
except (OverflowError, TypeError, ValueError):
pass

# Conversion to int64 failed (possibly due to
# overflow), so let's try now with uint64.
try:
res = data.astype('u8', copy=False)
if (res == data).all():
return UInt64Index(res, copy=copy, name=name)
except (TypeError, ValueError):
pass

return None

@classmethod
def _simple_new(cls, values, name=None, dtype=None, **kwargs):
"""
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/indexes/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,10 @@ def test_constructor(self):
res = Index([1, 2**63])
tm.assert_index_equal(res, idx)

idx = Index([-1, 2**63], dtype=object)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think it reads easier to hard code the expected here e.g. use UInt64Index or whatever as the constructor

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add test when dtype is specified (and it fails)
e.g. uint64 with neg and also with nan

Copy link
Member Author

@gfyoung gfyoung Jan 21, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I specified dtype=object. There is no other sensible constructor I can use here.

Copy link
Member Author

@gfyoung gfyoung Jan 21, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With regards to the specifying uint64 with negative or NaN, the current behavior is this:

>>> Index([-1], dtype='uint64')
UInt64Index([18446744073709551615], dtype='uint64')
>>>
>>> Index([np.nan], dtype='uint64')
Float64Index([nan], dtype='float64')

uint64 does not raise on either case (which I imagine is where we are / should be going is because) we return np.array([-1],..., dtype='uint64'), and numpy is too kind that it casts -1 to uint64. For np.nan, when it detects floating data, it tries converting to an integer index before giving up and returning Float64Index.

I should point out though that int64 has similar behavior for NaN (same __new__):

>>> Index([np.nan], dtype='int64')
Float64Index([nan], dtype='float64')

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those examples should both raise an error. @jreback : thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In [1]: Series([np.nan],dtype='int64')
ValueError: cannot convert float NaN to integer

In [2]: Index([np.nan],dtype='int64')
Out[2]: Float64Index([nan], dtype='float64')

yeah this seems different that what we do in Series. I agree I think this should raise, IOW, IF you specify a dtype and its not compatible it should raise, rather than fall back. (of course if you don't specify a dtype it will just figure it out).

Note that we should raise if we have a compat type though, e.g. dtype=int32 -> int64 is fine (same with float32 -> float64).

I think we may be a bit relaxed about conversions when int is specified though. See what breaks if you change this to be more strict.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI, Series is very lenient against negative numbers in uint64 too:

>>> Series([-1], dtype='uint64')
0    18446744073709551615
dtype: uint64

Copy link
Member Author

@gfyoung gfyoung Jan 22, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback : Raising on NaN causes test breakages. Two of them are in test_where for Index because the construction of the test involves constructing an Indexwith integer dtype but we are passing in a NaN as one of the elements. The other one is a little more concerning in test_analytics.test_count for Series:

>>> mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]])
>>> ts = Series(np.arange(len(mi)), index=mi)
>>> ts.count(level=1)
...
ValueError: cannot convert float NaN to integer

res = Index(np.array([-1, 2**63], dtype=object))
tm.assert_index_equal(res, idx)

def test_get_indexer(self):
target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63)
indexer = self.index.get_indexer(target)
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,23 @@ def test_complex_sorting(self):

self.assertRaises(TypeError, algos.factorize, x17[::-1], sort=True)

def test_uint64_factorize(self):
data = np.array([2**63, 1, 2**63], dtype=np.uint64)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can add an example with a nan & separately with a -1 in the data

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do once we can figure what to do about the situation with passing in NaN and specifying an integer index.

exp_labels = np.array([0, 1, 0], dtype=np.intp)
exp_uniques = np.array([2**63, 1], dtype=np.uint64)

labels, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(labels, exp_labels)
tm.assert_numpy_array_equal(uniques, exp_uniques)

data = np.array([2**63, -1, 2**63], dtype=object)
exp_labels = np.array([0, 1, 0], dtype=np.intp)
exp_uniques = np.array([2**63, -1], dtype=object)

labels, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(labels, exp_labels)
tm.assert_numpy_array_equal(uniques, exp_uniques)


class TestUnique(tm.TestCase):
_multiprocess_can_split_ = True
Expand Down Expand Up @@ -626,6 +643,19 @@ def test_value_counts_normalized(self):
index=Series([2.0, 1.0], dtype=t))
tm.assert_series_equal(result, expected)

def test_value_counts_uint64(self):
arr = np.array([2**63], dtype=np.uint64)
expected = Series([1], index=[2**63])
result = algos.value_counts(arr)

tm.assert_series_equal(result, expected)

arr = np.array([-1, 2**63], dtype=object)
expected = Series([1, 1], index=[-1, 2**63])
result = algos.value_counts(arr)

tm.assert_series_equal(result, expected)


class TestDuplicated(tm.TestCase):

Expand Down