-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG, TST: Patch handling of uint64 in algorithms #15162
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -202,28 +202,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, | |
elif inferred in ['floating', 'mixed-integer-float']: | ||
|
||
# If we are actually all equal to integers, | ||
# then coerce to integer | ||
from .numeric import (Int64Index, UInt64Index, | ||
Float64Index) | ||
try: | ||
res = data.astype('i8', copy=False) | ||
if (res == data).all(): | ||
return Int64Index(res, copy=copy, | ||
name=name) | ||
except (OverflowError, TypeError, ValueError): | ||
pass | ||
# then coerce to integer. | ||
out = cls._convert_to_int_index(data, copy, name) | ||
|
||
# Conversion to int64 failed (possibly due to | ||
# overflow), so let's try now with uint64. | ||
try: | ||
res = data.astype('u8', copy=False) | ||
if (res == data).all(): | ||
return UInt64Index(res, copy=copy, | ||
name=name) | ||
except (TypeError, ValueError): | ||
pass | ||
# Conversion was successful. | ||
if out is not None: | ||
return out | ||
|
||
# return an actual float index | ||
# Return an actual float index. | ||
from .numeric import Float64Index | ||
return Float64Index(data, copy=copy, dtype=dtype, | ||
name=name) | ||
|
||
|
@@ -270,13 +257,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, | |
if dtype is None: | ||
inferred = lib.infer_dtype(subarr) | ||
if inferred == 'integer': | ||
from .numeric import Int64Index, UInt64Index | ||
try: | ||
return Int64Index(subarr.astype('i8'), copy=copy, | ||
name=name) | ||
except OverflowError: | ||
return UInt64Index(subarr.astype('u8'), copy=copy, | ||
name=name) | ||
out = cls._convert_to_int_index(subarr, copy, name) | ||
|
||
if out is not None: | ||
return out | ||
|
||
return Index(subarr, copy=copy, | ||
dtype=object, name=name) | ||
elif inferred in ['floating', 'mixed-integer-float']: | ||
from .numeric import Float64Index | ||
return Float64Index(subarr, copy=copy, name=name) | ||
|
@@ -351,6 +338,42 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, | |
See each method's docstring. | ||
""" | ||
|
||
@classmethod | ||
def _convert_to_int_index(cls, data, copy, name): | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there any case where the dtype is specified when you are calling this? (IOW should there be) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, there should not be. This was a refactoring because I had duplicate logic. |
||
Attempt to convert an array of data into an integer index. | ||
|
||
Parameters | ||
---------- | ||
data : The data to convert. | ||
copy : Whether to copy the data or not. | ||
name : The name of the index returned. | ||
|
||
Returns | ||
------- | ||
int_index : data converted to either an Int64Index or a | ||
UInt64Index, or None, if the conversion was | ||
not successful. | ||
""" | ||
from .numeric import Int64Index, UInt64Index | ||
try: | ||
res = data.astype('i8', copy=False) | ||
if (res == data).all(): | ||
return Int64Index(res, copy=copy, name=name) | ||
except (OverflowError, TypeError, ValueError): | ||
pass | ||
|
||
# Conversion to int64 failed (possibly due to | ||
# overflow), so let's try now with uint64. | ||
try: | ||
res = data.astype('u8', copy=False) | ||
if (res == data).all(): | ||
return UInt64Index(res, copy=copy, name=name) | ||
except (TypeError, ValueError): | ||
pass | ||
|
||
return None | ||
|
||
@classmethod | ||
def _simple_new(cls, values, name=None, dtype=None, **kwargs): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -919,6 +919,10 @@ def test_constructor(self): | |
res = Index([1, 2**63]) | ||
tm.assert_index_equal(res, idx) | ||
|
||
idx = Index([-1, 2**63], dtype=object) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think it reads easier to hard code the expected here e.g. use UInt64Index or whatever as the constructor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add test when dtype is specified (and it fails) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I specified There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With regards to the specifying >>> Index([-1], dtype='uint64')
UInt64Index([18446744073709551615], dtype='uint64')
>>>
>>> Index([np.nan], dtype='uint64')
Float64Index([nan], dtype='float64')
I should point out though that >>> Index([np.nan], dtype='int64')
Float64Index([nan], dtype='float64') There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Those examples should both raise an error. @jreback : thoughts? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
yeah this seems different that what we do in Series. I agree I think this should raise, IOW, IF you specify a dtype and its not compatible it should raise, rather than fall back. (of course if you don't specify a dtype it will just figure it out). Note that we should raise if we have a compat type though, e.g. I think we may be a bit relaxed about conversions when int is specified though. See what breaks if you change this to be more strict. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI, >>> Series([-1], dtype='uint64')
0 18446744073709551615
dtype: uint64 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback : Raising on >>> mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]])
>>> ts = Series(np.arange(len(mi)), index=mi)
>>> ts.count(level=1)
...
ValueError: cannot convert float NaN to integer |
||
res = Index(np.array([-1, 2**63], dtype=object)) | ||
tm.assert_index_equal(res, idx) | ||
|
||
def test_get_indexer(self): | ||
target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) | ||
indexer = self.index.get_indexer(target) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -287,6 +287,23 @@ def test_complex_sorting(self): | |
|
||
self.assertRaises(TypeError, algos.factorize, x17[::-1], sort=True) | ||
|
||
def test_uint64_factorize(self): | ||
data = np.array([2**63, 1, 2**63], dtype=np.uint64) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can add an example with a nan & separately with a -1 in the data There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do once we can figure what to do about the situation with passing in |
||
exp_labels = np.array([0, 1, 0], dtype=np.intp) | ||
exp_uniques = np.array([2**63, 1], dtype=np.uint64) | ||
|
||
labels, uniques = algos.factorize(data) | ||
tm.assert_numpy_array_equal(labels, exp_labels) | ||
tm.assert_numpy_array_equal(uniques, exp_uniques) | ||
|
||
data = np.array([2**63, -1, 2**63], dtype=object) | ||
exp_labels = np.array([0, 1, 0], dtype=np.intp) | ||
exp_uniques = np.array([2**63, -1], dtype=object) | ||
|
||
labels, uniques = algos.factorize(data) | ||
tm.assert_numpy_array_equal(labels, exp_labels) | ||
tm.assert_numpy_array_equal(uniques, exp_uniques) | ||
|
||
|
||
class TestUnique(tm.TestCase): | ||
_multiprocess_can_split_ = True | ||
|
@@ -626,6 +643,19 @@ def test_value_counts_normalized(self): | |
index=Series([2.0, 1.0], dtype=t)) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_value_counts_uint64(self): | ||
arr = np.array([2**63], dtype=np.uint64) | ||
expected = Series([1], index=[2**63]) | ||
result = algos.value_counts(arr) | ||
|
||
tm.assert_series_equal(result, expected) | ||
|
||
arr = np.array([-1, 2**63], dtype=object) | ||
expected = Series([1, 1], index=[-1, 2**63]) | ||
result = algos.value_counts(arr) | ||
|
||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
class TestDuplicated(tm.TestCase): | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this a change here? IOW, above we will then try to convert to a
Float64Index
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, there is no change here. Just a refactoring.