Skip to content

REGR: maybe_convert_objects ignoring uints #47475

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jul 10, 2022
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,7 @@ Conversion
- Bug in :meth:`DataFrame.to_records` returning inconsistent numpy types if the index was a :class:`MultiIndex` (:issue:`47263`)
- Bug in :meth:`DataFrame.to_dict` for ``orient="list"`` or ``orient="index"`` was not returning native types (:issue:`46751`)
- Bug in :meth:`DataFrame.apply` that returns a :class:`DataFrame` instead of a :class:`Series` when applied to an empty :class:`DataFrame` and ``axis=1`` (:issue:`39111`)
- Bug where inferring the dtype of an iterable that is *not* a NumPy ``ndarray`` and consists entirely of NumPy unsigned integer scalars did not result in an unsigned integer dtype (:issue:`47294`)

Strings
^^^^^^^
Expand Down
28 changes: 23 additions & 5 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ from cpython.number cimport PyNumber_Check
from cpython.object cimport (
Py_EQ,
PyObject_RichCompareBool,
PyTypeObject,
)
from cpython.ref cimport Py_INCREF
from cpython.sequence cimport PySequence_Check
Expand Down Expand Up @@ -54,6 +55,11 @@ from numpy cimport (

cnp.import_array()

cdef extern from "Python.h":
# Note: importing extern-style allows us to declare these as nogil
# functions, whereas `from cpython cimport` does not.
bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil

cdef extern from "numpy/arrayobject.h":
# cython's numpy.dtype specification is incorrect, which leads to
# errors in issubclass(self.dtype.type, np.bool_), so we directly
Expand All @@ -71,6 +77,9 @@ cdef extern from "numpy/arrayobject.h":
object fields
tuple names

PyTypeObject PySignedIntegerArrType_Type
PyTypeObject PyUnsignedIntegerArrType_Type

cdef extern from "numpy/ndarrayobject.h":
bint PyArray_CheckScalar(obj) nogil

Expand Down Expand Up @@ -1283,9 +1292,9 @@ cdef class Seen:
In addition to setting a flag that an integer was seen, we
also set two flags depending on the type of integer seen:

1) sint_ : a negative (signed) number in the
1) sint_ : a signed numpy integer type or a negative (signed) number in the
range of [-2**63, 0) was encountered
2) uint_ : a positive number in the range of
2) uint_ : an unsigned numpy integer type or a positive number in the range of
[2**63, 2**64) was encountered

Parameters
Expand All @@ -1294,8 +1303,18 @@ cdef class Seen:
Value with which to set the flags.
"""
self.int_ = True
self.sint_ = self.sint_ or (oINT64_MIN <= val < 0)
self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX)
self.sint_ = (
self.sint_
or (oINT64_MIN <= val < 0)
# Cython equivalent of `isinstance(val, np.signedinteger)`
or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type)
)
self.uint_ = (
self.uint_
or (oINT64_MAX < val <= oUINT64_MAX)
# Cython equivalent of `isinstance(val, np.unsignedinteger)`
or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type)
)

@property
def numeric_(self):
Expand Down Expand Up @@ -2542,7 +2561,6 @@ def maybe_convert_objects(ndarray[object] objects,
floats[i] = <float64_t>val
complexes[i] = <double complex>val
if not seen.null_:
val = int(val)
seen.saw_int(val)

if ((seen.uint_ and seen.sint_) or
Expand Down
45 changes: 26 additions & 19 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,25 +700,32 @@ def test_convert_int_overflow(self, value):
result = lib.maybe_convert_objects(arr)
tm.assert_numpy_array_equal(arr, result)

def test_maybe_convert_objects_uint64(self):
# see gh-4471
arr = np.array([2**63], dtype=object)
exp = np.array([2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)

# NumPy bug: can't compare uint64 to int64, as that
# results in both casting to float64, so we should
# make sure that this function is robust against it
arr = np.array([np.uint64(2**63)], dtype=object)
exp = np.array([2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)

arr = np.array([2, -1], dtype=object)
exp = np.array([2, -1], dtype=np.int64)
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)

arr = np.array([2**63, -1], dtype=object)
exp = np.array([2**63, -1], dtype=object)
@pytest.mark.parametrize(
"value, expected_dtype",
[
# see gh-4471
([2**63], np.uint64),
# NumPy bug: can't compare uint64 to int64, as that
# results in both casting to float64, so we should
# make sure that this function is robust against it
([np.uint64(2**63)], np.uint64),
([2, -1], np.int64),
([2**63, -1], object),
# GH#47294: NumPy unsigned integer scalars, alone and mixed with
# other unsigned scalars, Python ints, or a signed NumPy scalar
([np.uint8(1)], np.uint8),
([np.uint16(1)], np.uint16),
([np.uint32(1)], np.uint32),
([np.uint64(1)], np.uint64),
([np.uint8(2), np.uint16(1)], np.uint16),
([np.uint32(2), np.uint16(1)], np.uint32),
([np.uint32(2), -1], object),
([np.uint32(2), 1], np.uint64),
([np.uint32(2), np.int32(1)], object),
],
)
def test_maybe_convert_objects_uint(self, value, expected_dtype):
# Wrap `value` in an object-dtype array, run it through
# lib.maybe_convert_objects, and require the result to equal the same
# values materialized directly with `expected_dtype`.
arr = np.array(value, dtype=object)
exp = np.array(value, dtype=expected_dtype)
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)

def test_maybe_convert_objects_datetime(self):
Expand Down
13 changes: 11 additions & 2 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ class mystring(str):
expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index)
tm.assert_equal(df, expected)

@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
@pytest.mark.parametrize(
"dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"]
)
def test_setitem_dtype(self, dtype, float_frame):
arr = np.random.randn(len(float_frame))

Expand Down Expand Up @@ -210,17 +212,24 @@ def test_setitem_dict_preserves_dtypes(self):
"a": Series([0, 1, 2], dtype="int64"),
"b": Series([1, 2, 3], dtype=float),
"c": Series([1, 2, 3], dtype=float),
"d": Series([1, 2, 3], dtype="uint32"),
}
)
df = DataFrame(
{
"a": Series([], dtype="int64"),
"b": Series([], dtype=float),
"c": Series([], dtype=float),
"d": Series([], dtype="uint32"),
}
)
for idx, b in enumerate([1, 2, 3]):
df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)}
df.loc[df.shape[0]] = {
"a": int(idx),
"b": float(b),
"c": float(b),
"d": np.uint32(b),
}
tm.assert_frame_equal(df, expected)

@pytest.mark.parametrize(
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,25 @@ def test_constructor_int_overflow(self, values):
assert result[0].dtype == object
assert result[0][0] == value

@pytest.mark.parametrize(
"values",
[
np.array([1], dtype=np.uint16),
np.array([1], dtype=np.uint32),
np.array([1], dtype=np.uint64),
[np.uint16(1)],
[np.uint32(1)],
[np.uint64(1)],
],
)
def test_constructor_numpy_uints(self, values):
# GH#47294
# `values` is either a one-element unsigned ndarray or a plain list
# holding one NumPy unsigned scalar; both forms are checked the same way.
value = values[0]
result = DataFrame(values)

# Column 0 must keep the dtype of the input scalar and hold its value.
assert result[0].dtype == value.dtype
assert result[0][0] == value

def test_constructor_ordereddict(self):
import random

Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/indexes/multi/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,10 +540,18 @@ def test_union_duplicates(index, request):
mi1 = MultiIndex.from_arrays([values, [1] * len(values)])
mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)])
result = mi1.union(mi2)
tm.assert_index_equal(result, mi2.sort_values())
expected = mi2.sort_values()
if mi2.levels[0].dtype == np.uint64 and (mi2.get_level_values(0) < 2**63).all():
# GH#47294 - union uses lib.fast_zip, which converts the data to Python integers
# and loses type information. The result is then unsigned only when the values
# are large enough to require an unsigned dtype.
expected = expected.set_levels(
[expected.levels[0].astype(int), expected.levels[1]]
)
tm.assert_index_equal(result, expected)

result = mi2.union(mi1)
tm.assert_index_equal(result, mi2.sort_values())
tm.assert_index_equal(result, expected)


@pytest.mark.parametrize(
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/indexes/numeric/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,20 @@ def test_constructor_coercion_signed_to_unsigned(
with pytest.raises(OverflowError, match=msg):
Index([-1], dtype=any_unsigned_int_numpy_dtype)

def test_constructor_np_signed(self, any_signed_int_numpy_dtype):
# GH#47475
# Build the value 1 as a NumPy scalar of the parametrized signed dtype;
# an Index made from a list holding it is expected to equal Int64Index([1]).
scalar = np.dtype(any_signed_int_numpy_dtype).type(1)
result = Index([scalar])
expected = Int64Index([1])
tm.assert_index_equal(result, expected)

def test_constructor_np_unsigned(self, any_unsigned_int_numpy_dtype):
# GH#47475
# Build the value 1 as a NumPy scalar of the parametrized unsigned dtype;
# an Index made from a list holding it is expected to equal UInt64Index([1]).
scalar = np.dtype(any_unsigned_int_numpy_dtype).type(1)
result = Index([scalar])
expected = UInt64Index([1])
tm.assert_index_equal(result, expected)

def test_coerce_list(self):
# coerce things
arr = Index([1, 2, 3, 4])
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,25 @@ def test_constructor_signed_int_overflow_deprecation(self):
expected = Series([1, 200, 50], dtype="uint8")
tm.assert_series_equal(ser, expected)

@pytest.mark.parametrize(
"values",
[
np.array([1], dtype=np.uint16),
np.array([1], dtype=np.uint32),
np.array([1], dtype=np.uint64),
[np.uint16(1)],
[np.uint32(1)],
[np.uint64(1)],
],
)
def test_constructor_numpy_uints(self, values):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are pd.Index or pd.array affected? pd.NumericIndex?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pd.array and pd.NumericIndex are not impacted; I've added tests for Index.

arr = pd.array([np.uint16(1)])
print(arr)

# <IntegerArray>
# [1]
# Length: 1, dtype: Int64

index = NumericIndex([np.uint16(1)])
print(index)

# NumericIndex([1], dtype='uint16')

Both of these are the same as 1.4.3

# GH#47294
value = values[0]
result = Series(values)

assert result[0].dtype == value.dtype
assert result[0] == value

def test_constructor_unsigned_dtype_overflow(self, any_unsigned_int_numpy_dtype):
# see gh-15832
msg = "Trying to coerce negative values to unsigned integers"
Expand Down