Skip to content

Commit 196d655

Browse files
rhshadrach authored and yehoshuadimarsky committed
REGR: maybe_convert_objects ignoring uints (pandas-dev#47475)
1 parent dff158e commit 196d655

File tree

8 files changed

+123
-28
lines changed

8 files changed

+123
-28
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,7 @@ Conversion
857857
- Bug in :meth:`DataFrame.to_records` returning inconsistent numpy types if the index was a :class:`MultiIndex` (:issue:`47263`)
858858
- Bug in :meth:`DataFrame.to_dict` for ``orient="list"`` or ``orient="index"`` was not returning native types (:issue:`46751`)
859859
- Bug in :meth:`DataFrame.apply` that returns a :class:`DataFrame` instead of a :class:`Series` when applied to an empty :class:`DataFrame` and ``axis=1`` (:issue:`39111`)
860+
- Bug where inferring the dtype of an iterable (other than a NumPy ``ndarray``) consisting entirely of NumPy unsigned integer scalars did not result in an unsigned integer dtype (:issue:`47294`)
860861

861862
Strings
862863
^^^^^^^

pandas/_libs/lib.pyx

+23-5
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ from cpython.number cimport PyNumber_Check
1717
from cpython.object cimport (
1818
Py_EQ,
1919
PyObject_RichCompareBool,
20+
PyTypeObject,
2021
)
2122
from cpython.ref cimport Py_INCREF
2223
from cpython.sequence cimport PySequence_Check
@@ -54,6 +55,11 @@ from numpy cimport (
5455

5556
cnp.import_array()
5657

58+
cdef extern from "Python.h":
59+
# Note: importing extern-style allows us to declare these as nogil
60+
# functions, whereas `from cpython cimport` does not.
61+
bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
62+
5763
cdef extern from "numpy/arrayobject.h":
5864
# cython's numpy.dtype specification is incorrect, which leads to
5965
# errors in issubclass(self.dtype.type, np.bool_), so we directly
@@ -71,6 +77,9 @@ cdef extern from "numpy/arrayobject.h":
7177
object fields
7278
tuple names
7379

80+
PyTypeObject PySignedIntegerArrType_Type
81+
PyTypeObject PyUnsignedIntegerArrType_Type
82+
7483
cdef extern from "numpy/ndarrayobject.h":
7584
bint PyArray_CheckScalar(obj) nogil
7685

@@ -1283,9 +1292,9 @@ cdef class Seen:
12831292
In addition to setting a flag that an integer was seen, we
12841293
also set two flags depending on the type of integer seen:
12851294
1286-
1) sint_ : a negative (signed) number in the
1295+
1) sint_ : a signed numpy integer type or a negative (signed) number in the
12871296
range of [-2**63, 0) was encountered
1288-
2) uint_ : a positive number in the range of
1297+
2) uint_ : an unsigned numpy integer type or a positive number in the range of
12891298
[2**63, 2**64) was encountered
12901299
12911300
Parameters
@@ -1294,8 +1303,18 @@ cdef class Seen:
12941303
Value with which to set the flags.
12951304
"""
12961305
self.int_ = True
1297-
self.sint_ = self.sint_ or (oINT64_MIN <= val < 0)
1298-
self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX)
1306+
self.sint_ = (
1307+
self.sint_
1308+
or (oINT64_MIN <= val < 0)
1309+
# Cython equivalent of `isinstance(val, np.signedinteger)`
1310+
or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type)
1311+
)
1312+
self.uint_ = (
1313+
self.uint_
1314+
or (oINT64_MAX < val <= oUINT64_MAX)
1315+
# Cython equivalent of `isinstance(val, np.unsignedinteger)`
1316+
or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type)
1317+
)
12991318

13001319
@property
13011320
def numeric_(self):
@@ -2542,7 +2561,6 @@ def maybe_convert_objects(ndarray[object] objects,
25422561
floats[i] = <float64_t>val
25432562
complexes[i] = <double complex>val
25442563
if not seen.null_:
2545-
val = int(val)
25462564
seen.saw_int(val)
25472565

25482566
if ((seen.uint_ and seen.sint_) or

pandas/tests/dtypes/test_inference.py

+26-19
Original file line numberDiff line numberDiff line change
@@ -700,25 +700,32 @@ def test_convert_int_overflow(self, value):
700700
result = lib.maybe_convert_objects(arr)
701701
tm.assert_numpy_array_equal(arr, result)
702702

703-
def test_maybe_convert_objects_uint64(self):
704-
# see gh-4471
705-
arr = np.array([2**63], dtype=object)
706-
exp = np.array([2**63], dtype=np.uint64)
707-
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
708-
709-
# NumPy bug: can't compare uint64 to int64, as that
710-
# results in both casting to float64, so we should
711-
# make sure that this function is robust against it
712-
arr = np.array([np.uint64(2**63)], dtype=object)
713-
exp = np.array([2**63], dtype=np.uint64)
714-
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
715-
716-
arr = np.array([2, -1], dtype=object)
717-
exp = np.array([2, -1], dtype=np.int64)
718-
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
719-
720-
arr = np.array([2**63, -1], dtype=object)
721-
exp = np.array([2**63, -1], dtype=object)
703+
@pytest.mark.parametrize(
704+
"value, expected_dtype",
705+
[
706+
# see gh-4471
707+
([2**63], np.uint64),
708+
# NumPy bug: can't compare uint64 to int64, as that
709+
# results in both casting to float64, so we should
710+
# make sure that this function is robust against it
711+
([np.uint64(2**63)], np.uint64),
712+
([2, -1], np.int64),
713+
([2**63, -1], object),
714+
# GH#47294
715+
([np.uint8(1)], np.uint8),
716+
([np.uint16(1)], np.uint16),
717+
([np.uint32(1)], np.uint32),
718+
([np.uint64(1)], np.uint64),
719+
([np.uint8(2), np.uint16(1)], np.uint16),
720+
([np.uint32(2), np.uint16(1)], np.uint32),
721+
([np.uint32(2), -1], object),
722+
([np.uint32(2), 1], np.uint64),
723+
([np.uint32(2), np.int32(1)], object),
724+
],
725+
)
726+
def test_maybe_convert_objects_uint(self, value, expected_dtype):
727+
arr = np.array(value, dtype=object)
728+
exp = np.array(value, dtype=expected_dtype)
722729
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
723730

724731
def test_maybe_convert_objects_datetime(self):

pandas/tests/frame/indexing/test_setitem.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,9 @@ class mystring(str):
5757
expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index)
5858
tm.assert_equal(df, expected)
5959

60-
@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
60+
@pytest.mark.parametrize(
61+
"dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"]
62+
)
6163
def test_setitem_dtype(self, dtype, float_frame):
6264
arr = np.random.randn(len(float_frame))
6365

@@ -210,17 +212,24 @@ def test_setitem_dict_preserves_dtypes(self):
210212
"a": Series([0, 1, 2], dtype="int64"),
211213
"b": Series([1, 2, 3], dtype=float),
212214
"c": Series([1, 2, 3], dtype=float),
215+
"d": Series([1, 2, 3], dtype="uint32"),
213216
}
214217
)
215218
df = DataFrame(
216219
{
217220
"a": Series([], dtype="int64"),
218221
"b": Series([], dtype=float),
219222
"c": Series([], dtype=float),
223+
"d": Series([], dtype="uint32"),
220224
}
221225
)
222226
for idx, b in enumerate([1, 2, 3]):
223-
df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)}
227+
df.loc[df.shape[0]] = {
228+
"a": int(idx),
229+
"b": float(b),
230+
"c": float(b),
231+
"d": np.uint32(b),
232+
}
224233
tm.assert_frame_equal(df, expected)
225234

226235
@pytest.mark.parametrize(

pandas/tests/frame/test_constructors.py

+19
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,25 @@ def test_constructor_int_overflow(self, values):
434434
assert result[0].dtype == object
435435
assert result[0][0] == value
436436

437+
@pytest.mark.parametrize(
438+
"values",
439+
[
440+
np.array([1], dtype=np.uint16),
441+
np.array([1], dtype=np.uint32),
442+
np.array([1], dtype=np.uint64),
443+
[np.uint16(1)],
444+
[np.uint32(1)],
445+
[np.uint64(1)],
446+
],
447+
)
448+
def test_constructor_numpy_uints(self, values):
449+
# GH#47294
450+
value = values[0]
451+
result = DataFrame(values)
452+
453+
assert result[0].dtype == value.dtype
454+
assert result[0][0] == value
455+
437456
def test_constructor_ordereddict(self):
438457
import random
439458

pandas/tests/indexes/multi/test_setops.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -540,10 +540,18 @@ def test_union_duplicates(index, request):
540540
mi1 = MultiIndex.from_arrays([values, [1] * len(values)])
541541
mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)])
542542
result = mi1.union(mi2)
543-
tm.assert_index_equal(result, mi2.sort_values())
543+
expected = mi2.sort_values()
544+
if mi2.levels[0].dtype == np.uint64 and (mi2.get_level_values(0) < 2**63).all():
545+
# GH#47294 - union uses lib.fast_zip, which converts data to Python integers
546+
# and loses type information. The result is then unsigned only when the values
547+
# are large enough to require an unsigned dtype.
548+
expected = expected.set_levels(
549+
[expected.levels[0].astype(int), expected.levels[1]]
550+
)
551+
tm.assert_index_equal(result, expected)
544552

545553
result = mi2.union(mi1)
546-
tm.assert_index_equal(result, mi2.sort_values())
554+
tm.assert_index_equal(result, expected)
547555

548556

549557
@pytest.mark.parametrize(

pandas/tests/indexes/numeric/test_numeric.py

+14
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,20 @@ def test_constructor_coercion_signed_to_unsigned(
509509
with pytest.raises(OverflowError, match=msg):
510510
Index([-1], dtype=any_unsigned_int_numpy_dtype)
511511

512+
def test_constructor_np_signed(self, any_signed_int_numpy_dtype):
513+
# GH#47475
514+
scalar = np.dtype(any_signed_int_numpy_dtype).type(1)
515+
result = Index([scalar])
516+
expected = Int64Index([1])
517+
tm.assert_index_equal(result, expected)
518+
519+
def test_constructor_np_unsigned(self, any_unsigned_int_numpy_dtype):
520+
# GH#47475
521+
scalar = np.dtype(any_unsigned_int_numpy_dtype).type(1)
522+
result = Index([scalar])
523+
expected = UInt64Index([1])
524+
tm.assert_index_equal(result, expected)
525+
512526
def test_coerce_list(self):
513527
# coerce things
514528
arr = Index([1, 2, 3, 4])

pandas/tests/series/test_constructors.py

+19
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,25 @@ def test_constructor_signed_int_overflow_deprecation(self):
745745
expected = Series([1, 200, 50], dtype="uint8")
746746
tm.assert_series_equal(ser, expected)
747747

748+
@pytest.mark.parametrize(
749+
"values",
750+
[
751+
np.array([1], dtype=np.uint16),
752+
np.array([1], dtype=np.uint32),
753+
np.array([1], dtype=np.uint64),
754+
[np.uint16(1)],
755+
[np.uint32(1)],
756+
[np.uint64(1)],
757+
],
758+
)
759+
def test_constructor_numpy_uints(self, values):
760+
# GH#47294
761+
value = values[0]
762+
result = Series(values)
763+
764+
assert result[0].dtype == value.dtype
765+
assert result[0] == value
766+
748767
def test_constructor_unsigned_dtype_overflow(self, any_unsigned_int_numpy_dtype):
749768
# see gh-15832
750769
msg = "Trying to coerce negative values to unsigned integers"

0 commit comments

Comments
 (0)