REGR: maybe_convert_objects ignoring uints (pandas-dev#47475)

rhshadrach · yehoshuadimarsky · commit 196d6557fc8c · 2022-07-13T10:18:06.000-04:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -857,6 +857,7 @@ Conversion
 - Bug in :meth:`DataFrame.to_records` returning inconsistent numpy types if the index was a :class:`MultiIndex` (:issue:`47263`)
 - Bug in :meth:`DataFrame.to_dict` for ``orient="list"`` or ``orient="index"`` was not returning native types (:issue:`46751`)
 - Bug in :meth:`DataFrame.apply` that returns a :class:`DataFrame` instead of a :class:`Series` when applied to an empty :class:`DataFrame` and ``axis=1`` (:issue:`39111`)
+- Bug when inferring the dtype from an iterable that is *not* a NumPy ``ndarray`` consisting of all NumPy unsigned integer scalars did not result in an unsigned integer dtype (:issue:`47294`)
 
 Strings
 ^^^^^^^
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -17,6 +17,7 @@ from cpython.number cimport PyNumber_Check
 from cpython.object cimport (
     Py_EQ,
     PyObject_RichCompareBool,
+    PyTypeObject,
 )
 from cpython.ref cimport Py_INCREF
 from cpython.sequence cimport PySequence_Check
@@ -54,6 +55,11 @@ from numpy cimport (
 
 cnp.import_array()
 
+cdef extern from "Python.h":
+    # Note: importing extern-style allows us to declare these as nogil
+    # functions, whereas `from cpython cimport` does not.
+    bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
+
 cdef extern from "numpy/arrayobject.h":
     # cython's numpy.dtype specification is incorrect, which leads to
     # errors in issubclass(self.dtype.type, np.bool_), so we directly
@@ -71,6 +77,9 @@ cdef extern from "numpy/arrayobject.h":
             object fields
             tuple names
 
+    PyTypeObject PySignedIntegerArrType_Type
+    PyTypeObject PyUnsignedIntegerArrType_Type
+
 cdef extern from "numpy/ndarrayobject.h":
     bint PyArray_CheckScalar(obj) nogil
 
@@ -1283,9 +1292,9 @@ cdef class Seen:
         In addition to setting a flag that an integer was seen, we
         also set two flags depending on the type of integer seen:
 
-        1) sint_ : a negative (signed) number in the
+        1) sint_ : a signed numpy integer type or a negative (signed) number in the
                    range of [-2**63, 0) was encountered
-        2) uint_ : a positive number in the range of
+        2) uint_ : an unsigned numpy integer type or a positive number in the range of
                    [2**63, 2**64) was encountered
 
         Parameters
@@ -1294,8 +1303,18 @@ cdef class Seen:
             Value with which to set the flags.
         """
         self.int_ = True
-        self.sint_ = self.sint_ or (oINT64_MIN <= val < 0)
-        self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX)
+        self.sint_ = (
+            self.sint_
+            or (oINT64_MIN <= val < 0)
+            # Cython equivalent of `isinstance(val, np.signedinteger)`
+            or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type)
+        )
+        self.uint_ = (
+            self.uint_
+            or (oINT64_MAX < val <= oUINT64_MAX)
+            # Cython equivalent of `isinstance(val, np.unsignedinteger)`
+            or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type)
+        )
 
     @property
     def numeric_(self):
@@ -2542,7 +2561,6 @@ def maybe_convert_objects(ndarray[object] objects,
             floats[i] = <float64_t>val
             complexes[i] = <double complex>val
             if not seen.null_:
-                val = int(val)
                 seen.saw_int(val)
 
                 if ((seen.uint_ and seen.sint_) or
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -700,25 +700,32 @@ def test_convert_int_overflow(self, value):
         result = lib.maybe_convert_objects(arr)
         tm.assert_numpy_array_equal(arr, result)
 
-    def test_maybe_convert_objects_uint64(self):
-        # see gh-4471
-        arr = np.array([2**63], dtype=object)
-        exp = np.array([2**63], dtype=np.uint64)
-        tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
-
-        # NumPy bug: can't compare uint64 to int64, as that
-        # results in both casting to float64, so we should
-        # make sure that this function is robust against it
-        arr = np.array([np.uint64(2**63)], dtype=object)
-        exp = np.array([2**63], dtype=np.uint64)
-        tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
-
-        arr = np.array([2, -1], dtype=object)
-        exp = np.array([2, -1], dtype=np.int64)
-        tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
-
-        arr = np.array([2**63, -1], dtype=object)
-        exp = np.array([2**63, -1], dtype=object)
+    @pytest.mark.parametrize(
+        "value, expected_dtype",
+        [
+            # see gh-4471
+            ([2**63], np.uint64),
+            # NumPy bug: can't compare uint64 to int64, as that
+            # results in both casting to float64, so we should
+            # make sure that this function is robust against it
+            ([np.uint64(2**63)], np.uint64),
+            ([2, -1], np.int64),
+            ([2**63, -1], object),
+            # GH#47294
+            ([np.uint8(1)], np.uint8),
+            ([np.uint16(1)], np.uint16),
+            ([np.uint32(1)], np.uint32),
+            ([np.uint64(1)], np.uint64),
+            ([np.uint8(2), np.uint16(1)], np.uint16),
+            ([np.uint32(2), np.uint16(1)], np.uint32),
+            ([np.uint32(2), -1], object),
+            ([np.uint32(2), 1], np.uint64),
+            ([np.uint32(2), np.int32(1)], object),
+        ],
+    )
+    def test_maybe_convert_objects_uint(self, value, expected_dtype):
+        arr = np.array(value, dtype=object)
+        exp = np.array(value, dtype=expected_dtype)
         tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
 
     def test_maybe_convert_objects_datetime(self):
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
@@ -57,7 +57,9 @@ class mystring(str):
         expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index)
         tm.assert_equal(df, expected)
 
-    @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
+    @pytest.mark.parametrize(
+        "dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"]
+    )
     def test_setitem_dtype(self, dtype, float_frame):
         arr = np.random.randn(len(float_frame))
 
@@ -210,17 +212,24 @@ def test_setitem_dict_preserves_dtypes(self):
                 "a": Series([0, 1, 2], dtype="int64"),
                 "b": Series([1, 2, 3], dtype=float),
                 "c": Series([1, 2, 3], dtype=float),
+                "d": Series([1, 2, 3], dtype="uint32"),
             }
         )
         df = DataFrame(
             {
                 "a": Series([], dtype="int64"),
                 "b": Series([], dtype=float),
                 "c": Series([], dtype=float),
+                "d": Series([], dtype="uint32"),
             }
         )
         for idx, b in enumerate([1, 2, 3]):
-            df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)}
+            df.loc[df.shape[0]] = {
+                "a": int(idx),
+                "b": float(b),
+                "c": float(b),
+                "d": np.uint32(b),
+            }
         tm.assert_frame_equal(df, expected)
 
     @pytest.mark.parametrize(
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -434,6 +434,25 @@ def test_constructor_int_overflow(self, values):
         assert result[0].dtype == object
         assert result[0][0] == value
 
+    @pytest.mark.parametrize(
+        "values",
+        [
+            np.array([1], dtype=np.uint16),
+            np.array([1], dtype=np.uint32),
+            np.array([1], dtype=np.uint64),
+            [np.uint16(1)],
+            [np.uint32(1)],
+            [np.uint64(1)],
+        ],
+    )
+    def test_constructor_numpy_uints(self, values):
+        # GH#47294
+        value = values[0]
+        result = DataFrame(values)
+
+        assert result[0].dtype == value.dtype
+        assert result[0][0] == value
+
     def test_constructor_ordereddict(self):
         import random
 
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
@@ -540,10 +540,18 @@ def test_union_duplicates(index, request):
     mi1 = MultiIndex.from_arrays([values, [1] * len(values)])
     mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)])
     result = mi1.union(mi2)
-    tm.assert_index_equal(result, mi2.sort_values())
+    expected = mi2.sort_values()
+    if mi2.levels[0].dtype == np.uint64 and (mi2.get_level_values(0) < 2**63).all():
+        # GH#47294 - union uses lib.fast_zip, converting data to Python integers
+        # and loses type information. Result is then unsigned only when values are
+        # sufficiently large to require unsigned dtype.
+        expected = expected.set_levels(
+            [expected.levels[0].astype(int), expected.levels[1]]
+        )
+    tm.assert_index_equal(result, expected)
 
     result = mi2.union(mi1)
-    tm.assert_index_equal(result, mi2.sort_values())
+    tm.assert_index_equal(result, expected)
 
 
 @pytest.mark.parametrize(
diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py
@@ -509,6 +509,20 @@ def test_constructor_coercion_signed_to_unsigned(
         with pytest.raises(OverflowError, match=msg):
             Index([-1], dtype=any_unsigned_int_numpy_dtype)
 
+    def test_constructor_np_signed(self, any_signed_int_numpy_dtype):
+        # GH#47475
+        scalar = np.dtype(any_signed_int_numpy_dtype).type(1)
+        result = Index([scalar])
+        expected = Int64Index([1])
+        tm.assert_index_equal(result, expected)
+
+    def test_constructor_np_unsigned(self, any_unsigned_int_numpy_dtype):
+        # GH#47475
+        scalar = np.dtype(any_unsigned_int_numpy_dtype).type(1)
+        result = Index([scalar])
+        expected = UInt64Index([1])
+        tm.assert_index_equal(result, expected)
+
     def test_coerce_list(self):
         # coerce things
         arr = Index([1, 2, 3, 4])
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -745,6 +745,25 @@ def test_constructor_signed_int_overflow_deprecation(self):
         expected = Series([1, 200, 50], dtype="uint8")
         tm.assert_series_equal(ser, expected)
 
+    @pytest.mark.parametrize(
+        "values",
+        [
+            np.array([1], dtype=np.uint16),
+            np.array([1], dtype=np.uint32),
+            np.array([1], dtype=np.uint64),
+            [np.uint16(1)],
+            [np.uint32(1)],
+            [np.uint64(1)],
+        ],
+    )
+    def test_constructor_numpy_uints(self, values):
+        # GH#47294
+        value = values[0]
+        result = Series(values)
+
+        assert result[0].dtype == value.dtype
+        assert result[0] == value
+
     def test_constructor_unsigned_dtype_overflow(self, any_unsigned_int_numpy_dtype):
         # see gh-15832
         msg = "Trying to coerce negative values to unsigned integers"