diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 7aa1c1e84aa09..f19cc88fa1690 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -857,6 +857,7 @@ Conversion - Bug in :meth:`DataFrame.to_records` returning inconsistent numpy types if the index was a :class:`MultiIndex` (:issue:`47263`) - Bug in :meth:`DataFrame.to_dict` for ``orient="list"`` or ``orient="index"`` was not returning native types (:issue:`46751`) - Bug in :meth:`DataFrame.apply` that returns a :class:`DataFrame` instead of a :class:`Series` when applied to an empty :class:`DataFrame` and ``axis=1`` (:issue:`39111`) +- Bug where inferring the dtype of an iterable that is *not* a NumPy ``ndarray`` and consists entirely of NumPy unsigned integer scalars did not result in an unsigned integer dtype (:issue:`47294`) Strings ^^^^^^^ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2136c410ef4a0..e353d224708b7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -17,6 +17,7 @@ from cpython.number cimport PyNumber_Check from cpython.object cimport ( Py_EQ, PyObject_RichCompareBool, + PyTypeObject, ) from cpython.ref cimport Py_INCREF from cpython.sequence cimport PySequence_Check @@ -54,6 +55,11 @@ from numpy cimport ( cnp.import_array() +cdef extern from "Python.h": + # Note: importing extern-style allows us to declare these as nogil + # functions, whereas `from cpython cimport` does not. 
+ bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil + cdef extern from "numpy/arrayobject.h": # cython's numpy.dtype specification is incorrect, which leads to # errors in issubclass(self.dtype.type, np.bool_), so we directly @@ -71,6 +77,9 @@ cdef extern from "numpy/arrayobject.h": object fields tuple names + PyTypeObject PySignedIntegerArrType_Type + PyTypeObject PyUnsignedIntegerArrType_Type + cdef extern from "numpy/ndarrayobject.h": bint PyArray_CheckScalar(obj) nogil @@ -1283,9 +1292,9 @@ cdef class Seen: In addition to setting a flag that an integer was seen, we also set two flags depending on the type of integer seen: - 1) sint_ : a negative (signed) number in the + 1) sint_ : a signed numpy integer type or a negative (signed) number in the range of [-2**63, 0) was encountered - 2) uint_ : a positive number in the range of + 2) uint_ : an unsigned numpy integer type or a positive number in the range of [2**63, 2**64) was encountered Parameters @@ -1294,8 +1303,18 @@ cdef class Seen: Value with which to set the flags. 
""" self.int_ = True - self.sint_ = self.sint_ or (oINT64_MIN <= val < 0) - self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) + self.sint_ = ( + self.sint_ + or (oINT64_MIN <= val < 0) + # Cython equivalent of `isinstance(val, np.signedinteger)` + or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type) + ) + self.uint_ = ( + self.uint_ + or (oINT64_MAX < val <= oUINT64_MAX) + # Cython equivalent of `isinstance(val, np.unsignedinteger)` + or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type) + ) @property def numeric_(self): @@ -2542,7 +2561,6 @@ def maybe_convert_objects(ndarray[object] objects, floats[i] = val complexes[i] = val if not seen.null_: - val = int(val) seen.saw_int(val) if ((seen.uint_ and seen.sint_) or diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b12476deccbfc..8fe6abd3b0ed5 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -700,25 +700,32 @@ def test_convert_int_overflow(self, value): result = lib.maybe_convert_objects(arr) tm.assert_numpy_array_equal(arr, result) - def test_maybe_convert_objects_uint64(self): - # see gh-4471 - arr = np.array([2**63], dtype=object) - exp = np.array([2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) - - # NumPy bug: can't compare uint64 to int64, as that - # results in both casting to float64, so we should - # make sure that this function is robust against it - arr = np.array([np.uint64(2**63)], dtype=object) - exp = np.array([2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) - - arr = np.array([2, -1], dtype=object) - exp = np.array([2, -1], dtype=np.int64) - tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) - - arr = np.array([2**63, -1], dtype=object) - exp = np.array([2**63, -1], dtype=object) + @pytest.mark.parametrize( + "value, expected_dtype", + [ + # see gh-4471 + ([2**63], np.uint64), + # NumPy 
bug: can't compare uint64 to int64, as that + # results in both casting to float64, so we should + # make sure that this function is robust against it + ([np.uint64(2**63)], np.uint64), + ([2, -1], np.int64), + ([2**63, -1], object), + # GH#47294 + ([np.uint8(1)], np.uint8), + ([np.uint16(1)], np.uint16), + ([np.uint32(1)], np.uint32), + ([np.uint64(1)], np.uint64), + ([np.uint8(2), np.uint16(1)], np.uint16), + ([np.uint32(2), np.uint16(1)], np.uint32), + ([np.uint32(2), -1], object), + ([np.uint32(2), 1], np.uint64), + ([np.uint32(2), np.int32(1)], object), + ], + ) + def test_maybe_convert_objects_uint(self, value, expected_dtype): + arr = np.array(value, dtype=object) + exp = np.array(value, dtype=expected_dtype) tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) def test_maybe_convert_objects_datetime(self): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 45f36834510ed..cd547819dbe94 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -57,7 +57,9 @@ class mystring(str): expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index) tm.assert_equal(df, expected) - @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) + @pytest.mark.parametrize( + "dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"] + ) def test_setitem_dtype(self, dtype, float_frame): arr = np.random.randn(len(float_frame)) @@ -210,6 +212,7 @@ def test_setitem_dict_preserves_dtypes(self): "a": Series([0, 1, 2], dtype="int64"), "b": Series([1, 2, 3], dtype=float), "c": Series([1, 2, 3], dtype=float), + "d": Series([1, 2, 3], dtype="uint32"), } ) df = DataFrame( @@ -217,10 +220,16 @@ def test_setitem_dict_preserves_dtypes(self): "a": Series([], dtype="int64"), "b": Series([], dtype=float), "c": Series([], dtype=float), + "d": Series([], dtype="uint32"), } ) for idx, b in enumerate([1, 2, 3]): - df.loc[df.shape[0]] = 
{"a": int(idx), "b": float(b), "c": float(b)} + df.loc[df.shape[0]] = { + "a": int(idx), + "b": float(b), + "c": float(b), + "d": np.uint32(b), + } tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f06641002e039..d00cf198b3296 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -434,6 +434,25 @@ def test_constructor_int_overflow(self, values): assert result[0].dtype == object assert result[0][0] == value + @pytest.mark.parametrize( + "values", + [ + np.array([1], dtype=np.uint16), + np.array([1], dtype=np.uint32), + np.array([1], dtype=np.uint64), + [np.uint16(1)], + [np.uint32(1)], + [np.uint64(1)], + ], + ) + def test_constructor_numpy_uints(self, values): + # GH#47294 + value = values[0] + result = DataFrame(values) + + assert result[0].dtype == value.dtype + assert result[0][0] == value + def test_constructor_ordereddict(self): import random diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 57c4af1a0fe1c..39b5e0ffc526c 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -540,10 +540,18 @@ def test_union_duplicates(index, request): mi1 = MultiIndex.from_arrays([values, [1] * len(values)]) mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)]) result = mi1.union(mi2) - tm.assert_index_equal(result, mi2.sort_values()) + expected = mi2.sort_values() + if mi2.levels[0].dtype == np.uint64 and (mi2.get_level_values(0) < 2**63).all(): + # GH#47294 - union uses lib.fast_zip, converting data to Python integers + # and loses type information. Result is then unsigned only when values are + # sufficiently large to require unsigned dtype. 
+ expected = expected.set_levels( + [expected.levels[0].astype(int), expected.levels[1]] + ) + tm.assert_index_equal(result, expected) result = mi2.union(mi1) - tm.assert_index_equal(result, mi2.sort_values()) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 7d2bcdf20c795..23262cb2eb768 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -509,6 +509,20 @@ def test_constructor_coercion_signed_to_unsigned( with pytest.raises(OverflowError, match=msg): Index([-1], dtype=any_unsigned_int_numpy_dtype) + def test_constructor_np_signed(self, any_signed_int_numpy_dtype): + # GH#47475 + scalar = np.dtype(any_signed_int_numpy_dtype).type(1) + result = Index([scalar]) + expected = Int64Index([1]) + tm.assert_index_equal(result, expected) + + def test_constructor_np_unsigned(self, any_unsigned_int_numpy_dtype): + # GH#47475 + scalar = np.dtype(any_unsigned_int_numpy_dtype).type(1) + result = Index([scalar]) + expected = UInt64Index([1]) + tm.assert_index_equal(result, expected) + def test_coerce_list(self): # coerce things arr = Index([1, 2, 3, 4]) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3dce22a06c1b2..cec06d054d766 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -745,6 +745,25 @@ def test_constructor_signed_int_overflow_deprecation(self): expected = Series([1, 200, 50], dtype="uint8") tm.assert_series_equal(ser, expected) + @pytest.mark.parametrize( + "values", + [ + np.array([1], dtype=np.uint16), + np.array([1], dtype=np.uint32), + np.array([1], dtype=np.uint64), + [np.uint16(1)], + [np.uint32(1)], + [np.uint64(1)], + ], + ) + def test_constructor_numpy_uints(self, values): + # GH#47294 + value = values[0] + result = Series(values) + + assert result[0].dtype == 
value.dtype + assert result[0] == value + def test_constructor_unsigned_dtype_overflow(self, any_unsigned_int_numpy_dtype): # see gh-15832 msg = "Trying to coerce negative values to unsigned integers"