From 834b1bed762c68817baa11fc021180e70724c75d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 19 Jun 2022 13:41:34 -0400 Subject: [PATCH 1/6] REGR: maybe_convert_objects ignoring uints --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/lib.pyx | 17 ++++++--- pandas/_libs/tslibs/util.pxd | 34 +++++++++++++++++ pandas/tests/dtypes/test_inference.py | 45 +++++++++++++---------- pandas/tests/frame/test_constructors.py | 19 ++++++++++ pandas/tests/indexes/multi/test_setops.py | 12 +++++- pandas/tests/series/test_constructors.py | 19 ++++++++++ 7 files changed, 121 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a5cb716317689..197f8f31a9ff3 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -795,6 +795,7 @@ Conversion - Bug in metaclass of generic abstract dtypes causing :meth:`DataFrame.apply` and :meth:`Series.apply` to raise for the built-in function ``type`` (:issue:`46684`) - Bug in :meth:`DataFrame.to_records` returning inconsistent numpy types if the index was a :class:`MultiIndex` (:issue:`47263`) - Bug in :meth:`DataFrame.to_dict` for ``orient="list"`` or ``orient="index"`` was not returning native types (:issue:`46751`) +- Bug where inferring the dtype from an iterable that is *not* a NumPy ``ndarray`` and that consists entirely of NumPy unsigned integer scalars did not result in an unsigned integer dtype (:issue:`47294`) Strings ^^^^^^^ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2136c410ef4a0..f99bf8dac0a6b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1283,9 +1283,9 @@ cdef class Seen: In addition to setting a flag that an integer was seen, we also set two flags depending on the type of integer seen: - 1) sint_ : a negative (signed) number in the + 1) sint_ : a signed numpy integer type or a negative (signed) number in the range of [-2**63, 0) was encountered - 2) uint_ : a positive number in the range of + 2) uint_ : an 
unsigned numpy integer type or a positive number in the range of [2**63, 2**64) was encountered Parameters @@ -1294,8 +1294,16 @@ cdef class Seen: Value with which to set the flags. """ self.int_ = True - self.sint_ = self.sint_ or (oINT64_MIN <= val < 0) - self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) + self.sint_ = ( + self.sint_ + or (oINT64_MIN <= val < 0) + or util.is_sinteger_object(val) + ) + self.uint_ = ( + self.uint_ + or (oINT64_MAX < val <= oUINT64_MAX) + or util.is_uinteger_object(val) + ) @property def numeric_(self): @@ -2542,7 +2550,6 @@ def maybe_convert_objects(ndarray[object] objects, floats[i] = val complexes[i] = val if not seen.null_: - val = int(val) seen.saw_int(val) if ((seen.uint_ and seen.sint_) or diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 492b7d519551f..46d504a22b8bc 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -33,6 +33,8 @@ from numpy cimport ( cdef extern from "numpy/arrayobject.h": + PyTypeObject PySignedIntegerArrType_Type + PyTypeObject PyUnsignedIntegerArrType_Type PyTypeObject PyFloatingArrType_Type cdef extern from "numpy/ndarrayobject.h": @@ -55,6 +57,38 @@ cdef inline int64_t get_nat(): # -------------------------------------------------------------------- # Type Checking +cdef inline bint is_sinteger_object(object obj) nogil: + """ + Cython equivalent of + + `isinstance(val, np.signedinteger)` + + Parameters + ---------- + val : object + + Returns + ------- + is_sinteger : bool + """ + return PyObject_TypeCheck(obj, &PySignedIntegerArrType_Type) + +cdef inline bint is_uinteger_object(object obj) nogil: + """ + Cython equivalent of + + `isinstance(val, np.unsignedinteger)` + + Parameters + ---------- + val : object + + Returns + ------- + is_uinteger : bool + """ + return PyObject_TypeCheck(obj, &PyUnsignedIntegerArrType_Type) + cdef inline bint is_integer_object(object obj) nogil: """ Cython equivalent of diff --git 
a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b12476deccbfc..8fe6abd3b0ed5 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -700,25 +700,32 @@ def test_convert_int_overflow(self, value): result = lib.maybe_convert_objects(arr) tm.assert_numpy_array_equal(arr, result) - def test_maybe_convert_objects_uint64(self): - # see gh-4471 - arr = np.array([2**63], dtype=object) - exp = np.array([2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) - - # NumPy bug: can't compare uint64 to int64, as that - # results in both casting to float64, so we should - # make sure that this function is robust against it - arr = np.array([np.uint64(2**63)], dtype=object) - exp = np.array([2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) - - arr = np.array([2, -1], dtype=object) - exp = np.array([2, -1], dtype=np.int64) - tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) - - arr = np.array([2**63, -1], dtype=object) - exp = np.array([2**63, -1], dtype=object) + @pytest.mark.parametrize( + "value, expected_dtype", + [ + # see gh-4471 + ([2**63], np.uint64), + # NumPy bug: can't compare uint64 to int64, as that + # results in both casting to float64, so we should + # make sure that this function is robust against it + ([np.uint64(2**63)], np.uint64), + ([2, -1], np.int64), + ([2**63, -1], object), + # GH#47294 + ([np.uint8(1)], np.uint8), + ([np.uint16(1)], np.uint16), + ([np.uint32(1)], np.uint32), + ([np.uint64(1)], np.uint64), + ([np.uint8(2), np.uint16(1)], np.uint16), + ([np.uint32(2), np.uint16(1)], np.uint32), + ([np.uint32(2), -1], object), + ([np.uint32(2), 1], np.uint64), + ([np.uint32(2), np.int32(1)], object), + ], + ) + def test_maybe_convert_objects_uint(self, value, expected_dtype): + arr = np.array(value, dtype=object) + exp = np.array(value, dtype=expected_dtype) 
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) def test_maybe_convert_objects_datetime(self): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f06641002e039..d00cf198b3296 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -434,6 +434,25 @@ def test_constructor_int_overflow(self, values): assert result[0].dtype == object assert result[0][0] == value + @pytest.mark.parametrize( + "values", + [ + np.array([1], dtype=np.uint16), + np.array([1], dtype=np.uint32), + np.array([1], dtype=np.uint64), + [np.uint16(1)], + [np.uint32(1)], + [np.uint64(1)], + ], + ) + def test_constructor_numpy_uints(self, values): + # GH#47294 + value = values[0] + result = DataFrame(values) + + assert result[0].dtype == value.dtype + assert result[0][0] == value + def test_constructor_ordereddict(self): import random diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 57c4af1a0fe1c..39b5e0ffc526c 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -540,10 +540,18 @@ def test_union_duplicates(index, request): mi1 = MultiIndex.from_arrays([values, [1] * len(values)]) mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)]) result = mi1.union(mi2) - tm.assert_index_equal(result, mi2.sort_values()) + expected = mi2.sort_values() + if mi2.levels[0].dtype == np.uint64 and (mi2.get_level_values(0) < 2**63).all(): + # GH#47294 - union uses lib.fast_zip, converting data to Python integers + # and loses type information. Result is then unsigned only when values are + # sufficiently large to require unsigned dtype. 
+ expected = expected.set_levels( + [expected.levels[0].astype(int), expected.levels[1]] + ) + tm.assert_index_equal(result, expected) result = mi2.union(mi1) - tm.assert_index_equal(result, mi2.sort_values()) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3dce22a06c1b2..cec06d054d766 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -745,6 +745,25 @@ def test_constructor_signed_int_overflow_deprecation(self): expected = Series([1, 200, 50], dtype="uint8") tm.assert_series_equal(ser, expected) + @pytest.mark.parametrize( + "values", + [ + np.array([1], dtype=np.uint16), + np.array([1], dtype=np.uint32), + np.array([1], dtype=np.uint64), + [np.uint16(1)], + [np.uint32(1)], + [np.uint64(1)], + ], + ) + def test_constructor_numpy_uints(self, values): + # GH#47294 + value = values[0] + result = Series(values) + + assert result[0].dtype == value.dtype + assert result[0] == value + def test_constructor_unsigned_dtype_overflow(self, any_unsigned_int_numpy_dtype): # see gh-15832 msg = "Trying to coerce negative values to unsigned integers" From 6e3857a1d7cad8f444b7f4f0873530b251ef3c92 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 22 Jun 2022 19:19:06 -0400 Subject: [PATCH 2/6] Add test --- pandas/tests/frame/indexing/test_setitem.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index cf6d351aa78a0..2eeab26c9be04 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -57,7 +57,9 @@ class mystring(str): expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index) tm.assert_equal(df, expected) - @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) + @pytest.mark.parametrize( 
+ "dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"] + ) def test_setitem_dtype(self, dtype, float_frame): arr = np.random.randn(len(float_frame)) @@ -210,6 +212,7 @@ def test_setitem_dict_preserves_dtypes(self): "a": Series([0, 1, 2], dtype="int64"), "b": Series([1, 2, 3], dtype=float), "c": Series([1, 2, 3], dtype=float), + "d": Series([1, 2, 3], dtype="uint32"), } ) df = DataFrame( @@ -217,10 +220,16 @@ def test_setitem_dict_preserves_dtypes(self): "a": Series([], dtype="int64"), "b": Series([], dtype=float), "c": Series([], dtype=float), + "d": Series([], dtype="uint32"), } ) for idx, b in enumerate([1, 2, 3]): - df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)} + df.loc[df.shape[0]] = { + "a": int(idx), + "b": float(b), + "c": float(b), + "d": np.uint32(b), + } tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( From 4e4cac9bb110e3f1cb171f415cb242b75416e496 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 27 Jun 2022 17:19:36 -0400 Subject: [PATCH 3/6] inline --- pandas/_libs/lib.pyx | 10 +++++++++- pandas/_libs/tslibs/util.pxd | 17 ----------------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f99bf8dac0a6b..022e4a982b35e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -17,6 +17,7 @@ from cpython.number cimport PyNumber_Check from cpython.object cimport ( Py_EQ, PyObject_RichCompareBool, + PyTypeObject, ) from cpython.ref cimport Py_INCREF from cpython.sequence cimport PySequence_Check @@ -54,6 +55,11 @@ from numpy cimport ( cnp.import_array() +cdef extern from "Python.h": + # Note: importing extern-style allows us to declare these as nogil + # functions, whereas `from cpython cimport` does not. 
+ bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil + cdef extern from "numpy/arrayobject.h": # cython's numpy.dtype specification is incorrect, which leads to # errors in issubclass(self.dtype.type, np.bool_), so we directly @@ -71,6 +77,8 @@ cdef extern from "numpy/arrayobject.h": object fields tuple names + PyTypeObject PySignedIntegerArrType_Type + cdef extern from "numpy/ndarrayobject.h": bint PyArray_CheckScalar(obj) nogil @@ -1297,7 +1305,7 @@ cdef class Seen: self.sint_ = ( self.sint_ or (oINT64_MIN <= val < 0) - or util.is_sinteger_object(val) + or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type) ) self.uint_ = ( self.uint_ diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 46d504a22b8bc..6ad4b2e791498 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -33,7 +33,6 @@ from numpy cimport ( cdef extern from "numpy/arrayobject.h": - PyTypeObject PySignedIntegerArrType_Type PyTypeObject PyUnsignedIntegerArrType_Type PyTypeObject PyFloatingArrType_Type @@ -57,22 +56,6 @@ cdef inline int64_t get_nat(): # -------------------------------------------------------------------- # Type Checking -cdef inline bint is_sinteger_object(object obj) nogil: - """ - Cython equivalent of - - `isinstance(val, np.signedinteger)` - - Parameters - ---------- - val : object - - Returns - ------- - is_sinteger : bool - """ - return PyObject_TypeCheck(obj, &PySignedIntegerArrType_Type) - cdef inline bint is_uinteger_object(object obj) nogil: """ Cython equivalent of From 9458e403ea9da730a353eec9771473d8737082a6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 27 Jun 2022 17:21:43 -0400 Subject: [PATCH 4/6] inline --- pandas/_libs/lib.pyx | 3 ++- pandas/_libs/tslibs/util.pxd | 17 ----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 022e4a982b35e..27cc167c0aeeb 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -78,6 
+78,7 @@ cdef extern from "numpy/arrayobject.h": tuple names PyTypeObject PySignedIntegerArrType_Type + PyTypeObject PyUnsignedIntegerArrType_Type cdef extern from "numpy/ndarrayobject.h": bint PyArray_CheckScalar(obj) nogil @@ -1310,7 +1311,7 @@ cdef class Seen: self.uint_ = ( self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) - or util.is_uinteger_object(val) + or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type) ) @property diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 6ad4b2e791498..492b7d519551f 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -33,7 +33,6 @@ from numpy cimport ( cdef extern from "numpy/arrayobject.h": - PyTypeObject PyUnsignedIntegerArrType_Type PyTypeObject PyFloatingArrType_Type cdef extern from "numpy/ndarrayobject.h": @@ -56,22 +55,6 @@ cdef inline int64_t get_nat(): # -------------------------------------------------------------------- # Type Checking -cdef inline bint is_uinteger_object(object obj) nogil: - """ - Cython equivalent of - - `isinstance(val, np.unsignedinteger)` - - Parameters - ---------- - val : object - - Returns - ------- - is_uinteger : bool - """ - return PyObject_TypeCheck(obj, &PyUnsignedIntegerArrType_Type) - cdef inline bint is_integer_object(object obj) nogil: """ Cython equivalent of From 5151dca8575b678652c3d5a5738c9a3bd4ecafb2 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 9 Jul 2022 11:23:49 -0400 Subject: [PATCH 5/6] Added tests for Index construction --- pandas/tests/indexes/numeric/test_numeric.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 7d2bcdf20c795..23262cb2eb768 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -509,6 +509,20 @@ def test_constructor_coercion_signed_to_unsigned( with pytest.raises(OverflowError, match=msg): Index([-1], 
dtype=any_unsigned_int_numpy_dtype) + def test_constructor_np_signed(self, any_signed_int_numpy_dtype): + # GH#47475 + scalar = np.dtype(any_signed_int_numpy_dtype).type(1) + result = Index([scalar]) + expected = Int64Index([1]) + tm.assert_index_equal(result, expected) + + def test_constructor_np_unsigned(self, any_unsigned_int_numpy_dtype): + # GH#47475 + scalar = np.dtype(any_unsigned_int_numpy_dtype).type(1) + result = Index([scalar]) + expected = UInt64Index([1]) + tm.assert_index_equal(result, expected) + def test_coerce_list(self): # coerce things arr = Index([1, 2, 3, 4]) From 7c1c72efe3345fb1192913c269ca7d6f8858a9b4 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 9 Jul 2022 11:53:42 -0400 Subject: [PATCH 6/6] Added comments --- pandas/_libs/lib.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 27cc167c0aeeb..e353d224708b7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1306,11 +1306,13 @@ cdef class Seen: self.sint_ = ( self.sint_ or (oINT64_MIN <= val < 0) + # Cython equivalent of `isinstance(val, np.signedinteger)` or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type) ) self.uint_ = ( self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) + # Cython equivalent of `isinstance(val, np.unsignedinteger)` or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type) )