API/BUG: Raise when int-dtype coercions fail

gfyoung · gfyoung · commit e895c12c4150 · 2018-06-12T23:22:15.000-07:00
Related to the Index and Series constructors. Closes gh-15832.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -36,6 +36,7 @@ Datetimelike API Changes
 Other API Changes
 ^^^^^^^^^^^^^^^^^
 
+- Series and Index constructors now raise when the data is incompatible with the specified dtype (:issue:`15832`)
 -
 -
 -
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -20,6 +20,7 @@
                      is_dtype_equal,
                      is_float_dtype, is_complex_dtype,
                      is_integer_dtype,
+                     is_unsigned_integer_dtype,
                      is_datetime_or_timedelta_dtype,
                      is_bool_dtype, is_scalar,
                      is_string_dtype, _string_dtypes,
@@ -1269,3 +1270,64 @@ def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
         subarr = subarr2
 
     return subarr
+
+
+def maybe_cast_to_integer_array(arr, dtype, copy=False):
+    """
+    Cast ``arr`` to ``dtype``, raising when an integer cast would be unsafe.
+
+    The cast is attempted first.  If every element survives unchanged the
+    result is returned right away; otherwise the data is inspected for the
+    two known-bad integer coercions (negative values into an unsigned dtype,
+    float/object values into any integer dtype).
+
+    .. versionadded:: 0.24.0
+
+    Parameters
+    ----------
+    arr : ndarray or list-like
+        The data to cast.
+    dtype : str, np.dtype
+        The dtype to cast the data to.
+    copy : boolean, default False
+        Whether to make a copy of the array before returning.  Only
+        forwarded to ``arr.astype``; input without an ``astype`` method is
+        always materialized into a new array.
+
+    Returns
+    -------
+    casted : ndarray
+        ``arr`` converted to ``dtype``.  A conversion that changes values
+        but trips none of the checks below is returned as NumPy produced it.
+
+    Raises
+    ------
+    OverflowError
+        If NumPy overflows while casting, or if negative values would be
+        coerced to an unsigned integer dtype.
+    ValueError
+        If float or object values would be changed by casting them to an
+        integer dtype.
+
+    Examples
+    --------
+    If you try to coerce negative values to unsigned integers, it raises:
+
+    >>> Series([-1], dtype="uint64")
+    Traceback (most recent call last):
+        ...
+    OverflowError: Trying to coerce negative values to unsigned integers
+
+    Also, if you try to coerce float values to integers, it raises:
+
+    >>> Series([1, 2, 3.5], dtype="int64")
+    Traceback (most recent call last):
+        ...
+    ValueError: Trying to coerce float values to integers
+    """
+
+    try:
+        if hasattr(arr, "astype"):
+            casted = arr.astype(dtype, copy=copy)
+        else:
+            # Plain sequences have no ``astype``.  Building straight from
+            # the raw values also lets NumPy raise OverflowError for Python
+            # ints that do not fit ``dtype`` instead of wrapping them.
+            casted = np.array(arr, dtype=dtype)
+    except OverflowError:
+        raise OverflowError("The elements provided in the data cannot all be "
+                            "casted to the dtype {dtype}".format(dtype=dtype))
+
+    # Fast path: nothing changed, so the cast is lossless.
+    if np.array_equal(arr, casted):
+        return casted
+
+    # The checks below need ndarray semantics (``arr < 0``, dtype lookups).
+    arr = np.asarray(arr)
+
+    if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
+        raise OverflowError("Trying to coerce negative values "
+                            "to unsigned integers")
+
+    if is_integer_dtype(dtype) and (is_float_dtype(arr) or
+                                    is_object_dtype(arr)):
+        raise ValueError("Trying to coerce float values to integers")
+
+    # None of the guarded coercions applies (e.g. a float target holding
+    # NaN, which never compares equal to itself).  Return the cast array
+    # rather than falling off the end and handing callers ``None``.
+    return casted
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -21,6 +21,7 @@
     ABCPeriodIndex, ABCTimedeltaIndex,
     ABCDateOffset)
 from pandas.core.dtypes.missing import isna, array_equivalent
+from pandas.core.dtypes.cast import maybe_cast_to_integer_array
 from pandas.core.dtypes.common import (
     _ensure_int64,
     _ensure_object,
@@ -309,19 +310,16 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                     if is_integer_dtype(dtype):
                         inferred = lib.infer_dtype(data)
                         if inferred == 'integer':
-                            try:
-                                data = np.array(data, copy=copy, dtype=dtype)
-                            except OverflowError:
-                                # gh-15823: a more user-friendly error message
-                                raise OverflowError(
-                                    "the elements provided in the data cannot "
-                                    "all be casted to the dtype {dtype}"
-                                    .format(dtype=dtype))
+                            data = maybe_cast_to_integer_array(data, dtype,
+                                                               copy=copy)
                         elif inferred in ['floating', 'mixed-integer-float']:
                             if isna(data).any():
                                 raise ValueError('cannot convert float '
                                                  'NaN to integer')
 
+                            if inferred == "mixed-integer-float":
+                                maybe_cast_to_integer_array(data, dtype)
+
                             # If we are actually all equal to integers,
                             # then coerce to integer.
                             try:
@@ -350,7 +348,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
 
                 except (TypeError, ValueError) as e:
                     msg = str(e)
-                    if 'cannot convert float' in msg:
+                    if ("cannot convert float" in msg or
+                            "Trying to coerce float values to integer" in msg):
                         raise
 
             # maybe coerce to a sub-class
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -41,7 +41,8 @@
     maybe_cast_to_datetime, maybe_castable,
     construct_1d_arraylike_from_scalar,
     construct_1d_ndarray_preserving_na,
-    construct_1d_object_array_from_listlike)
+    construct_1d_object_array_from_listlike,
+    maybe_cast_to_integer_array)
 from pandas.core.dtypes.missing import (
     isna,
     notna,
@@ -4067,6 +4068,9 @@ def _try_cast(arr, take_fast_path):
                 return arr
 
         try:
+            if is_float_dtype(dtype) or is_integer_dtype(dtype):
+                subarr = maybe_cast_to_integer_array(np.asarray(arr), dtype)
+
             subarr = maybe_cast_to_datetime(arr, dtype)
             # Take care in creating object arrays (but iterators are not
             # supported):
diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py
@@ -199,11 +199,11 @@ def test_downcast(self):
         self._compare(result, expected)
 
     def test_constructor_compound_dtypes(self):
-        # GH 5191
-        # compound dtypes should raise not-implementederror
+        # see gh-5191
+        # Compound dtypes should raise NotImplementedError.
 
         def f(dtype):
-            return self._construct(shape=3, dtype=dtype)
+            return self._construct(shape=3, value=1, dtype=dtype)
 
         pytest.raises(NotImplementedError, f, [("A", "datetime64[h]"),
                                                ("B", "str"),
@@ -534,14 +534,14 @@ def test_truncate_out_of_bounds(self):
 
         # small
         shape = [int(2e3)] + ([1] * (self._ndim - 1))
-        small = self._construct(shape, dtype='int8')
+        small = self._construct(shape, dtype='int8', value=1)
         self._compare(small.truncate(), small)
         self._compare(small.truncate(before=0, after=3e3), small)
         self._compare(small.truncate(before=-1, after=2e3), small)
 
         # big
         shape = [int(2e6)] + ([1] * (self._ndim - 1))
-        big = self._construct(shape, dtype='int8')
+        big = self._construct(shape, dtype='int8', value=1)
         self._compare(big.truncate(), big)
         self._compare(big.truncate(before=0, after=3e6), big)
         self._compare(big.truncate(before=-1, after=2e6), big)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -476,7 +476,7 @@ def test_constructor_nonhashable_name(self, indices):
 
     def test_constructor_overflow_int64(self):
         # see gh-15832
-        msg = ("the elements provided in the data cannot "
+        msg = ("The elements provided in the data cannot "
                "all be casted to the dtype int64")
         with tm.assert_raises_regex(OverflowError, msg):
             Index([np.iinfo(np.uint64).max - 1], dtype="int64")
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
@@ -451,6 +451,20 @@ def test_astype(self):
             i = Float64Index([0, 1.1, np.NAN])
             pytest.raises(ValueError, lambda: i.astype(dtype))
 
+    @pytest.mark.parametrize("int_dtype", ["uint8", "uint16", "uint32",
+                                           "uint64", "int32", "int64",
+                                           "int16", "int8"])
+    @pytest.mark.parametrize("float_dtype", ["float16", "float32"])
+    def test_type_coercion(self, int_dtype, float_dtype):
+        """Fractional data raises for integer dtypes, builds for floats."""
+        # see gh-15832
+        data = [1, 2, 3.5]
+
+        # Integer target: construction must fail with the coercion message.
+        expected_msg = "Trying to coerce float values to integers"
+        with tm.assert_raises_regex(ValueError, expected_msg):
+            Index(data, dtype=int_dtype)
+
+        # Float target: construction succeeds and the result compares equal
+        # to the Index built without an explicit dtype.
+        result = Index(data, dtype=float_dtype)
+        tm.assert_index_equal(result, Index(data))
+
+
     def test_equals_numeric(self):
 
         i = Float64Index([1.0, 2.0])
@@ -862,6 +876,16 @@ def test_constructor_corner(self):
         with tm.assert_raises_regex(TypeError, 'casting'):
             Int64Index(arr_with_floats)
 
+    @pytest.mark.parametrize("uint_dtype", ["uint8", "uint16",
+                                            "uint32", "uint64"])
+    def test_constructor_coercion_signed_to_unsigned(self, uint_dtype):
+        """A negative value must not be coerced into an unsigned Index."""
+        # see gh-15832
+        negative_data = [-1]
+        expected_msg = "Trying to coerce negative values to unsigned integers"
+
+        with tm.assert_raises_regex(OverflowError, expected_msg):
+            Index(negative_data, dtype=uint_dtype)
+
+
     def test_coerce_list(self):
         # coerce things
         arr = Index([1, 2, 3, 4])
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -542,12 +542,35 @@ def test_constructor_pass_nan_nat(self):
         tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp)
 
     def test_constructor_cast(self):
-        pytest.raises(ValueError, Series, ['a', 'b', 'c'], dtype=float)
+        msg = "could not convert string to float"
+        with tm.assert_raises_regex(ValueError, msg):
+            Series(["a", "b", "c"], dtype=float)
+
+    @pytest.mark.parametrize("uint_dtype", ["uint8", "uint16",
+                                            "uint32", "uint64"])
+    def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
+        """A negative value must not be coerced into an unsigned Series."""
+        # see gh-15832
+        negative_data = [-1]
+        expected_msg = 'Trying to coerce negative values to unsigned integers'
+
+        with tm.assert_raises_regex(OverflowError, expected_msg):
+            Series(negative_data, dtype=uint_dtype)
+
+    @pytest.mark.parametrize("int_dtype", ["uint8", "uint16", "uint32",
+                                           "uint64", "int32", "int64",
+                                           "int16", "int8"])
+    @pytest.mark.parametrize("float_dtype", ["float16", "float32"])
+    def test_constructor_coerce_float_fail(self, int_dtype, float_dtype):
+        """Fractional data raises for integer dtypes, builds for floats."""
+        # see gh-15832
+        data = [1, 2, 3.5]
+
+        # Integer target: construction must fail with the coercion message.
+        expected_msg = "Trying to coerce float values to integers"
+        with tm.assert_raises_regex(ValueError, expected_msg):
+            Series(data, dtype=int_dtype)
+
+        # Float target: the result matches building with the default dtype
+        # and converting afterwards.
+        result = Series(data, dtype=float_dtype)
+        expected = Series(data).astype(float_dtype)
+        assert_series_equal(result, expected)
 
-    def test_constructor_dtype_nocast(self):
-        # 1572
+    def test_constructor_dtype_no_cast(self):
+        # see gh-1572
         s = Series([1, 2, 3])
-
         s2 = Series(s, dtype=np.int64)
 
         s2[1] = 5

Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,7 @@ Datetimelike API Changes`
`36`	`36`	`Other API Changes`
`37`	`37`	`^^^^^^^^^^^^^^^^^`
`38`	`38`
	`39`	+- Series and Index constructors now raise when the data is incompatible with the specified dtype (:issue:`15832`)
`39`	`40`	`-`
`40`	`41`	`-`
`41`	`42`	`-`