API/BUG: Raise when int-dtype coercions fail (#21456)

gfyoung · jreback · commit b36b451a74bc · 2018-06-20T06:35:09.000-04:00
Closes gh-15832.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -26,7 +26,7 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. _whatsnew_0240.api.datetimelike.normalize
+.. _whatsnew_0240.api.datetimelike.normalize:
 
 Tick DateOffset Normalize Restrictions
 --------------------------------------
@@ -73,6 +73,32 @@ Datetimelike API Changes
 Other API Changes
 ^^^^^^^^^^^^^^^^^
 
+.. _whatsnew_0240.api.other.incompatibilities:
+
+Series and Index Data-Dtype Incompatibilities
+---------------------------------------------
+
+``Series`` and ``Index`` constructors now raise when the
+data is incompatible with a passed ``dtype=`` (:issue:`15832`)
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [4]: pd.Series([-1], dtype="uint64")
+    Out[4]:
+    0    18446744073709551615
+    dtype: uint64
+
+Current Behavior:
+
+.. code-block:: ipython
+
+    In [4]: pd.Series([-1], dtype="uint64")
+    Out[4]:
+    ...
+    OverflowError: Trying to coerce negative values to unsigned integers
+
 - :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`)
 -
 -
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -20,6 +20,7 @@
                      is_dtype_equal,
                      is_float_dtype, is_complex_dtype,
                      is_integer_dtype,
+                     is_unsigned_integer_dtype,
                      is_datetime_or_timedelta_dtype,
                      is_bool_dtype, is_scalar,
                      is_string_dtype, _string_dtypes,
@@ -1269,3 +1270,74 @@ def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
         subarr = subarr2
 
     return subarr
+
+
+def maybe_cast_to_integer_array(arr, dtype, copy=False):
+    """
+    Takes any dtype and returns the casted version, raising when the data is
+    incompatible with integer/unsigned integer dtypes.
+
+    .. versionadded:: 0.24.0
+
+    Parameters
+    ----------
+    arr : array-like
+        The array to cast.
+    dtype : str, np.dtype
+        The integer dtype to cast the array to.
+    copy : boolean, default False
+        Whether to make a copy of the array before returning.
+
+    Returns
+    -------
+    int_arr : ndarray
+        An array of integer or unsigned integer dtype
+
+    Raises
+    ------
+    OverflowError : the dtype is incompatible with the data
+    ValueError : loss of precision has occurred during casting
+
+    Examples
+    --------
+    If you try to coerce negative values to unsigned integers, it raises:
+
+    >>> Series([-1], dtype="uint64")
+    Traceback (most recent call last):
+        ...
+    OverflowError: Trying to coerce negative values to unsigned integers
+
+    Also, if you try to coerce float values to integers, it raises:
+
+    >>> Series([1, 2, 3.5], dtype="int64")
+    Traceback (most recent call last):
+        ...
+    ValueError: Trying to coerce float values to integers
+    """
+
+    try:
+        if not hasattr(arr, "astype"):
+            casted = np.array(arr, dtype=dtype, copy=copy)
+        else:
+            casted = arr.astype(dtype, copy=copy)
+    except OverflowError:
+        raise OverflowError("The elements provided in the data cannot all be "
+                            "casted to the dtype {dtype}".format(dtype=dtype))
+
+    if np.array_equal(arr, casted):
+        return casted
+
+    # We do this casting to allow for proper
+    # data and dtype checking.
+    #
+    # We didn't do this earlier because NumPy
+    # doesn't handle `uint64` correctly.
+    arr = np.asarray(arr)
+
+    if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
+        raise OverflowError("Trying to coerce negative values "
+                            "to unsigned integers")
+
+    if is_integer_dtype(dtype) and (is_float_dtype(arr) or
+                                    is_object_dtype(arr)):
+        raise ValueError("Trying to coerce float values to integers")
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -21,6 +21,7 @@
     ABCPeriodIndex, ABCTimedeltaIndex,
     ABCDateOffset)
 from pandas.core.dtypes.missing import isna, array_equivalent
+from pandas.core.dtypes.cast import maybe_cast_to_integer_array
 from pandas.core.dtypes.common import (
     _ensure_int64,
     _ensure_object,
@@ -311,19 +312,16 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                     if is_integer_dtype(dtype):
                         inferred = lib.infer_dtype(data)
                         if inferred == 'integer':
-                            try:
-                                data = np.array(data, copy=copy, dtype=dtype)
-                            except OverflowError:
-                                # gh-15823: a more user-friendly error message
-                                raise OverflowError(
-                                    "the elements provided in the data cannot "
-                                    "all be casted to the dtype {dtype}"
-                                    .format(dtype=dtype))
+                            data = maybe_cast_to_integer_array(data, dtype,
+                                                               copy=copy)
                         elif inferred in ['floating', 'mixed-integer-float']:
                             if isna(data).any():
                                 raise ValueError('cannot convert float '
                                                  'NaN to integer')
 
+                            if inferred == "mixed-integer-float":
+                                data = maybe_cast_to_integer_array(data, dtype)
+
                             # If we are actually all equal to integers,
                             # then coerce to integer.
                             try:
@@ -352,7 +350,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
 
                 except (TypeError, ValueError) as e:
                     msg = str(e)
-                    if 'cannot convert float' in msg:
+                    if ("cannot convert float" in msg or
+                            "Trying to coerce float values to integer" in msg):
                         raise
 
             # maybe coerce to a sub-class
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -41,7 +41,8 @@
     maybe_cast_to_datetime, maybe_castable,
     construct_1d_arraylike_from_scalar,
     construct_1d_ndarray_preserving_na,
-    construct_1d_object_array_from_listlike)
+    construct_1d_object_array_from_listlike,
+    maybe_cast_to_integer_array)
 from pandas.core.dtypes.missing import (
     isna,
     notna,
@@ -4068,6 +4069,11 @@ def _try_cast(arr, take_fast_path):
                 return arr
 
         try:
+            # gh-15832: Check if we are requesting a numeric dtype and
+            # that we can convert the data to the requested dtype.
+            if is_float_dtype(dtype) or is_integer_dtype(dtype):
+                subarr = maybe_cast_to_integer_array(arr, dtype)
+
             subarr = maybe_cast_to_datetime(arr, dtype)
             # Take care in creating object arrays (but iterators are not
             # supported):
diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py
@@ -199,11 +199,11 @@ def test_downcast(self):
         self._compare(result, expected)
 
     def test_constructor_compound_dtypes(self):
-        # GH 5191
-        # compound dtypes should raise not-implementederror
+        # see gh-5191
+        # Compound dtypes should raise NotImplementedError.
 
         def f(dtype):
-            return self._construct(shape=3, dtype=dtype)
+            return self._construct(shape=3, value=1, dtype=dtype)
 
         pytest.raises(NotImplementedError, f, [("A", "datetime64[h]"),
                                                ("B", "str"),
@@ -534,14 +534,14 @@ def test_truncate_out_of_bounds(self):
 
         # small
         shape = [int(2e3)] + ([1] * (self._ndim - 1))
-        small = self._construct(shape, dtype='int8')
+        small = self._construct(shape, dtype='int8', value=1)
         self._compare(small.truncate(), small)
         self._compare(small.truncate(before=0, after=3e3), small)
         self._compare(small.truncate(before=-1, after=2e3), small)
 
         # big
         shape = [int(2e6)] + ([1] * (self._ndim - 1))
-        big = self._construct(shape, dtype='int8')
+        big = self._construct(shape, dtype='int8', value=1)
         self._compare(big.truncate(), big)
         self._compare(big.truncate(before=0, after=3e6), big)
         self._compare(big.truncate(before=-1, after=2e6), big)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -486,11 +486,18 @@ def test_constructor_nonhashable_name(self, indices):
 
     def test_constructor_overflow_int64(self):
         # see gh-15832
-        msg = ("the elements provided in the data cannot "
+        msg = ("The elements provided in the data cannot "
                "all be casted to the dtype int64")
         with tm.assert_raises_regex(OverflowError, msg):
             Index([np.iinfo(np.uint64).max - 1], dtype="int64")
 
+    @pytest.mark.xfail(reason="see gh-21311: Index "
+                              "doesn't enforce dtype argument")
+    def test_constructor_cast(self):
+        msg = "could not convert string to float"
+        with tm.assert_raises_regex(ValueError, msg):
+            Index(["a", "b", "c"], dtype=float)
+
     def test_view_with_args(self):
 
         restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex',
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
@@ -451,6 +451,18 @@ def test_astype(self):
             i = Float64Index([0, 1.1, np.NAN])
             pytest.raises(ValueError, lambda: i.astype(dtype))
 
+    def test_type_coercion_fail(self, any_int_dtype):
+        # see gh-15832
+        msg = "Trying to coerce float values to integers"
+        with tm.assert_raises_regex(ValueError, msg):
+            Index([1, 2, 3.5], dtype=any_int_dtype)
+
+    def test_type_coercion_valid(self, float_dtype):
+        # There is no Float32Index, so we always
+        # generate Float64Index.
+        i = Index([1, 2, 3.5], dtype=float_dtype)
+        tm.assert_index_equal(i, Index([1, 2, 3.5]))
+
     def test_equals_numeric(self):
 
         i = Float64Index([1.0, 2.0])
@@ -862,6 +874,14 @@ def test_constructor_corner(self):
         with tm.assert_raises_regex(TypeError, 'casting'):
             Int64Index(arr_with_floats)
 
+    def test_constructor_coercion_signed_to_unsigned(self, uint_dtype):
+
+        # see gh-15832
+        msg = "Trying to coerce negative values to unsigned integers"
+
+        with tm.assert_raises_regex(OverflowError, msg):
+            Index([-1], dtype=uint_dtype)
+
     def test_coerce_list(self):
         # coerce things
         arr = Index([1, 2, 3, 4])
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
@@ -2047,7 +2047,7 @@ def test_table_values_dtypes_roundtrip(self):
             assert df1.dtypes[0] == 'float32'
 
             # check with mixed dtypes
-            df1 = DataFrame(dict((c, Series(np.random.randn(5), dtype=c))
+            df1 = DataFrame(dict((c, Series(np.random.randint(5), dtype=c))
                                  for c in ['float32', 'float64', 'int32',
                                            'int64', 'int16', 'int8']))
             df1['string'] = 'foo'
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -542,12 +542,30 @@ def test_constructor_pass_nan_nat(self):
         tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp)
 
     def test_constructor_cast(self):
-        pytest.raises(ValueError, Series, ['a', 'b', 'c'], dtype=float)
+        msg = "could not convert string to float"
+        with tm.assert_raises_regex(ValueError, msg):
+            Series(["a", "b", "c"], dtype=float)
+
+    def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
+        # see gh-15832
+        msg = 'Trying to coerce negative values to unsigned integers'
+        with tm.assert_raises_regex(OverflowError, msg):
+            Series([-1], dtype=uint_dtype)
+
+    def test_constructor_coerce_float_fail(self, any_int_dtype):
+        # see gh-15832
+        msg = "Trying to coerce float values to integers"
+        with tm.assert_raises_regex(ValueError, msg):
+            Series([1, 2, 3.5], dtype=any_int_dtype)
+
+    def test_constructor_coerce_float_valid(self, float_dtype):
+        s = Series([1, 2, 3.5], dtype=float_dtype)
+        expected = Series([1, 2, 3.5]).astype(float_dtype)
+        assert_series_equal(s, expected)
 
-    def test_constructor_dtype_nocast(self):
-        # 1572
+    def test_constructor_dtype_no_cast(self):
+        # see gh-1572
         s = Series([1, 2, 3])
-
         s2 = Series(s, dtype=np.int64)
 
         s2[1] = 5