diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c23ed006ff637..15c5cc97b8426 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -26,7 +26,7 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_0240.api.datetimelike.normalize +.. _whatsnew_0240.api.datetimelike.normalize: Tick DateOffset Normalize Restrictions -------------------------------------- @@ -73,6 +73,32 @@ Datetimelike API Changes Other API Changes ^^^^^^^^^^^^^^^^^ +.. _whatsnew_0240.api.other.incompatibilities: + +Series and Index Data-Dtype Incompatibilities +--------------------------------------------- + +``Series`` and ``Index`` constructors now raise when the +data is incompatible with a passed ``dtype=`` (:issue:`15832`) + +Previous Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + 0 18446744073709551615 + dtype: uint64 + +Current Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + ... + OverflowError: Trying to coerce negative values to unsigned integers + - :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) - - diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index ebc7a13234a98..65328dfc7347e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -20,6 +20,7 @@ is_dtype_equal, is_float_dtype, is_complex_dtype, is_integer_dtype, + is_unsigned_integer_dtype, is_datetime_or_timedelta_dtype, is_bool_dtype, is_scalar, is_string_dtype, _string_dtypes, @@ -1269,3 +1270,74 @@ def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): subarr = subarr2 return subarr + + +def maybe_cast_to_integer_array(arr, dtype, copy=False): + """ + Takes any dtype and returns the casted version, raising for when data is + incompatible with integer/unsigned integer dtypes. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + arr : array-like + The array to cast. + dtype : str, np.dtype + The integer dtype to cast the array to. + copy: boolean, default False + Whether to make a copy of the array before returning. + + Returns + ------- + int_arr : ndarray + An array of integer or unsigned integer dtype + + Raises + ------ + OverflowError : the dtype is incompatible with the data + ValueError : loss of precision has occurred during casting + + Examples + -------- + If you try to coerce negative values to unsigned integers, it raises: + + >>> Series([-1], dtype="uint64") + Traceback (most recent call last): + ... + OverflowError: Trying to coerce negative values to unsigned integers + + Also, if you try to coerce float values to integers, it raises: + + >>> Series([1, 2, 3.5], dtype="int64") + Traceback (most recent call last): + ... + ValueError: Trying to coerce float values to integers + """ + + try: + if not hasattr(arr, "astype"): + casted = np.array(arr, dtype=dtype, copy=copy) + else: + casted = arr.astype(dtype, copy=copy) + except OverflowError: + raise OverflowError("The elements provided in the data cannot all be " + "casted to the dtype {dtype}".format(dtype=dtype)) + + if np.array_equal(arr, casted): + return casted + + # We do this casting to allow for proper + # data and dtype checking. + # + # We didn't do this earlier because NumPy + # doesn't handle `uint64` correctly. + arr = np.asarray(arr) + + if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): + raise OverflowError("Trying to coerce negative values " + "to unsigned integers") + + if is_integer_dtype(dtype) and (is_float_dtype(arr) or + is_object_dtype(arr)): + raise ValueError("Trying to coerce float values to integers") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 490fd872125ff..4834f799b92eb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -21,6 +21,7 @@ ABCPeriodIndex, ABCTimedeltaIndex, ABCDateOffset) from pandas.core.dtypes.missing import isna, array_equivalent +from pandas.core.dtypes.cast import maybe_cast_to_integer_array from pandas.core.dtypes.common import ( _ensure_int64, _ensure_object, @@ -311,19 +312,16 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if is_integer_dtype(dtype): inferred = lib.infer_dtype(data) if inferred == 'integer': - try: - data = np.array(data, copy=copy, dtype=dtype) - except OverflowError: - # gh-15823: a more user-friendly error message - raise OverflowError( - "the elements provided in the data cannot " - "all be casted to the dtype {dtype}" - .format(dtype=dtype)) + data = maybe_cast_to_integer_array(data, dtype, + copy=copy) elif inferred in ['floating', 'mixed-integer-float']: if isna(data).any(): raise ValueError('cannot convert float ' 'NaN to integer') + if inferred == "mixed-integer-float": + data = maybe_cast_to_integer_array(data, dtype) + # If we are actually all equal to integers, # then coerce to integer. try: @@ -352,7 +350,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, except (TypeError, ValueError) as e: msg = str(e) - if 'cannot convert float' in msg: + if ("cannot convert float" in msg or + "Trying to coerce float values to integer" in msg): raise # maybe coerce to a sub-class diff --git a/pandas/core/series.py b/pandas/core/series.py index 23c4bbe082f28..2f762dff4aeab 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -41,7 +41,8 @@ maybe_cast_to_datetime, maybe_castable, construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, - construct_1d_object_array_from_listlike) + construct_1d_object_array_from_listlike, + maybe_cast_to_integer_array) from pandas.core.dtypes.missing import ( isna, notna, @@ -4068,6 +4069,11 @@ def _try_cast(arr, take_fast_path): return arr try: + # gh-15832: Check if we are requesting a numeric dype and + # that we can convert the data to the requested dtype. + if is_float_dtype(dtype) or is_integer_dtype(dtype): + subarr = maybe_cast_to_integer_array(arr, dtype) + subarr = maybe_cast_to_datetime(arr, dtype) # Take care in creating object arrays (but iterators are not # supported): diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 311c71f734945..533bff0384ad9 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -199,11 +199,11 @@ def test_downcast(self): self._compare(result, expected) def test_constructor_compound_dtypes(self): - # GH 5191 - # compound dtypes should raise not-implementederror + # see gh-5191 + # Compound dtypes should raise NotImplementedError. def f(dtype): - return self._construct(shape=3, dtype=dtype) + return self._construct(shape=3, value=1, dtype=dtype) pytest.raises(NotImplementedError, f, [("A", "datetime64[h]"), ("B", "str"), @@ -534,14 +534,14 @@ def test_truncate_out_of_bounds(self): # small shape = [int(2e3)] + ([1] * (self._ndim - 1)) - small = self._construct(shape, dtype='int8') + small = self._construct(shape, dtype='int8', value=1) self._compare(small.truncate(), small) self._compare(small.truncate(before=0, after=3e3), small) self._compare(small.truncate(before=-1, after=2e3), small) # big shape = [int(2e6)] + ([1] * (self._ndim - 1)) - big = self._construct(shape, dtype='int8') + big = self._construct(shape, dtype='int8', value=1) self._compare(big.truncate(), big) self._compare(big.truncate(before=0, after=3e6), big) self._compare(big.truncate(before=-1, after=2e6), big) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 1d8a958c3413f..daba56e0c1e29 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -486,11 +486,18 @@ def test_constructor_nonhashable_name(self, indices): def test_constructor_overflow_int64(self): # see gh-15832 - msg = ("the elements provided in the data cannot " + msg = ("The elements provided in the data cannot " "all be casted to the dtype int64") with tm.assert_raises_regex(OverflowError, msg): Index([np.iinfo(np.uint64).max - 1], dtype="int64") + @pytest.mark.xfail(reason="see gh-21311: Index " + "doesn't enforce dtype argument") + def test_constructor_cast(self): + msg = "could not convert string to float" + with tm.assert_raises_regex(ValueError, msg): + Index(["a", "b", "c"], dtype=float) + def test_view_with_args(self): restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 49322d9b7abd6..166af4c89877d 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -451,6 +451,18 @@ def test_astype(self): i = Float64Index([0, 1.1, np.NAN]) pytest.raises(ValueError, lambda: i.astype(dtype)) + def test_type_coercion_fail(self, any_int_dtype): + # see gh-15832 + msg = "Trying to coerce float values to integers" + with tm.assert_raises_regex(ValueError, msg): + Index([1, 2, 3.5], dtype=any_int_dtype) + + def test_type_coercion_valid(self, float_dtype): + # There is no Float32Index, so we always + # generate Float64Index. + i = Index([1, 2, 3.5], dtype=float_dtype) + tm.assert_index_equal(i, Index([1, 2, 3.5])) + def test_equals_numeric(self): i = Float64Index([1.0, 2.0]) @@ -862,6 +874,14 @@ def test_constructor_corner(self): with tm.assert_raises_regex(TypeError, 'casting'): Int64Index(arr_with_floats) + def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): + + # see gh-15832 + msg = "Trying to coerce negative values to unsigned integers" + + with tm.assert_raises_regex(OverflowError, msg): + Index([-1], dtype=uint_dtype) + def test_coerce_list(self): # coerce things arr = Index([1, 2, 3, 4]) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index d590cfd6b6c64..f96e7eeb40ea2 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -2047,7 +2047,7 @@ def test_table_values_dtypes_roundtrip(self): assert df1.dtypes[0] == 'float32' # check with mixed dtypes - df1 = DataFrame(dict((c, Series(np.random.randn(5), dtype=c)) + df1 = DataFrame(dict((c, Series(np.random.randint(5), dtype=c)) for c in ['float32', 'float64', 'int32', 'int64', 'int16', 'int8'])) df1['string'] = 'foo' diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 906d2aacd5586..27cfec0dbf20d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -542,12 +542,30 @@ def test_constructor_pass_nan_nat(self): tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) def test_constructor_cast(self): - pytest.raises(ValueError, Series, ['a', 'b', 'c'], dtype=float) + msg = "could not convert string to float" + with tm.assert_raises_regex(ValueError, msg): + Series(["a", "b", "c"], dtype=float) + + def test_constructor_unsigned_dtype_overflow(self, uint_dtype): + # see gh-15832 + msg = 'Trying to coerce negative values to unsigned integers' + with tm.assert_raises_regex(OverflowError, msg): + Series([-1], dtype=uint_dtype) + + def test_constructor_coerce_float_fail(self, any_int_dtype): + # see gh-15832 + msg = "Trying to coerce float values to integers" + with tm.assert_raises_regex(ValueError, msg): + Series([1, 2, 3.5], dtype=any_int_dtype) + + def test_constructor_coerce_float_valid(self, float_dtype): + s = Series([1, 2, 3.5], dtype=float_dtype) + expected = Series([1, 2, 3.5]).astype(float_dtype) + assert_series_equal(s, expected) - def test_constructor_dtype_nocast(self): - # 1572 + def test_constructor_dtype_no_cast(self): + # see gh-1572 s = Series([1, 2, 3]) - s2 = Series(s, dtype=np.int64) s2[1] = 5