diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index a6b6d704737bd..7b8312d25641d 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -54,7 +54,8 @@ Backwards incompatible API changes
 
 Other API Changes
 ^^^^^^^^^^^^^^^^^
 
+- Series and Index constructors now raise when data is incompatible with a passed ``dtype=`` kwarg (:issue:`15832`)
 - Moved definition of ``MergeError`` to the ``pandas.errors`` module.
 
 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index fd61813a57c98..56235a99d5f02 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -16,6 +16,7 @@
                      is_timedelta64_dtype, is_dtype_equal,
                      is_float_dtype, is_complex_dtype,
                      is_integer_dtype,
+                     is_unsigned_integer_dtype,
                      is_datetime_or_timedelta_dtype,
                      is_bool_dtype, is_scalar,
                      _string_dtypes,
@@ -1026,3 +1027,64 @@ def find_common_type(types):
         return np.object
 
     return np.find_common_type(types, [])
+
+
+def maybe_cast_to_integer_array(arr, dtype, copy=False):
+    """
+    Cast an array to ``dtype``, raising when the data is incompatible
+    with the requested integer/unsigned integer dtype.
+
+    .. versionadded:: 0.21.0
+
+    Parameters
+    ----------
+    arr : ndarray
+        The array to cast.
+    dtype : np.dtype
+        The dtype to cast ``arr`` to.
+    copy : boolean, default False
+        Passed through to ``arr.astype``.
+
+    Returns
+    -------
+    casted : ndarray
+        ``arr`` cast to ``dtype``.
+
+    Raises
+    ------
+    OverflowError
+        * If negative values would be coerced to an unsigned integer dtype
+    ValueError
+        * If coercion from float to integer loses precision
+
+    Examples
+    --------
+    If you try to coerce negative values to unsigned integers, it raises:
+
+    >>> Series([-1], dtype='uint64')
+    Traceback (most recent call last):
+        ...
+    OverflowError: Trying to coerce negative values to unsigned integers
+
+    Also, if you try to coerce float values to integers, it raises:
+
+    >>> Series([1, 2, 3.5], dtype='int64')
+    Traceback (most recent call last):
+        ...
+    ValueError: Trying to coerce float values to integers
+    """
+    casted = arr.astype(dtype, copy=copy)
+    if np.array(arr == casted).all():
+        return casted
+
+    if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
+        raise OverflowError("Trying to coerce negative values to unsigned "
+                            "integers")
+
+    if is_integer_dtype(dtype) and (is_float_dtype(arr) or
+                                    is_object_dtype(arr)):
+        raise ValueError("Trying to coerce float values to integers")
+
+    # None of the checks above fired: hand back the casted values instead
+    # of implicitly returning None to callers that use the result.
+    return casted
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 2af4f112ca941..0c69508146e1a 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -15,6 +15,7 @@
 
 from pandas.core.dtypes.generic import ABCSeries, ABCMultiIndex, ABCPeriodIndex
 from pandas.core.dtypes.missing import isnull, array_equivalent
+from pandas.core.dtypes.cast import maybe_cast_to_integer_array
 from pandas.core.dtypes.common import (
     _ensure_int64,
     _ensure_object,
@@ -212,11 +213,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                     if is_integer_dtype(dtype):
                         inferred = lib.infer_dtype(data)
                         if inferred == 'integer':
-                            data = np.array(data, copy=copy, dtype=dtype)
+                            data = maybe_cast_to_integer_array(data, dtype,
+                                                               copy=copy)
                         elif inferred in ['floating', 'mixed-integer-float']:
                             if isnull(data).any():
                                 raise ValueError('cannot convert float '
                                                  'NaN to integer')
+                            if inferred == 'mixed-integer-float':
+                                maybe_cast_to_integer_array(data, dtype)
 
                             # If we are actually all equal to integers,
                             # then coerce to integer.
@@ -246,7 +250,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
 
                 except (TypeError, ValueError) as e:
                     msg = str(e)
-                    if 'cannot convert float' in msg:
+                    if ('cannot convert float' in msg or
+                        'Trying to coerce float values to integer' in msg):
                         raise
 
                 # maybe coerce to a sub-class
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 129f291e5f843..9d16619c47f3a 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -35,7 +35,8 @@
 from pandas.core.dtypes.cast import (
     maybe_upcast, infer_dtype_from_scalar,
     maybe_convert_platform,
-    maybe_cast_to_datetime, maybe_castable)
+    maybe_cast_to_datetime, maybe_castable,
+    maybe_cast_to_integer_array)
 from pandas.core.dtypes.missing import isnull, notnull
 
 from pandas.core.common import (is_bool_indexer,
@@ -2941,9 +2942,13 @@ def _try_cast(arr, take_fast_path):
                 return arr
 
         try:
+            # GH 15832: called only for the errors it raises on lossy casts
+            if is_float_dtype(dtype) or is_integer_dtype(dtype):
+                maybe_cast_to_integer_array(np.asarray(arr), dtype)
+
             subarr = maybe_cast_to_datetime(arr, dtype)
             if not is_extension_type(subarr):
                 subarr = np.array(subarr, dtype=dtype, copy=copy)
         except (ValueError, TypeError):
             if is_categorical_dtype(dtype):
                 subarr = Categorical(arr)
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index 3d06f1672ae32..16db27cd7e2d2 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -304,6 +304,20 @@ def test_astype(self):
             i = Float64Index([0, 1.1, np.NAN])
             pytest.raises(ValueError, lambda: i.astype(dtype))
 
+    @pytest.mark.parametrize("int_dtype", ['uint8', 'uint16', 'uint32',
+                                           'uint64', 'int32', 'int64', 'int16',
+                                           'int8'])
+    @pytest.mark.parametrize("float_dtype", ['float16', 'float32'])
+    def test_type_coercion(self, int_dtype, float_dtype):
+
+        # GH 15832
+        msg = 'Trying to coerce float values to integers'
+        with tm.assert_raises_regex(ValueError, msg):
+            Index([1, 2, 3.5], dtype=int_dtype)
+
+        i = Index([1, 2, 3.5], dtype=float_dtype)
+        tm.assert_index_equal(i, Index([1, 2, 3.5]))
+
     def test_equals_numeric(self):
 
         i = Float64Index([1.0, 2.0])
@@ -678,6 +692,13 @@ def test_constructor_corner(self):
         with tm.assert_raises_regex(TypeError, 'casting'):
             Int64Index(arr_with_floats)
 
+    @pytest.mark.parametrize("uints", ['uint8', 'uint16', 'uint32', 'uint64'])
+    def test_constructor_overflow_coercion_signed_to_unsigned(self, uints):
+        # GH 15832
+        msg = 'Trying to coerce negative values to unsigned integers'
+        with tm.assert_raises_regex(OverflowError, msg):
+            Index([-1], dtype=uints)
+
     def test_coerce_list(self):
         # coerce things
         arr = Index([1, 2, 3, 4])
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
index 17f524cc279c0..5d5aefa3fad37 100644
--- a/pandas/tests/io/test_pytables.py
+++ b/pandas/tests/io/test_pytables.py
@@ -2080,7 +2080,7 @@ def test_table_values_dtypes_roundtrip(self):
             assert df1.dtypes[0] == 'float32'
 
             # check with mixed dtypes
-            df1 = DataFrame(dict([(c, Series(np.random.randn(5), dtype=c))
+            df1 = DataFrame(dict([(c, Series(np.random.randn(5).astype(c)))
                                   for c in ['float32', 'float64', 'int32',
                                             'int64', 'int16', 'int8']]))
             df1['string'] = 'foo'
@@ -2094,7 +2094,8 @@ def test_table_values_dtypes_roundtrip(self):
             result = store.select('df_mixed_dtypes1').get_dtype_counts()
             expected = Series({'float32': 2, 'float64': 1, 'int32': 1,
                                'bool': 1, 'int16': 1, 'int8': 1,
-                               'int64': 1, 'object': 1, 'datetime64[ns]': 2})
+                               'int64': 1, 'object': 1,
+                               'datetime64[ns]': 2})
             result = result.sort_index()
-            result = expected.sort_index()
+            expected = expected.sort_index()
             tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index d591aa4f567a9..691cb60a102f4 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -1,30 +1,26 @@
 # coding=utf-8
 # pylint: disable-msg=E1101,W0612
 
-import pytest
-
 from datetime import datetime, timedelta
 
-from numpy import nan
 import numpy as np
 import numpy.ma as ma
 import pandas as pd
-
-from pandas.core.dtypes.common import (
-    is_categorical_dtype,
-    is_datetime64tz_dtype)
+import pytest
+from numpy import nan
 from pandas import (Index, Series, isnull, date_range,
                     NaT, period_range, MultiIndex, IntervalIndex)
-from pandas.core.indexes.datetimes import Timestamp, DatetimeIndex
+from pandas import compat
+from pandas.compat import lrange, range, zip, OrderedDict, long
 
+import pandas.util.testing as tm
 from pandas._libs import lib
 from pandas._libs.tslib import iNaT
-
-from pandas.compat import lrange, range, zip, OrderedDict, long
-from pandas import compat
+from pandas.core.dtypes.common import (
+    is_categorical_dtype,
+    is_datetime64tz_dtype)
+from pandas.core.indexes.datetimes import Timestamp, DatetimeIndex
 from pandas.util.testing import assert_series_equal
-import pandas.util.testing as tm
-
 from .common import TestData
 
 
@@ -301,12 +297,35 @@ def test_constructor_pass_nan_nat(self):
         tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp)
 
     def test_constructor_cast(self):
-        pytest.raises(ValueError, Series, ['a', 'b', 'c'], dtype=float)
+        msg = "could not convert string to float"
+        with tm.assert_raises_regex(ValueError, msg):
+            Series(['a', 'b', 'c'], dtype=float)
+
+    @pytest.mark.parametrize("unsigned_integers", ['uint8', 'uint16', 'uint32',
+                                                   'uint64'])
+    def test_constructor_unsigned_dtype_overflow(self, unsigned_integers):
+        # GH 15832
+        msg = 'Trying to coerce negative values to unsigned integers'
+        with tm.assert_raises_regex(OverflowError, msg):
+            Series([-1], dtype=unsigned_integers)
+
+    @pytest.mark.parametrize("integers", ['uint8', 'uint16', 'uint32',
+                                          'uint64', 'int32', 'int64', 'int16',
+                                          'int8'])
+    @pytest.mark.parametrize("floats", ['float16', 'float32'])
+    def test_constructor_coerce_float_fail(self, integers, floats):
+        # GH 15832
+        msg = 'Trying to coerce float values to integers'
+        with tm.assert_raises_regex(ValueError, msg):
+            Series([1, 2, 3.5], dtype=integers)
+
+        s = Series([1, 2, 3.5], dtype=floats)
+        expected = Series([1, 2, 3.5]).astype(floats)
+        assert_series_equal(s, expected)
 
     def test_constructor_dtype_nocast(self):
         # 1572
         s = Series([1, 2, 3])
-
         s2 = Series(s, dtype=np.int64)
 
         s2[1] = 5