Skip to content

Commit b36b451

Browse files
gfyoung authored and jreback committed
API/BUG: Raise when int-dtype coercions fail (#21456)
Closes gh-15832.
1 parent ec20207 commit b36b451

File tree

9 files changed

+170
-22
lines changed

9 files changed

+170
-22
lines changed

doc/source/whatsnew/v0.24.0.txt

+27-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Other Enhancements
2626
Backwards incompatible API changes
2727
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2828

29-
.. _whatsnew_0240.api.datetimelike.normalize
29+
.. _whatsnew_0240.api.datetimelike.normalize:
3030

3131
Tick DateOffset Normalize Restrictions
3232
--------------------------------------
@@ -73,6 +73,32 @@ Datetimelike API Changes
7373
Other API Changes
7474
^^^^^^^^^^^^^^^^^
7575

76+
.. _whatsnew_0240.api.other.incompatibilities:
77+
78+
Series and Index Data-Dtype Incompatibilities
79+
---------------------------------------------
80+
81+
``Series`` and ``Index`` constructors now raise when the
82+
data is incompatible with a passed ``dtype=`` (:issue:`15832`)
83+
84+
Previous Behavior:
85+
86+
.. code-block:: ipython
87+
88+
In [4]: pd.Series([-1], dtype="uint64")
89+
Out[4]:
90+
0 18446744073709551615
91+
dtype: uint64
92+
93+
Current Behavior:
94+
95+
.. code-block:: ipython
96+
97+
In [4]: pd.Series([-1], dtype="uint64")
98+
Out[4]:
99+
...
100+
OverflowError: Trying to coerce negative values to unsigned integers
101+
76102
- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`)
77103
-
78104
-

pandas/core/dtypes/cast.py

+72
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
is_dtype_equal,
2121
is_float_dtype, is_complex_dtype,
2222
is_integer_dtype,
23+
is_unsigned_integer_dtype,
2324
is_datetime_or_timedelta_dtype,
2425
is_bool_dtype, is_scalar,
2526
is_string_dtype, _string_dtypes,
@@ -1269,3 +1270,74 @@ def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
12691270
subarr = subarr2
12701271

12711272
return subarr
1273+
1274+
1275+
def maybe_cast_to_integer_array(arr, dtype, copy=False):
1276+
"""
1277+
Takes any dtype and returns the casted version, raising when data is
1278+
incompatible with integer/unsigned integer dtypes.
1279+
1280+
.. versionadded:: 0.24.0
1281+
1282+
Parameters
1283+
----------
1284+
arr : array-like
1285+
The array to cast.
1286+
dtype : str, np.dtype
1287+
The integer dtype to cast the array to.
1288+
copy : boolean, default False
1289+
Whether to make a copy of the array before returning.
1290+
1291+
Returns
1292+
-------
1293+
int_arr : ndarray
1294+
An array of integer or unsigned integer dtype
1295+
1296+
Raises
1297+
------
1298+
OverflowError : the dtype is incompatible with the data
1299+
ValueError : loss of precision has occurred during casting
1300+
1301+
Examples
1302+
--------
1303+
If you try to coerce negative values to unsigned integers, it raises:
1304+
1305+
>>> Series([-1], dtype="uint64")
1306+
Traceback (most recent call last):
1307+
...
1308+
OverflowError: Trying to coerce negative values to unsigned integers
1309+
1310+
Also, if you try to coerce float values to integers, it raises:
1311+
1312+
>>> Series([1, 2, 3.5], dtype="int64")
1313+
Traceback (most recent call last):
1314+
...
1315+
ValueError: Trying to coerce float values to integers
1316+
"""
1317+
1318+
try:
1319+
if not hasattr(arr, "astype"):
1320+
casted = np.array(arr, dtype=dtype, copy=copy)
1321+
else:
1322+
casted = arr.astype(dtype, copy=copy)
1323+
except OverflowError:
1324+
raise OverflowError("The elements provided in the data cannot all be "
1325+
"casted to the dtype {dtype}".format(dtype=dtype))
1326+
1327+
if np.array_equal(arr, casted):
1328+
return casted
1329+
1330+
# We do this casting to allow for proper
1331+
# data and dtype checking.
1332+
#
1333+
# We didn't do this earlier because NumPy
1334+
# doesn't handle `uint64` correctly.
1335+
arr = np.asarray(arr)
1336+
1337+
if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
1338+
raise OverflowError("Trying to coerce negative values "
1339+
"to unsigned integers")
1340+
1341+
if is_integer_dtype(dtype) and (is_float_dtype(arr) or
1342+
is_object_dtype(arr)):
1343+
raise ValueError("Trying to coerce float values to integers")

pandas/core/indexes/base.py

+8-9
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
ABCPeriodIndex, ABCTimedeltaIndex,
2222
ABCDateOffset)
2323
from pandas.core.dtypes.missing import isna, array_equivalent
24+
from pandas.core.dtypes.cast import maybe_cast_to_integer_array
2425
from pandas.core.dtypes.common import (
2526
_ensure_int64,
2627
_ensure_object,
@@ -311,19 +312,16 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
311312
if is_integer_dtype(dtype):
312313
inferred = lib.infer_dtype(data)
313314
if inferred == 'integer':
314-
try:
315-
data = np.array(data, copy=copy, dtype=dtype)
316-
except OverflowError:
317-
# gh-15823: a more user-friendly error message
318-
raise OverflowError(
319-
"the elements provided in the data cannot "
320-
"all be casted to the dtype {dtype}"
321-
.format(dtype=dtype))
315+
data = maybe_cast_to_integer_array(data, dtype,
316+
copy=copy)
322317
elif inferred in ['floating', 'mixed-integer-float']:
323318
if isna(data).any():
324319
raise ValueError('cannot convert float '
325320
'NaN to integer')
326321

322+
if inferred == "mixed-integer-float":
323+
data = maybe_cast_to_integer_array(data, dtype)
324+
327325
# If we are actually all equal to integers,
328326
# then coerce to integer.
329327
try:
@@ -352,7 +350,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
352350

353351
except (TypeError, ValueError) as e:
354352
msg = str(e)
355-
if 'cannot convert float' in msg:
353+
if ("cannot convert float" in msg or
354+
"Trying to coerce float values to integer" in msg):
356355
raise
357356

358357
# maybe coerce to a sub-class

pandas/core/series.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@
4141
maybe_cast_to_datetime, maybe_castable,
4242
construct_1d_arraylike_from_scalar,
4343
construct_1d_ndarray_preserving_na,
44-
construct_1d_object_array_from_listlike)
44+
construct_1d_object_array_from_listlike,
45+
maybe_cast_to_integer_array)
4546
from pandas.core.dtypes.missing import (
4647
isna,
4748
notna,
@@ -4068,6 +4069,11 @@ def _try_cast(arr, take_fast_path):
40684069
return arr
40694070

40704071
try:
4072+
# gh-15832: Check if we are requesting a numeric dtype and
4073+
# that we can convert the data to the requested dtype.
4074+
if is_float_dtype(dtype) or is_integer_dtype(dtype):
4075+
subarr = maybe_cast_to_integer_array(arr, dtype)
4076+
40714077
subarr = maybe_cast_to_datetime(arr, dtype)
40724078
# Take care in creating object arrays (but iterators are not
40734079
# supported):

pandas/tests/generic/test_generic.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -199,11 +199,11 @@ def test_downcast(self):
199199
self._compare(result, expected)
200200

201201
def test_constructor_compound_dtypes(self):
202-
# GH 5191
203-
# compound dtypes should raise not-implementederror
202+
# see gh-5191
203+
# Compound dtypes should raise NotImplementedError.
204204

205205
def f(dtype):
206-
return self._construct(shape=3, dtype=dtype)
206+
return self._construct(shape=3, value=1, dtype=dtype)
207207

208208
pytest.raises(NotImplementedError, f, [("A", "datetime64[h]"),
209209
("B", "str"),
@@ -534,14 +534,14 @@ def test_truncate_out_of_bounds(self):
534534

535535
# small
536536
shape = [int(2e3)] + ([1] * (self._ndim - 1))
537-
small = self._construct(shape, dtype='int8')
537+
small = self._construct(shape, dtype='int8', value=1)
538538
self._compare(small.truncate(), small)
539539
self._compare(small.truncate(before=0, after=3e3), small)
540540
self._compare(small.truncate(before=-1, after=2e3), small)
541541

542542
# big
543543
shape = [int(2e6)] + ([1] * (self._ndim - 1))
544-
big = self._construct(shape, dtype='int8')
544+
big = self._construct(shape, dtype='int8', value=1)
545545
self._compare(big.truncate(), big)
546546
self._compare(big.truncate(before=0, after=3e6), big)
547547
self._compare(big.truncate(before=-1, after=2e6), big)

pandas/tests/indexes/test_base.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -486,11 +486,18 @@ def test_constructor_nonhashable_name(self, indices):
486486

487487
def test_constructor_overflow_int64(self):
488488
# see gh-15832
489-
msg = ("the elements provided in the data cannot "
489+
msg = ("The elements provided in the data cannot "
490490
"all be casted to the dtype int64")
491491
with tm.assert_raises_regex(OverflowError, msg):
492492
Index([np.iinfo(np.uint64).max - 1], dtype="int64")
493493

494+
@pytest.mark.xfail(reason="see gh-21311: Index "
495+
"doesn't enforce dtype argument")
496+
def test_constructor_cast(self):
497+
msg = "could not convert string to float"
498+
with tm.assert_raises_regex(ValueError, msg):
499+
Index(["a", "b", "c"], dtype=float)
500+
494501
def test_view_with_args(self):
495502

496503
restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex',

pandas/tests/indexes/test_numeric.py

+20
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,18 @@ def test_astype(self):
451451
i = Float64Index([0, 1.1, np.NAN])
452452
pytest.raises(ValueError, lambda: i.astype(dtype))
453453

454+
def test_type_coercion_fail(self, any_int_dtype):
455+
# see gh-15832
456+
msg = "Trying to coerce float values to integers"
457+
with tm.assert_raises_regex(ValueError, msg):
458+
Index([1, 2, 3.5], dtype=any_int_dtype)
459+
460+
def test_type_coercion_valid(self, float_dtype):
461+
# There is no Float32Index, so we always
462+
# generate Float64Index.
463+
i = Index([1, 2, 3.5], dtype=float_dtype)
464+
tm.assert_index_equal(i, Index([1, 2, 3.5]))
465+
454466
def test_equals_numeric(self):
455467

456468
i = Float64Index([1.0, 2.0])
@@ -862,6 +874,14 @@ def test_constructor_corner(self):
862874
with tm.assert_raises_regex(TypeError, 'casting'):
863875
Int64Index(arr_with_floats)
864876

877+
def test_constructor_coercion_signed_to_unsigned(self, uint_dtype):
878+
879+
# see gh-15832
880+
msg = "Trying to coerce negative values to unsigned integers"
881+
882+
with tm.assert_raises_regex(OverflowError, msg):
883+
Index([-1], dtype=uint_dtype)
884+
865885
def test_coerce_list(self):
866886
# coerce things
867887
arr = Index([1, 2, 3, 4])

pandas/tests/io/test_pytables.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2047,7 +2047,7 @@ def test_table_values_dtypes_roundtrip(self):
20472047
assert df1.dtypes[0] == 'float32'
20482048

20492049
# check with mixed dtypes
2050-
df1 = DataFrame(dict((c, Series(np.random.randn(5), dtype=c))
2050+
df1 = DataFrame(dict((c, Series(np.random.randint(5), dtype=c))
20512051
for c in ['float32', 'float64', 'int32',
20522052
'int64', 'int16', 'int8']))
20532053
df1['string'] = 'foo'

pandas/tests/series/test_constructors.py

+22-4
Original file line numberDiff line numberDiff line change
@@ -542,12 +542,30 @@ def test_constructor_pass_nan_nat(self):
542542
tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp)
543543

544544
def test_constructor_cast(self):
545-
pytest.raises(ValueError, Series, ['a', 'b', 'c'], dtype=float)
545+
msg = "could not convert string to float"
546+
with tm.assert_raises_regex(ValueError, msg):
547+
Series(["a", "b", "c"], dtype=float)
548+
549+
def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
550+
# see gh-15832
551+
msg = 'Trying to coerce negative values to unsigned integers'
552+
with tm.assert_raises_regex(OverflowError, msg):
553+
Series([-1], dtype=uint_dtype)
554+
555+
def test_constructor_coerce_float_fail(self, any_int_dtype):
556+
# see gh-15832
557+
msg = "Trying to coerce float values to integers"
558+
with tm.assert_raises_regex(ValueError, msg):
559+
Series([1, 2, 3.5], dtype=any_int_dtype)
560+
561+
def test_constructor_coerce_float_valid(self, float_dtype):
562+
s = Series([1, 2, 3.5], dtype=float_dtype)
563+
expected = Series([1, 2, 3.5]).astype(float_dtype)
564+
assert_series_equal(s, expected)
546565

547-
def test_constructor_dtype_nocast(self):
548-
# 1572
566+
def test_constructor_dtype_no_cast(self):
567+
# see gh-1572
549568
s = Series([1, 2, 3])
550-
551569
s2 = Series(s, dtype=np.int64)
552570

553571
s2[1] = 5

0 commit comments

Comments
 (0)