-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
API/BUG: Handling Dtype Coercions in Series/Index (GH 15832) #15859
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 21 commits
676a4e5
e12bca7
9fc617b
8b463cb
43456a5
faa5c5c
1c90e7e
278c2fb
a8cd752
bbdea4b
d2e26ac
3c868a4
20ac5c6
14ed83b
3d0e76f
1726408
1f8e9b7
83cfc5d
417188a
939ae11
86e9d5e
359086d
50950f5
012fb57
35a5ff1
b1e6632
a1033cb
b78f4cc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ | |
is_timedelta64_dtype, is_dtype_equal, | ||
is_float_dtype, is_complex_dtype, | ||
is_integer_dtype, | ||
is_unsigned_integer_dtype, | ||
is_datetime_or_timedelta_dtype, | ||
is_bool_dtype, is_scalar, | ||
_string_dtypes, | ||
|
@@ -1026,3 +1027,28 @@ def find_common_type(types): | |
return np.object | ||
|
||
return np.find_common_type(types, []) | ||
|
||
|
||
def maybe_cast_to_integer(arr, dtype): | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This actually can take any dtype and will return a casted version. It happens to check for integer/unsigned integer dtypes. So pls expand a little. |
||
Find a common data type among the given dtypes. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. be more explicit about what this does. It takes an integer dtype and returns the casted version, raising for an incompat dtype. |
||
|
||
Parameters | ||
---------- | ||
arr : array | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ndarray |
||
dtype : dtype | ||
|
||
Returns | ||
------- | ||
integer or unsigned integer array (or raise if the dtype is incompatible) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a Raises section |
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the spirit of good documentation, let's add some examples here! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Excellent! Nice examples. |
||
""" | ||
|
||
if is_unsigned_integer_dtype(dtype) and (np.asarray(arr) < 0).any(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these are ndarays, so you don't need the |
||
raise OverflowError("Trying to coerce negative values to negative " | ||
"integers") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this then needs to cast, no? e.g. (before the return)
|
||
elif is_integer_dtype(dtype) and is_float_dtype(np.asarray(arr)): | ||
if ((np.asarray(arr) % np.asarray(arr).astype(int)) > 0).any(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for floats, the check is its not an |
||
raise OverflowError("Trying to coerce float values to integers") | ||
else: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here is where you cast, you don't need the else |
||
return arr |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,6 +52,7 @@ | |
from pandas.core.ops import _comp_method_OBJECT_ARRAY | ||
from pandas.core.strings import StringAccessorMixin | ||
from pandas.core.config import get_option | ||
from pandas.core.dtypes.cast import maybe_cast_to_integer | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. put this with the other imports from cast |
||
|
||
|
||
# simplify | ||
|
@@ -212,12 +213,21 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, | |
if is_integer_dtype(dtype): | ||
inferred = lib.infer_dtype(data) | ||
if inferred == 'integer': | ||
data = maybe_cast_to_integer(data, dtype=dtype) | ||
data = np.array(data, copy=copy, dtype=dtype) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we don't need the |
||
elif inferred in ['floating', 'mixed-integer-float']: | ||
if isnull(data).any(): | ||
raise ValueError('cannot convert float ' | ||
'NaN to integer') | ||
|
||
if is_integer_dtype(dtype) and not \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can call |
||
(np.asarray(data).astype(int) == 0).all(): | ||
if ((np.asarray(data) % np.asarray(data). | ||
astype(int)) > 0).any(): | ||
raise OverflowError("Trying to coerce " | ||
"float values to " | ||
"integers") | ||
|
||
# If we are actually all equal to integers, | ||
# then coerce to integer. | ||
try: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,8 @@ | |
from pandas.core.dtypes.cast import ( | ||
maybe_upcast, infer_dtype_from_scalar, | ||
maybe_convert_platform, | ||
maybe_cast_to_datetime, maybe_castable) | ||
maybe_cast_to_datetime, maybe_castable, | ||
maybe_cast_to_integer) | ||
from pandas.core.dtypes.missing import isnull, notnull | ||
|
||
from pandas.core.common import (is_bool_indexer, | ||
|
@@ -2916,9 +2917,11 @@ def _try_cast(arr, take_fast_path): | |
return arr | ||
|
||
try: | ||
subarr = maybe_cast_to_integer(arr, dtype) | ||
subarr = maybe_cast_to_datetime(arr, dtype) | ||
if not is_extension_type(subarr): | ||
subarr = np.array(subarr, dtype=dtype, copy=copy) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am going to move this entire section (e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if you want to move part/all of this ok with that as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
except (ValueError, TypeError): | ||
if is_categorical_dtype(dtype): | ||
subarr = Categorical(arr) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -678,6 +678,17 @@ def test_constructor_corner(self): | |
with tm.assert_raises_regex(TypeError, 'casting'): | ||
Int64Index(arr_with_floats) | ||
|
||
def test_constructor_overflow_coercion_signed_to_unsigned(self): | ||
# GH 15832 | ||
for t in ['uint8', 'uint16', 'uint32', 'uint64']: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's utilize There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ucals : This one has not yet been fully addressed (see |
||
with pytest.raises(OverflowError): | ||
Index([-1], dtype=t) | ||
|
||
def test_constructor_overflow_coercion_float_to_int(self): | ||
# GH 15832 | ||
with pytest.raises(OverflowError): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. run thru all int & uint dtypes here I think we have some other tests that this should be grouped with, that are testing the same thing but with NaN's |
||
Index([1, 2, 3.5], dtype=int) | ||
|
||
def test_coerce_list(self): | ||
# coerce things | ||
arr = Index([1, 2, 3, 4]) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -285,8 +285,10 @@ def test_setitem_index_int64(self): | |
self._assert_setitem_index_conversion(obj, 5, exp_index, np.int64) | ||
|
||
# int + float -> float | ||
exp_index = pd.Index([0, 1, 2, 3, 1.1]) | ||
self._assert_setitem_index_conversion(obj, 1.1, exp_index, np.float64) | ||
with pytest.raises(OverflowError): | ||
exp_index = pd.Index([0, 1, 2, 3, 1.1]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these should not raise, rather the results coerce |
||
self._assert_setitem_index_conversion(obj, 1.1, exp_index, | ||
np.float64) | ||
|
||
# int + object -> object | ||
exp_index = pd.Index([0, 1, 2, 3, 'x']) | ||
|
@@ -373,8 +375,9 @@ def test_insert_index_int64(self): | |
self._assert_insert_conversion(obj, 1, exp, np.int64) | ||
|
||
# int + float -> float | ||
exp = pd.Index([1, 1.1, 2, 3, 4]) | ||
self._assert_insert_conversion(obj, 1.1, exp, np.float64) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same, this is a legit operation |
||
with pytest.raises(OverflowError): | ||
exp = pd.Index([1, 1.1, 2, 3, 4]) | ||
self._assert_insert_conversion(obj, 1.1, exp, np.float64) | ||
|
||
# int + bool -> int | ||
exp = pd.Index([1, 0, 2, 3, 4]) | ||
|
@@ -622,7 +625,8 @@ def test_where_series_int64(self): | |
self._where_int64_common(pd.Series) | ||
|
||
def test_where_index_int64(self): | ||
self._where_int64_common(pd.Index) | ||
with pytest.raises(OverflowError): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same, remove the tests changes here |
||
self._where_int64_common(pd.Index) | ||
|
||
def _where_float64_common(self, klass): | ||
obj = klass([1.1, 2.2, 3.3, 4.4]) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2080,24 +2080,26 @@ def test_table_values_dtypes_roundtrip(self): | |
assert df1.dtypes[0] == 'float32' | ||
|
||
# check with mixed dtypes | ||
df1 = DataFrame(dict([(c, Series(np.random.randn(5), dtype=c)) | ||
for c in ['float32', 'float64', 'int32', | ||
'int64', 'int16', 'int8']])) | ||
df1['string'] = 'foo' | ||
df1['float322'] = 1. | ||
df1['float322'] = df1['float322'].astype('float32') | ||
df1['bool'] = df1['float32'] > 0 | ||
df1['time1'] = Timestamp('20130101') | ||
df1['time2'] = Timestamp('20130102') | ||
|
||
store.append('df_mixed_dtypes1', df1) | ||
result = store.select('df_mixed_dtypes1').get_dtype_counts() | ||
expected = Series({'float32': 2, 'float64': 1, 'int32': 1, | ||
'bool': 1, 'int16': 1, 'int8': 1, | ||
'int64': 1, 'object': 1, 'datetime64[ns]': 2}) | ||
result = result.sort_index() | ||
result = expected.sort_index() | ||
tm.assert_series_equal(result, expected) | ||
with pytest.raises(OverflowError): | ||
df1 = DataFrame(dict([(c, Series(np.random.randn(5), dtype=c)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. huh? why are you having this raise. this is a legitimate operation. pls revert this. |
||
for c in ['float32', 'float64', 'int32', | ||
'int64', 'int16', 'int8']])) | ||
df1['string'] = 'foo' | ||
df1['float322'] = 1. | ||
df1['float322'] = df1['float322'].astype('float32') | ||
df1['bool'] = df1['float32'] > 0 | ||
df1['time1'] = Timestamp('20130101') | ||
df1['time2'] = Timestamp('20130102') | ||
|
||
store.append('df_mixed_dtypes1', df1) | ||
result = store.select('df_mixed_dtypes1').get_dtype_counts() | ||
expected = Series({'float32': 2, 'float64': 1, 'int32': 1, | ||
'bool': 1, 'int16': 1, 'int8': 1, | ||
'int64': 1, 'object': 1, | ||
'datetime64[ns]': 2}) | ||
result = result.sort_index() | ||
result = expected.sort_index() | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_table_mixed_dtypes(self): | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,26 @@ | ||
# coding=utf-8 | ||
# pylint: disable-msg=E1101,W0612 | ||
|
||
import pytest | ||
|
||
from datetime import datetime, timedelta | ||
|
||
from numpy import nan | ||
import numpy as np | ||
import numpy.ma as ma | ||
import pandas as pd | ||
|
||
from pandas.core.dtypes.common import ( | ||
is_categorical_dtype, | ||
is_datetime64tz_dtype) | ||
import pytest | ||
from numpy import nan | ||
from pandas import (Index, Series, isnull, date_range, | ||
NaT, period_range, MultiIndex, IntervalIndex) | ||
from pandas.core.indexes.datetimes import Timestamp, DatetimeIndex | ||
from pandas import compat | ||
from pandas.compat import lrange, range, zip, OrderedDict, long | ||
|
||
import pandas.util.testing as tm | ||
from pandas._libs import lib | ||
from pandas._libs.tslib import iNaT | ||
|
||
from pandas.compat import lrange, range, zip, OrderedDict, long | ||
from pandas import compat | ||
from pandas.core.dtypes.common import ( | ||
is_categorical_dtype, | ||
is_datetime64tz_dtype) | ||
from pandas.core.indexes.datetimes import Timestamp, DatetimeIndex | ||
from pandas.util.testing import assert_series_equal | ||
import pandas.util.testing as tm | ||
|
||
from .common import TestData | ||
|
||
|
||
|
@@ -888,3 +884,14 @@ def test_constructor_generic_timestamp_deprecated(self): | |
msg = "cannot convert datetimelike" | ||
with tm.assert_raises_regex(TypeError, msg): | ||
Series([], dtype='M8[ps]') | ||
|
||
def test_constructor_overflow_coercion_signed_to_unsigned(self): | ||
# GH 15832 | ||
for t in ['uint8', 'uint16', 'uint32', 'uint64']: | ||
with pytest.raises(OverflowError): | ||
Series([-1], dtype=t) | ||
|
||
def test_constructor_overflow_coercion_float_to_int(self): | ||
# GH 15832 | ||
with pytest.raises(OverflowError): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same as above, move with related tests & test all uint & int dtypes. |
||
Series([1, 2, 3.5], dtype=int) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
rename to
maybe_cast_to_integer_array
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback : Perhaps we could just generalize to accept scalar as well as
array
(as consistent with other casting functions in this module) and keep the name as is?