-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: add downcast to pd.to_numeric #13425
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -291,6 +291,83 @@ def test_non_hashable(self): | |
with self.assertRaisesRegexp(TypeError, "Invalid object type"): | ||
pd.to_numeric(s) | ||
|
||
def test_downcast(self): | ||
# see gh-13352 | ||
mixed_data = ['1', 2, 3] | ||
int_data = [1, 2, 3] | ||
date_data = np.array(['1970-01-02', '1970-01-03', | ||
'1970-01-04'], dtype='datetime64[D]') | ||
|
||
invalid_downcast = 'unsigned-integer' | ||
msg = 'invalid downcasting method provided' | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you test this on windows? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If testing didn't take forever on Appveyor, I would suggest that we have an Appveyor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you just need to create an account and it will work There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suppose, but I mention it because it isn't a requirement before merging a PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it doesn't matter because we get notified if something does fail - generally the differences are pretty small and we have a way more comprehensive test suite than numpy - it takes more time because it tests a heck of a lot more things and configs There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair enough. As I said, if |
||
smallest_int_dtype = np.dtype(np.typecodes['Integer'][0]) | ||
smallest_uint_dtype = np.dtype(np.typecodes['UnsignedInteger'][0]) | ||
|
||
# support for float dtypes below np.float32 is few and far between | ||
float_32_char = np.dtype(np.float32).char | ||
smallest_float_dtype = float_32_char | ||
|
||
for data in (mixed_data, int_data, date_data): | ||
with self.assertRaisesRegexp(ValueError, msg): | ||
pd.to_numeric(data, downcast=invalid_downcast) | ||
|
||
expected = np.array([1, 2, 3], dtype=np.int64) | ||
|
||
res = pd.to_numeric(data) | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
res = pd.to_numeric(data, downcast=None) | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
expected = np.array([1, 2, 3], dtype=smallest_int_dtype) | ||
|
||
for signed_downcast in ('integer', 'signed'): | ||
res = pd.to_numeric(data, downcast=signed_downcast) | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
expected = np.array([1, 2, 3], dtype=smallest_uint_dtype) | ||
res = pd.to_numeric(data, downcast='unsigned') | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
expected = np.array([1, 2, 3], dtype=smallest_float_dtype) | ||
res = pd.to_numeric(data, downcast='float') | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
# if we can't successfully cast the given | ||
# data to a numeric dtype, do not bother | ||
# with the downcast parameter | ||
data = ['foo', 2, 3] | ||
expected = np.array(data, dtype=object) | ||
res = pd.to_numeric(data, errors='ignore', | ||
downcast='unsigned') | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
# cannot cast to an unsigned integer because | ||
# we have a negative number | ||
data = ['-1', 2, 3] | ||
expected = np.array([-1, 2, 3], dtype=np.int64) | ||
res = pd.to_numeric(data, downcast='unsigned') | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
# cannot cast to an integer (signed or unsigned) | ||
# because we have a float number | ||
data = ['1.1', 2, 3] | ||
expected = np.array([1.1, 2, 3], dtype=np.float64) | ||
|
||
for downcast in ('integer', 'signed', 'unsigned'): | ||
res = pd.to_numeric(data, downcast=downcast) | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
# the smallest integer dtype need not be np.(u)int8 | ||
data = ['256', 257, 258] | ||
|
||
for downcast, expected_dtype in zip( | ||
['integer', 'signed', 'unsigned'], | ||
[np.int16, np.int16, np.uint16]): | ||
expected = np.array([256, 257, 258], dtype=expected_dtype) | ||
res = pd.to_numeric(data, downcast=downcast) | ||
tm.assert_numpy_array_equal(res, expected) | ||
|
||
if __name__ == '__main__': | ||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,7 +50,7 @@ def compose(*funcs): | |
return reduce(_compose2, funcs) | ||
|
||
|
||
def to_numeric(arg, errors='raise'): | ||
def to_numeric(arg, errors='raise', downcast=None): | ||
""" | ||
Convert argument to a numeric type. | ||
|
||
|
@@ -61,6 +61,27 @@ def to_numeric(arg, errors='raise'): | |
- If 'raise', then invalid parsing will raise an exception | ||
- If 'coerce', then invalid parsing will be set as NaN | ||
- If 'ignore', then invalid parsing will return the input | ||
downcast : {'integer', 'signed', 'unsigned', 'float'}, default None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lgtm. I am ok with both integer and signed. |
||
If not None, and if the data has been successfully cast to a | ||
numerical dtype (or if the data was numeric to begin with), | ||
downcast that resulting data to the smallest numerical dtype | ||
possible according to the following rules: | ||
|
||
- 'integer' or 'signed': smallest signed int dtype (min.: np.int8) | ||
- 'unsigned': smallest unsigned int dtype (min.: np.uint8) | ||
- 'float': smallest float dtype (min.: np.float32) | ||
|
||
As this behaviour is separate from the core conversion to | ||
numeric values, any errors raised during the downcasting | ||
will be surfaced regardless of the value of the 'errors' input. | ||
|
||
In addition, downcasting will only occur if the size | ||
of the resulting data's dtype is strictly larger than | ||
that of the dtype it is to be cast to, so if none of the dtypes | ||
checked satisfies that condition, no downcasting will be | ||
performed on the data. | ||
|
||
.. versionadded:: 0.19.0 | ||
|
||
Returns | ||
------- | ||
|
@@ -74,10 +95,37 @@ def to_numeric(arg, errors='raise'): | |
>>> import pandas as pd | ||
>>> s = pd.Series(['1.0', '2', -3]) | ||
>>> pd.to_numeric(s) | ||
0 1.0 | ||
1 2.0 | ||
2 -3.0 | ||
dtype: float64 | ||
>>> pd.to_numeric(s, downcast='float') | ||
0 1.0 | ||
1 2.0 | ||
2 -3.0 | ||
dtype: float32 | ||
>>> pd.to_numeric(s, downcast='signed') | ||
0 1 | ||
1 2 | ||
2 -3 | ||
dtype: int8 | ||
>>> s = pd.Series(['apple', '1.0', '2', -3]) | ||
>>> pd.to_numeric(s, errors='ignore') | ||
0 apple | ||
1 1.0 | ||
2 2 | ||
3 -3 | ||
dtype: object | ||
>>> pd.to_numeric(s, errors='coerce') | ||
0 NaN | ||
1 1.0 | ||
2 2.0 | ||
3 -3.0 | ||
dtype: float64 | ||
""" | ||
if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): | ||
raise ValueError('invalid downcasting method provided') | ||
|
||
is_series = False | ||
is_index = False | ||
is_scalar = False | ||
|
@@ -102,20 +150,51 @@ def to_numeric(arg, errors='raise'): | |
else: | ||
values = arg | ||
|
||
if com.is_numeric_dtype(values): | ||
pass | ||
elif com.is_datetime_or_timedelta_dtype(values): | ||
values = values.astype(np.int64) | ||
else: | ||
values = com._ensure_object(values) | ||
coerce_numeric = False if errors in ('ignore', 'raise') else True | ||
try: | ||
if com.is_numeric_dtype(values): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add some asv's for this? e.g. hit the cases shown here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, done. |
||
pass | ||
elif com.is_datetime_or_timedelta_dtype(values): | ||
values = values.astype(np.int64) | ||
else: | ||
values = com._ensure_object(values) | ||
coerce_numeric = False if errors in ('ignore', 'raise') else True | ||
|
||
try: | ||
values = lib.maybe_convert_numeric(values, set(), | ||
coerce_numeric=coerce_numeric) | ||
except: | ||
if errors == 'raise': | ||
raise | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this part of the routine should do in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Which part? The box you selected doesn't tell me anything AFAIU. |
||
except Exception: | ||
if errors == 'raise': | ||
raise | ||
|
||
# attempt downcast only if the data has been successfully converted | ||
# to a numerical dtype and if a downcast method has been specified | ||
if downcast is not None and com.is_numeric_dtype(values): | ||
typecodes = None | ||
|
||
if downcast in ('integer', 'signed'): | ||
typecodes = np.typecodes['Integer'] | ||
elif downcast == 'unsigned' and np.min(values) > 0: | ||
typecodes = np.typecodes['UnsignedInteger'] | ||
elif downcast == 'float': | ||
typecodes = np.typecodes['Float'] | ||
|
||
# pandas support goes only to np.float32, | ||
# as float dtypes smaller than that are | ||
# extremely rare and not well supported | ||
float_32_char = np.dtype(np.float32).char | ||
float_32_ind = typecodes.index(float_32_char) | ||
typecodes = typecodes[float_32_ind:] | ||
|
||
if typecodes is not None: | ||
# from smallest to largest | ||
for dtype in typecodes: | ||
if np.dtype(dtype).itemsize < values.dtype.itemsize: | ||
values = com._possibly_downcast_to_dtype( | ||
values, dtype) | ||
|
||
# successful conversion | ||
if values.dtype == dtype: | ||
break | ||
|
||
if is_series: | ||
return pd.Series(values, index=arg.index, name=arg.name) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just use lists here - no need to use a numpy array
also say these operate on 1dim things (or scalars)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fair enough. Done.