Skip to content

PERF: to_numeric for numeric dtypes #12777

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion asv_bench/benchmarks/miscellaneous.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,26 @@ def prop(self):
self.obj = Foo()

def time_misc_cache_readonly(self):
self.obj.prop
self.obj.prop


class to_numeric(object):
    """ASV benchmarks for ``pd.to_numeric`` across input kinds."""
    goal_time = 0.2

    def setup(self):
        # String parsing is the slow path, so the purely-string fixture is
        # 100x smaller than the already-numeric one.
        self.n = 10000
        self.float = Series(np.random.randn(100 * self.n))
        self.numstr = self.float.astype('str')
        self.str = Series(tm.makeStringIndex(self.n))

    def time_from_float(self):
        # already-numeric input: should hit the pass-through fast path
        pd.to_numeric(self.float)

    def time_from_numeric_str(self):
        # numeric-looking strings: full parse required
        pd.to_numeric(self.numstr)

    def time_from_str_ignore(self):
        # unparseable strings returned unchanged
        pd.to_numeric(self.str, errors='ignore')

    def time_from_str_coerce(self):
        # unparseable strings coerced to NaN
        pd.to_numeric(self.str, errors='coerce')
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,8 @@ Performance Improvements
- Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`)
- Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`).
- Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`)
- Improved performance of ``to_numeric`` when the input is already of numeric dtype (:issue:`12777`)




Expand Down Expand Up @@ -516,3 +518,6 @@ Bug Fixes
- Bug in ``.describe()`` resets categorical columns information (:issue:`11558`)
- Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
- ``pd.read_excel()`` now accepts column names associated with keyword argument ``names`` (:issue:`12870`)
- Bug in ``to_numeric`` with ``Index`` input returning ``np.ndarray`` rather than ``Index`` (:issue:`12777`)
- Bug in ``to_numeric`` where datetime-like input could raise ``TypeError`` (:issue:`12777`)
- Bug in ``to_numeric`` where scalar input raised ``ValueError`` (:issue:`12777`)
137 changes: 135 additions & 2 deletions pandas/tools/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import locale
import codecs
import nose
from nose.tools import assert_raises

import numpy as np
from numpy.testing import assert_equal
Expand Down Expand Up @@ -102,9 +101,25 @@ def test_series(self):
res = to_numeric(s)
tm.assert_series_equal(res, expected)

def test_series_numeric(self):
s = pd.Series([1, 3, 4, 5], index=list('ABCD'), name='XXX')
res = to_numeric(s)
tm.assert_series_equal(res, s)

s = pd.Series([1., 3., 4., 5.], index=list('ABCD'), name='XXX')
res = to_numeric(s)
tm.assert_series_equal(res, s)

# bool is regarded as numeric
s = pd.Series([True, False, True, True],
index=list('ABCD'), name='XXX')
res = to_numeric(s)
tm.assert_series_equal(res, s)

def test_error(self):
s = pd.Series([1, -3.14, 'apple'])
assert_raises(ValueError, to_numeric, s, errors='raise')
with tm.assertRaises(ValueError):
to_numeric(s, errors='raise')

res = to_numeric(s, errors='ignore')
expected = pd.Series([1, -3.14, 'apple'])
Expand All @@ -114,12 +129,40 @@ def test_error(self):
expected = pd.Series([1, -3.14, np.nan])
tm.assert_series_equal(res, expected)

def test_error_seen_bool(self):
    """Mixed bool/str input: raise, pass through, or coerce per ``errors``."""
    s = pd.Series([True, False, 'apple'])

    # errors='raise': the non-numeric string is fatal
    with tm.assertRaises(ValueError):
        to_numeric(s, errors='raise')

    # errors='ignore': the input comes back untouched
    tm.assert_series_equal(to_numeric(s, errors='ignore'),
                           pd.Series([True, False, 'apple']))

    # errors='coerce': bools become floats, the string becomes NaN
    tm.assert_series_equal(to_numeric(s, errors='coerce'),
                           pd.Series([1., 0., np.nan]))

def test_list(self):
s = ['1', '-3.14', '7']
res = to_numeric(s)
expected = np.array([1, -3.14, 7])
tm.assert_numpy_array_equal(res, expected)

def test_list_numeric(self):
s = [1, 3, 4, 5]
res = to_numeric(s)
tm.assert_numpy_array_equal(res, np.array(s))

s = [1., 3., 4., 5.]
res = to_numeric(s)
tm.assert_numpy_array_equal(res, np.array(s))

# bool is regarded as numeric
s = [True, False, True, True]
res = to_numeric(s)
tm.assert_numpy_array_equal(res, np.array(s))

def test_numeric(self):
s = pd.Series([1, -3.14, 7], dtype='O')
res = to_numeric(s)
Expand All @@ -145,6 +188,96 @@ def test_type_check(self):
with tm.assertRaisesRegexp(TypeError, "1-d array"):
to_numeric(df, errors=errors)

def test_scalar(self):
    """Scalars: numbers pass through, numeric strings are parsed."""
    for value in (1, 1.1):
        self.assertEqual(pd.to_numeric(value), value)

    self.assertEqual(pd.to_numeric('1'), 1)
    self.assertEqual(pd.to_numeric('1.1'), 1.1)

    # a non-numeric scalar: raise / pass through / coerce to NaN
    with tm.assertRaises(ValueError):
        to_numeric('XX', errors='raise')
    self.assertEqual(to_numeric('XX', errors='ignore'), 'XX')
    self.assertTrue(np.isnan(to_numeric('XX', errors='coerce')))

def test_numeric_dtypes(self):
idx = pd.Index([1, 2, 3], name='xxx')
res = pd.to_numeric(idx)
tm.assert_index_equal(res, idx)

res = pd.to_numeric(pd.Series(idx, name='xxx'))
tm.assert_series_equal(res, pd.Series(idx, name='xxx'))

res = pd.to_numeric(idx.values)
tm.assert_numpy_array_equal(res, idx.values)

idx = pd.Index([1., np.nan, 3., np.nan], name='xxx')
res = pd.to_numeric(idx)
tm.assert_index_equal(res, idx)

res = pd.to_numeric(pd.Series(idx, name='xxx'))
tm.assert_series_equal(res, pd.Series(idx, name='xxx'))

res = pd.to_numeric(idx.values)
tm.assert_numpy_array_equal(res, idx.values)

def test_str(self):
idx = pd.Index(['1', '2', '3'], name='xxx')
exp = np.array([1, 2, 3])
res = pd.to_numeric(idx)
tm.assert_index_equal(res, pd.Index(exp, name='xxx'))

res = pd.to_numeric(pd.Series(idx, name='xxx'))
tm.assert_series_equal(res, pd.Series(exp, name='xxx'))

res = pd.to_numeric(idx.values)
tm.assert_numpy_array_equal(res, exp)

idx = pd.Index(['1.5', '2.7', '3.4'], name='xxx')
exp = np.array([1.5, 2.7, 3.4])
res = pd.to_numeric(idx)
tm.assert_index_equal(res, pd.Index(exp, name='xxx'))

res = pd.to_numeric(pd.Series(idx, name='xxx'))
tm.assert_series_equal(res, pd.Series(exp, name='xxx'))

res = pd.to_numeric(idx.values)
tm.assert_numpy_array_equal(res, exp)

def test_datetimelike(self):
    """Datetime input (naive and tz-aware) converts to its int64 epoch-ns values."""
    for tz in (None, 'US/Eastern', 'Asia/Tokyo'):
        idx = pd.date_range('20130101', periods=3, tz=tz, name='xxx')

        tm.assert_index_equal(pd.to_numeric(idx),
                              pd.Index(idx.asi8, name='xxx'))
        tm.assert_series_equal(pd.to_numeric(pd.Series(idx, name='xxx')),
                               pd.Series(idx.asi8, name='xxx'))
        tm.assert_numpy_array_equal(pd.to_numeric(idx.values), idx.asi8)

def test_timedelta(self):
    """Timedelta input converts to its int64 nanosecond representation."""
    idx = pd.timedelta_range('1 days', periods=3, freq='D', name='xxx')

    tm.assert_index_equal(pd.to_numeric(idx),
                          pd.Index(idx.asi8, name='xxx'))
    tm.assert_series_equal(pd.to_numeric(pd.Series(idx, name='xxx')),
                           pd.Series(idx.asi8, name='xxx'))
    tm.assert_numpy_array_equal(pd.to_numeric(idx.values), idx.asi8)

def test_period(self):
idx = pd.period_range('2011-01', periods=3, freq='M', name='xxx')
res = pd.to_numeric(idx)
tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))

# ToDo: enable when we can support native PeriodDtype
# res = pd.to_numeric(pd.Series(idx, name='xxx'))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoided to use is_period_array ATM. we can have faster impl when period dtype is added.

# tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))


if __name__ == '__main__':
    # Run this module's tests under nose: verbose, stop on first failure,
    # and drop into pdb on errors or failures.
    nose_args = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_args, exit=False)
61 changes: 42 additions & 19 deletions pandas/tools/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,29 +78,52 @@ def to_numeric(arg, errors='raise'):
>>> pd.to_numeric(s, errors='ignore')
>>> pd.to_numeric(s, errors='coerce')
"""
is_series = False
is_index = False
is_scalar = False

index = name = None
if isinstance(arg, pd.Series):
index, name = arg.index, arg.name
is_series = True
values = arg.values
elif isinstance(arg, pd.Index):
is_index = True
values = arg.asi8
if values is None:
values = arg.values
elif isinstance(arg, (list, tuple)):
arg = np.array(arg, dtype='O')
values = np.array(arg, dtype='O')
elif np.isscalar(arg):
if com.is_number(arg):
return arg
is_scalar = True
values = np.array([arg], dtype='O')
elif getattr(arg, 'ndim', 1) > 1:
raise TypeError('arg must be a list, tuple, 1-d array, or Series')
else:
values = arg

conv = arg
arg = com._ensure_object(arg)

coerce_numeric = False if errors in ('ignore', 'raise') else True

try:
conv = lib.maybe_convert_numeric(arg,
set(),
coerce_numeric=coerce_numeric)
except:
if errors == 'raise':
raise

if index is not None:
return pd.Series(conv, index=index, name=name)
if com.is_numeric_dtype(values):
pass
elif com.is_datetime_or_timedelta_dtype(values):
values = values.astype(np.int64)
else:
values = com._ensure_object(values)
coerce_numeric = False if errors in ('ignore', 'raise') else True

try:
values = lib.maybe_convert_numeric(values, set(),
coerce_numeric=coerce_numeric)
except:
if errors == 'raise':
raise

if is_series:
return pd.Series(values, index=arg.index, name=arg.name)
elif is_index:
# because we want to coerce to numeric if possible,
# do not use _shallow_copy_with_infer
return Index(values, name=arg.name)
elif is_scalar:
return values[0]
else:
return conv
return values