Skip to content

Commit 6f2cbd7

Browse files
committed
PERF: to_numeric for numeric dtypes
1 parent b56cea2 commit 6f2cbd7

File tree

4 files changed

+205
-22
lines changed

4 files changed

+205
-22
lines changed

asv_bench/benchmarks/miscellaneous.py

+23-1
Original file line number | Diff line number | Diff line change
@@ -27,4 +27,26 @@ def prop(self):
2727
self.obj = Foo()
2828

2929
def time_misc_cache_readonly(self):
30-
self.obj.prop
30+
self.obj.prop
31+
32+
33+
class to_numeric(object):
34+
goal_time = 0.2
35+
36+
def setup(self):
37+
self.n = 10000
38+
self.float = Series(np.random.randn(self.n * 100))
39+
self.numstr = self.float.astype('str')
40+
self.str = Series(tm.makeStringIndex(self.n))
41+
42+
def time_from_float(self):
43+
pd.to_numeric(self.float)
44+
45+
def time_from_numeric_str(self):
46+
pd.to_numeric(self.numstr)
47+
48+
def time_from_str_ignore(self):
49+
pd.to_numeric(self.str, errors='ignore')
50+
51+
def time_from_str_coerce(self):
52+
pd.to_numeric(self.str, errors='coerce')

doc/source/whatsnew/v0.18.1.txt

+5
Original file line number | Diff line number | Diff line change
@@ -409,6 +409,8 @@ Performance Improvements
409409
- Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`)
410410
- Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`).
411411
- Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`)
412+
- Improved performance of ``to_numeric`` when the input already has a numeric dtype (:issue:`12777`)
413+
412414

413415

414416

@@ -516,3 +518,6 @@ Bug Fixes
516518
- Bug in ``.describe()`` where categorical column information was reset (:issue:`11558`)
517519
- Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
518520
- ``pd.read_excel()`` now accepts column names associated with keyword argument ``names`` (:issue:`12870`)
521+
- Bug in ``to_numeric`` where an ``Index`` input returned an ``np.ndarray`` rather than an ``Index`` (:issue:`12777`)
522+
- Bug in ``to_numeric`` where datetime-like input could raise ``TypeError`` (:issue:`12777`)
523+
- Bug in ``to_numeric`` where a scalar input raised ``ValueError`` (:issue:`12777`)

pandas/tools/tests/test_util.py

+135-2
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
import locale
33
import codecs
44
import nose
5-
from nose.tools import assert_raises
65

76
import numpy as np
87
from numpy.testing import assert_equal
@@ -102,9 +101,25 @@ def test_series(self):
102101
res = to_numeric(s)
103102
tm.assert_series_equal(res, expected)
104103

104+
def test_series_numeric(self):
105+
s = pd.Series([1, 3, 4, 5], index=list('ABCD'), name='XXX')
106+
res = to_numeric(s)
107+
tm.assert_series_equal(res, s)
108+
109+
s = pd.Series([1., 3., 4., 5.], index=list('ABCD'), name='XXX')
110+
res = to_numeric(s)
111+
tm.assert_series_equal(res, s)
112+
113+
# bool is regarded as numeric
114+
s = pd.Series([True, False, True, True],
115+
index=list('ABCD'), name='XXX')
116+
res = to_numeric(s)
117+
tm.assert_series_equal(res, s)
118+
105119
def test_error(self):
106120
s = pd.Series([1, -3.14, 'apple'])
107-
assert_raises(ValueError, to_numeric, s, errors='raise')
121+
with tm.assertRaises(ValueError):
122+
to_numeric(s, errors='raise')
108123

109124
res = to_numeric(s, errors='ignore')
110125
expected = pd.Series([1, -3.14, 'apple'])
@@ -114,12 +129,40 @@ def test_error(self):
114129
expected = pd.Series([1, -3.14, np.nan])
115130
tm.assert_series_equal(res, expected)
116131

132+
def test_error_seen_bool(self):
133+
s = pd.Series([True, False, 'apple'])
134+
with tm.assertRaises(ValueError):
135+
to_numeric(s, errors='raise')
136+
137+
res = to_numeric(s, errors='ignore')
138+
expected = pd.Series([True, False, 'apple'])
139+
tm.assert_series_equal(res, expected)
140+
141+
# coerces to float
142+
res = to_numeric(s, errors='coerce')
143+
expected = pd.Series([1., 0., np.nan])
144+
tm.assert_series_equal(res, expected)
145+
117146
def test_list(self):
118147
s = ['1', '-3.14', '7']
119148
res = to_numeric(s)
120149
expected = np.array([1, -3.14, 7])
121150
tm.assert_numpy_array_equal(res, expected)
122151

152+
def test_list_numeric(self):
153+
s = [1, 3, 4, 5]
154+
res = to_numeric(s)
155+
tm.assert_numpy_array_equal(res, np.array(s))
156+
157+
s = [1., 3., 4., 5.]
158+
res = to_numeric(s)
159+
tm.assert_numpy_array_equal(res, np.array(s))
160+
161+
# bool is regarded as numeric
162+
s = [True, False, True, True]
163+
res = to_numeric(s)
164+
tm.assert_numpy_array_equal(res, np.array(s))
165+
123166
def test_numeric(self):
124167
s = pd.Series([1, -3.14, 7], dtype='O')
125168
res = to_numeric(s)
@@ -145,6 +188,96 @@ def test_type_check(self):
145188
with tm.assertRaisesRegexp(TypeError, "1-d array"):
146189
to_numeric(df, errors=errors)
147190

191+
def test_scalar(self):
192+
self.assertEqual(pd.to_numeric(1), 1)
193+
self.assertEqual(pd.to_numeric(1.1), 1.1)
194+
195+
self.assertEqual(pd.to_numeric('1'), 1)
196+
self.assertEqual(pd.to_numeric('1.1'), 1.1)
197+
198+
with tm.assertRaises(ValueError):
199+
to_numeric('XX', errors='raise')
200+
201+
self.assertEqual(to_numeric('XX', errors='ignore'), 'XX')
202+
self.assertTrue(np.isnan(to_numeric('XX', errors='coerce')))
203+
204+
def test_numeric_dtypes(self):
205+
idx = pd.Index([1, 2, 3], name='xxx')
206+
res = pd.to_numeric(idx)
207+
tm.assert_index_equal(res, idx)
208+
209+
res = pd.to_numeric(pd.Series(idx, name='xxx'))
210+
tm.assert_series_equal(res, pd.Series(idx, name='xxx'))
211+
212+
res = pd.to_numeric(idx.values)
213+
tm.assert_numpy_array_equal(res, idx.values)
214+
215+
idx = pd.Index([1., np.nan, 3., np.nan], name='xxx')
216+
res = pd.to_numeric(idx)
217+
tm.assert_index_equal(res, idx)
218+
219+
res = pd.to_numeric(pd.Series(idx, name='xxx'))
220+
tm.assert_series_equal(res, pd.Series(idx, name='xxx'))
221+
222+
res = pd.to_numeric(idx.values)
223+
tm.assert_numpy_array_equal(res, idx.values)
224+
225+
def test_str(self):
226+
idx = pd.Index(['1', '2', '3'], name='xxx')
227+
exp = np.array([1, 2, 3])
228+
res = pd.to_numeric(idx)
229+
tm.assert_index_equal(res, pd.Index(exp, name='xxx'))
230+
231+
res = pd.to_numeric(pd.Series(idx, name='xxx'))
232+
tm.assert_series_equal(res, pd.Series(exp, name='xxx'))
233+
234+
res = pd.to_numeric(idx.values)
235+
tm.assert_numpy_array_equal(res, exp)
236+
237+
idx = pd.Index(['1.5', '2.7', '3.4'], name='xxx')
238+
exp = np.array([1.5, 2.7, 3.4])
239+
res = pd.to_numeric(idx)
240+
tm.assert_index_equal(res, pd.Index(exp, name='xxx'))
241+
242+
res = pd.to_numeric(pd.Series(idx, name='xxx'))
243+
tm.assert_series_equal(res, pd.Series(exp, name='xxx'))
244+
245+
res = pd.to_numeric(idx.values)
246+
tm.assert_numpy_array_equal(res, exp)
247+
248+
def test_datetimelike(self):
249+
for tz in [None, 'US/Eastern', 'Asia/Tokyo']:
250+
idx = pd.date_range('20130101', periods=3, tz=tz, name='xxx')
251+
res = pd.to_numeric(idx)
252+
tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))
253+
254+
res = pd.to_numeric(pd.Series(idx, name='xxx'))
255+
tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))
256+
257+
res = pd.to_numeric(idx.values)
258+
tm.assert_numpy_array_equal(res, idx.asi8)
259+
260+
def test_timedelta(self):
261+
idx = pd.timedelta_range('1 days', periods=3, freq='D', name='xxx')
262+
res = pd.to_numeric(idx)
263+
tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))
264+
265+
res = pd.to_numeric(pd.Series(idx, name='xxx'))
266+
tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))
267+
268+
res = pd.to_numeric(idx.values)
269+
tm.assert_numpy_array_equal(res, idx.asi8)
270+
271+
def test_period(self):
272+
idx = pd.period_range('2011-01', periods=3, freq='M', name='xxx')
273+
res = pd.to_numeric(idx)
274+
tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))
275+
276+
# ToDo: enable when we can support native PeriodDtype
277+
# res = pd.to_numeric(pd.Series(idx, name='xxx'))
278+
# tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))
279+
280+
148281
if __name__ == '__main__':
149282
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
150283
exit=False)

pandas/tools/util.py

+42-19
Original file line number | Diff line number | Diff line change
@@ -78,29 +78,52 @@ def to_numeric(arg, errors='raise'):
7878
>>> pd.to_numeric(s, errors='ignore')
7979
>>> pd.to_numeric(s, errors='coerce')
8080
"""
81+
is_series = False
82+
is_index = False
83+
is_scalar = False
8184

82-
index = name = None
8385
if isinstance(arg, pd.Series):
84-
index, name = arg.index, arg.name
86+
is_series = True
87+
values = arg.values
88+
elif isinstance(arg, pd.Index):
89+
is_index = True
90+
values = arg.asi8
91+
if values is None:
92+
values = arg.values
8593
elif isinstance(arg, (list, tuple)):
86-
arg = np.array(arg, dtype='O')
94+
values = np.array(arg, dtype='O')
95+
elif np.isscalar(arg):
96+
if com.is_number(arg):
97+
return arg
98+
is_scalar = True
99+
values = np.array([arg], dtype='O')
87100
elif getattr(arg, 'ndim', 1) > 1:
88101
raise TypeError('arg must be a list, tuple, 1-d array, or Series')
102+
else:
103+
values = arg
89104

90-
conv = arg
91-
arg = com._ensure_object(arg)
92-
93-
coerce_numeric = False if errors in ('ignore', 'raise') else True
94-
95-
try:
96-
conv = lib.maybe_convert_numeric(arg,
97-
set(),
98-
coerce_numeric=coerce_numeric)
99-
except:
100-
if errors == 'raise':
101-
raise
102-
103-
if index is not None:
104-
return pd.Series(conv, index=index, name=name)
105+
if com.is_numeric_dtype(values):
106+
pass
107+
elif com.is_datetime_or_timedelta_dtype(values):
108+
values = values.astype(np.int64)
109+
else:
110+
values = com._ensure_object(values)
111+
coerce_numeric = False if errors in ('ignore', 'raise') else True
112+
113+
try:
114+
values = lib.maybe_convert_numeric(values, set(),
115+
coerce_numeric=coerce_numeric)
116+
except:
117+
if errors == 'raise':
118+
raise
119+
120+
if is_series:
121+
return pd.Series(values, index=arg.index, name=arg.name)
122+
elif is_index:
123+
# because we want to coerce to numeric if possible,
124+
# do not use _shallow_copy_with_infer
125+
return Index(values, name=arg.name)
126+
elif is_scalar:
127+
return values[0]
105128
else:
106-
return conv
129+
return values

0 commit comments

Comments
 (0)