diff --git a/asv_bench/benchmarks/miscellaneous.py b/asv_bench/benchmarks/miscellaneous.py index fe610ef4cb376..f9d577a2b56d7 100644 --- a/asv_bench/benchmarks/miscellaneous.py +++ b/asv_bench/benchmarks/miscellaneous.py @@ -27,4 +27,26 @@ def prop(self): self.obj = Foo() def time_misc_cache_readonly(self): - self.obj.prop \ No newline at end of file + self.obj.prop + + +class to_numeric(object): + goal_time = 0.2 + + def setup(self): + self.n = 10000 + self.float = Series(np.random.randn(self.n * 100)) + self.numstr = self.float.astype('str') + self.str = Series(tm.makeStringIndex(self.n)) + + def time_from_float(self): + pd.to_numeric(self.float) + + def time_from_numeric_str(self): + pd.to_numeric(self.numstr) + + def time_from_str_ignore(self): + pd.to_numeric(self.str, errors='ignore') + + def time_from_str_coerce(self): + pd.to_numeric(self.str, errors='coerce') diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index dfe5eaa66df01..b29f5a3f0c0be 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -409,6 +409,8 @@ Performance Improvements - Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`) - Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`). 
- Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`) +- Improved performance of ``to_numeric`` when the input already has a numeric dtype (:issue:`12777`) + @@ -516,3 +518,6 @@ Bug Fixes - Bug in ``.describe()`` resets categorical columns information (:issue:`11558`) - Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`) - ``pd.read_excel()`` now accepts column names associated with keyword argument ``names`` (:issue:`12870`) +- Bug in ``to_numeric`` where an ``Index`` input returned ``np.ndarray`` rather than ``Index`` (:issue:`12777`) +- Bug in ``to_numeric`` where datetime-like input may raise ``TypeError`` (:issue:`12777`) +- Bug in ``to_numeric`` where a scalar input raises ``ValueError`` (:issue:`12777`) diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index 8a40f65af869a..de02ff4c7139d 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -2,7 +2,6 @@ import locale import codecs import nose -from nose.tools import assert_raises import numpy as np from numpy.testing import assert_equal @@ -102,9 +101,25 @@ def test_series(self): res = to_numeric(s) tm.assert_series_equal(res, expected) + def test_series_numeric(self): + s = pd.Series([1, 3, 4, 5], index=list('ABCD'), name='XXX') + res = to_numeric(s) + tm.assert_series_equal(res, s) + + s = pd.Series([1., 3., 4., 5.], index=list('ABCD'), name='XXX') + res = to_numeric(s) + tm.assert_series_equal(res, s) + + # bool is regarded as numeric + s = pd.Series([True, False, True, True], + index=list('ABCD'), name='XXX') + res = to_numeric(s) + tm.assert_series_equal(res, s) + def test_error(self): s = pd.Series([1, -3.14, 'apple']) - assert_raises(ValueError, to_numeric, s, errors='raise') + with tm.assertRaises(ValueError): + to_numeric(s, errors='raise') res = to_numeric(s, errors='ignore') expected = pd.Series([1, -3.14, 'apple']) @@ -114,12 +129,40 @@ def test_error(self): expected = 
pd.Series([1, -3.14, np.nan]) tm.assert_series_equal(res, expected) + def test_error_seen_bool(self): + s = pd.Series([True, False, 'apple']) + with tm.assertRaises(ValueError): + to_numeric(s, errors='raise') + + res = to_numeric(s, errors='ignore') + expected = pd.Series([True, False, 'apple']) + tm.assert_series_equal(res, expected) + + # coerces to float + res = to_numeric(s, errors='coerce') + expected = pd.Series([1., 0., np.nan]) + tm.assert_series_equal(res, expected) + def test_list(self): s = ['1', '-3.14', '7'] res = to_numeric(s) expected = np.array([1, -3.14, 7]) tm.assert_numpy_array_equal(res, expected) + def test_list_numeric(self): + s = [1, 3, 4, 5] + res = to_numeric(s) + tm.assert_numpy_array_equal(res, np.array(s)) + + s = [1., 3., 4., 5.] + res = to_numeric(s) + tm.assert_numpy_array_equal(res, np.array(s)) + + # bool is regarded as numeric + s = [True, False, True, True] + res = to_numeric(s) + tm.assert_numpy_array_equal(res, np.array(s)) + def test_numeric(self): s = pd.Series([1, -3.14, 7], dtype='O') res = to_numeric(s) @@ -145,6 +188,96 @@ def test_type_check(self): with tm.assertRaisesRegexp(TypeError, "1-d array"): to_numeric(df, errors=errors) + def test_scalar(self): + self.assertEqual(pd.to_numeric(1), 1) + self.assertEqual(pd.to_numeric(1.1), 1.1) + + self.assertEqual(pd.to_numeric('1'), 1) + self.assertEqual(pd.to_numeric('1.1'), 1.1) + + with tm.assertRaises(ValueError): + to_numeric('XX', errors='raise') + + self.assertEqual(to_numeric('XX', errors='ignore'), 'XX') + self.assertTrue(np.isnan(to_numeric('XX', errors='coerce'))) + + def test_numeric_dtypes(self): + idx = pd.Index([1, 2, 3], name='xxx') + res = pd.to_numeric(idx) + tm.assert_index_equal(res, idx) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(idx, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, idx.values) + + idx = pd.Index([1., np.nan, 3., np.nan], name='xxx') + res = 
pd.to_numeric(idx) + tm.assert_index_equal(res, idx) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(idx, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, idx.values) + + def test_str(self): + idx = pd.Index(['1', '2', '3'], name='xxx') + exp = np.array([1, 2, 3]) + res = pd.to_numeric(idx) + tm.assert_index_equal(res, pd.Index(exp, name='xxx')) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(exp, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, exp) + + idx = pd.Index(['1.5', '2.7', '3.4'], name='xxx') + exp = np.array([1.5, 2.7, 3.4]) + res = pd.to_numeric(idx) + tm.assert_index_equal(res, pd.Index(exp, name='xxx')) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(exp, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, exp) + + def test_datetimelike(self): + for tz in [None, 'US/Eastern', 'Asia/Tokyo']: + idx = pd.date_range('20130101', periods=3, tz=tz, name='xxx') + res = pd.to_numeric(idx) + tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx')) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, idx.asi8) + + def test_timedelta(self): + idx = pd.timedelta_range('1 days', periods=3, freq='D', name='xxx') + res = pd.to_numeric(idx) + tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx')) + + res = pd.to_numeric(pd.Series(idx, name='xxx')) + tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) + + res = pd.to_numeric(idx.values) + tm.assert_numpy_array_equal(res, idx.asi8) + + def test_period(self): + idx = pd.period_range('2011-01', periods=3, freq='M', name='xxx') + res = pd.to_numeric(idx) + tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx')) + + # ToDo: enable when we can 
support native PeriodDtype + # res = pd.to_numeric(pd.Series(idx, name='xxx')) + # tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index cef5dad72e50b..61d2c0adce2fe 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -78,29 +78,52 @@ def to_numeric(arg, errors='raise'): >>> pd.to_numeric(s, errors='ignore') >>> pd.to_numeric(s, errors='coerce') """ + is_series = False + is_index = False + is_scalar = False - index = name = None if isinstance(arg, pd.Series): - index, name = arg.index, arg.name + is_series = True + values = arg.values + elif isinstance(arg, pd.Index): + is_index = True + values = arg.asi8 + if values is None: + values = arg.values elif isinstance(arg, (list, tuple)): - arg = np.array(arg, dtype='O') + values = np.array(arg, dtype='O') + elif np.isscalar(arg): + if com.is_number(arg): + return arg + is_scalar = True + values = np.array([arg], dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a list, tuple, 1-d array, or Series') + else: + values = arg - conv = arg - arg = com._ensure_object(arg) - - coerce_numeric = False if errors in ('ignore', 'raise') else True - - try: - conv = lib.maybe_convert_numeric(arg, - set(), - coerce_numeric=coerce_numeric) - except: - if errors == 'raise': - raise - - if index is not None: - return pd.Series(conv, index=index, name=name) + if com.is_numeric_dtype(values): + pass + elif com.is_datetime_or_timedelta_dtype(values): + values = values.astype(np.int64) + else: + values = com._ensure_object(values) + coerce_numeric = False if errors in ('ignore', 'raise') else True + + try: + values = lib.maybe_convert_numeric(values, set(), + coerce_numeric=coerce_numeric) + except: + if errors == 'raise': + raise + + if is_series: + return pd.Series(values, index=arg.index, name=arg.name) + elif is_index: + 
# because we want to coerce to numeric if possible, + # do not use _shallow_copy_with_infer + return Index(values, name=arg.name) + elif is_scalar: + return values[0] else: - return conv + return values