From 4758dccce15b60b4ccb5a2c2ca06250af6e55c0d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 10 Jun 2016 23:34:00 +0100 Subject: [PATCH] ENH: add 'downcast' to pd.to_numeric Closes gh-13352. --- asv_bench/benchmarks/inference.py | 21 +++++- doc/source/basics.rst | 102 ++++++++++++++++++++++------- doc/source/whatsnew/v0.19.0.txt | 7 ++ pandas/tools/tests/test_util.py | 77 ++++++++++++++++++++++ pandas/tools/util.py | 103 ++++++++++++++++++++++++++---- 5 files changed, 273 insertions(+), 37 deletions(-) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 3fceed087facb..6809c351beade 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -135,4 +135,23 @@ def setup(self): self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) def time_dtype_infer_uint32(self): - (self.df_uint32['A'] + self.df_uint32['B']) \ No newline at end of file + (self.df_uint32['A'] + self.df_uint32['B']) + + +class to_numeric(object): + N = 500000 + + param_names = ['data', 'downcast'] + params = [ + [(['1'] * N / 2) + ([2] * N / 2), + (['-1'] * N / 2) + ([2] * N / 2), + np.repeat(np.array('1970-01-01', '1970-01-02', + dtype='datetime64[D]'), N), + (['1.1'] * N / 2) + ([2] * N / 2), + ([1] * N / 2) + ([2] * N / 2), + np.repeat(np.int32(1), N)], + [None, 'integer', 'signed', 'unsigned', 'float'], + ] + + def time_to_numeric(self, data, downcast): + pd.to_numeric(data, downcast=downcast) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 8145e9536a82a..203ff6a2ef2f6 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1754,39 +1754,93 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype` object conversion ~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.convert_objects` is a method to try to force conversion of types from the ``object`` dtype to other types. -To force conversion of specific types that are *number like*, e.g. could be a string that represents a number, -pass ``convert_numeric=True``. This will force strings and numbers alike to be numbers if possible, otherwise -they will be set to ``np.nan``. +pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types. +The following functions are available for one dimensional object arrays or scalars: + + 1) :meth:`~pandas.to_datetime` (conversion to datetime objects) + + .. ipython:: python + + import datetime + m = ['2016-07-09', datetime.datetime(2016, 3, 2)] + pd.to_datetime(m) + + 2) :meth:`~pandas.to_numeric` (conversion to numeric dtypes) + + .. ipython:: python + + m = ['1.1', 2, 3] + pd.to_numeric(m) + + 3) :meth:`~pandas.to_timedelta` (conversion to timedelta objects) + + .. ipython:: python + + m = ['5us', pd.Timedelta('1day')] + pd.to_timedelta(m) + +To force a conversion, we can pass in an ``errors`` argument, which specifies how pandas should deal with elements +that cannot be converted to desired dtype or object. By default, ``errors='raise'``, meaning that any errors encountered +will be raised during the conversion process. However, if ``errors='coerce'``, these errors will be ignored and pandas +will convert problematic elements to ``pd.NaT`` (for datetime and timedelta) or ``np.nan`` (for numeric). This might be +useful if you are reading in data which is mostly of the desired dtype (e.g. numeric, datetime), but occasionally has +non-conforming elements intermixed that you want to represent as missing: .. ipython:: python - :okwarning: - df3['D'] = '1.' - df3['E'] = '1' - df3.convert_objects(convert_numeric=True).dtypes + import datetime + m = ['apple', datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors='coerce') - # same, but specific dtype conversion - df3['D'] = df3['D'].astype('float16') - df3['E'] = df3['E'].astype('int32') - df3.dtypes + m = ['apple', 2, 3] + pd.to_numeric(m, errors='coerce') + + m = ['apple', pd.Timedelta('1day')] + pd.to_timedelta(m, errors='coerce') -To force conversion to ``datetime64[ns]``, pass ``convert_dates='coerce'``. -This will convert any datetime-like object to dates, forcing other values to ``NaT``. -This might be useful if you are reading in data which is mostly dates, -but occasionally has non-dates intermixed and you want to represent as missing. +The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it +encounters any errors with the conversion to a desired data type: .. ipython:: python - import datetime - s = pd.Series([datetime.datetime(2001,1,1,0,0), - 'foo', 1.0, 1, pd.Timestamp('20010104'), - '20010105'], dtype='O') - s - pd.to_datetime(s, errors='coerce') + import datetime + m = ['apple', datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors='ignore') + + m = ['apple', 2, 3] + pd.to_numeric(m, errors='ignore') + + m = ['apple', pd.Timedelta('1day')] + pd.to_timedelta(m, errors='ignore') + +In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument `downcast`, which gives the +option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory: + +.. ipython:: python + + m = ['1', 2, 3] + pd.to_numeric(m, downcast='integer') # smallest signed int dtype + pd.to_numeric(m, downcast='signed') # same as 'integer' + pd.to_numeric(m, downcast='unsigned') # smallest unsigned int dtype + pd.to_numeric(m, downcast='float') # smallest float dtype + +As these methods apply only to one-dimensional arrays, they cannot be used directly on multi-dimensional objects such +as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the function over all elements: -In addition, :meth:`~DataFrame.convert_objects` will attempt the *soft* conversion of any *object* dtypes, meaning that if all -the objects in a Series are of the same type, the Series will have that dtype. +.. ipython:: python + + import datetime + df = pd.DataFrame([['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O') + df + df.apply(pd.to_datetime) + + df = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O') + df + df.apply(pd.to_numeric) + + df = pd.DataFrame([['5us', pd.Timedelta('1day')]] * 2, dtype='O') + df + df.apply(pd.to_timedelta) gotchas ~~~~~~~ diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 657de7ec26efc..3a6a0b6213a9c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -186,6 +186,13 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) +- ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to smallest specified numerical dtype (:issue:`13352`) + + .. ipython:: python + + s = ['1', 2, 3] + pd.to_numeric(s, downcast='unsigned') + pd.to_numeric(s, downcast='integer') - ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here ` (:issue:`10008`, :issue:`13156`) - ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index c592b33bdab9a..5b738086a1ad4 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -291,6 +291,83 @@ def test_non_hashable(self): with self.assertRaisesRegexp(TypeError, "Invalid object type"): pd.to_numeric(s) + def test_downcast(self): + # see gh-13352 + mixed_data = ['1', 2, 3] + int_data = [1, 2, 3] + date_data = np.array(['1970-01-02', '1970-01-03', + '1970-01-04'], dtype='datetime64[D]') + + invalid_downcast = 'unsigned-integer' + msg = 'invalid downcasting method provided' + + smallest_int_dtype = np.dtype(np.typecodes['Integer'][0]) + smallest_uint_dtype = np.dtype(np.typecodes['UnsignedInteger'][0]) + + # support below np.float32 is rare and far between + float_32_char = np.dtype(np.float32).char + smallest_float_dtype = float_32_char + + for data in (mixed_data, int_data, date_data): + with self.assertRaisesRegexp(ValueError, msg): + pd.to_numeric(data, downcast=invalid_downcast) + + expected = np.array([1, 2, 3], dtype=np.int64) + + res = pd.to_numeric(data) + tm.assert_numpy_array_equal(res, expected) + + res = pd.to_numeric(data, downcast=None) + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_int_dtype) + + for signed_downcast in ('integer', 'signed'): + res = pd.to_numeric(data, downcast=signed_downcast) + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_uint_dtype) + res = pd.to_numeric(data, downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_float_dtype) + res = pd.to_numeric(data, downcast='float') + tm.assert_numpy_array_equal(res, expected) + + # if we can't successfully cast the given + # data to a numeric dtype, do not bother + # with the downcast parameter + data = ['foo', 2, 3] + expected = np.array(data, dtype=object) + res = pd.to_numeric(data, errors='ignore', + downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + # cannot cast to an unsigned integer because + # we have a negative number + data = ['-1', 2, 3] + expected = np.array([-1, 2, 3], dtype=np.int64) + res = pd.to_numeric(data, downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + # cannot cast to an integer (signed or unsigned) + # because we have a float number + data = ['1.1', 2, 3] + expected = np.array([1.1, 2, 3], dtype=np.float64) + + for downcast in ('integer', 'signed', 'unsigned'): + res = pd.to_numeric(data, downcast=downcast) + tm.assert_numpy_array_equal(res, expected) + + # the smallest integer dtype need not be np.(u)int8 + data = ['256', 257, 258] + + for downcast, expected_dtype in zip( + ['integer', 'signed', 'unsigned'], + [np.int16, np.int16, np.uint16]): + expected = np.array([256, 257, 258], dtype=expected_dtype) + res = pd.to_numeric(data, downcast=downcast) + tm.assert_numpy_array_equal(res, expected) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 61d2c0adce2fe..d70904e1bf286 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -50,7 +50,7 @@ def compose(*funcs): return reduce(_compose2, funcs) -def to_numeric(arg, errors='raise'): +def to_numeric(arg, errors='raise', downcast=None): """ Convert argument to a numeric type. @@ -61,6 +61,27 @@ def to_numeric(arg, errors='raise'): - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input + downcast : {'integer', 'signed', 'unsigned', 'float'} , default None + If not None, and if the data has been successfully cast to a + numerical dtype (or if the data was numeric to begin with), + downcast that resulting data to the smallest numerical dtype + possible according to the following rules: + + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'unsigned': smallest unsigned int dtype (min.: np.uint8) + - 'float': smallest float dtype (min.: np.float32) + + As this behaviour is separate from the core conversion to + numeric values, any errors raised during the downcasting + will be surfaced regardless of the value of the 'errors' input. + + In addition, downcasting will only occur if the size + of the resulting data's dtype is strictly larger than + the dtype it is to be cast to, so if none of the dtypes + checked satisfy that specification, no downcasting will be + performed on the data. + + .. versionadded:: 0.19.0 Returns ------- @@ -74,10 +95,37 @@ def to_numeric(arg, errors='raise'): >>> import pandas as pd >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float64 + >>> pd.to_numeric(s, downcast='float') + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float32 + >>> pd.to_numeric(s, downcast='signed') + 0 1 + 1 2 + 2 -3 + dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') + 0 apple + 1 1.0 + 2 2 + 3 -3 + dtype: object >>> pd.to_numeric(s, errors='coerce') + 0 NaN + 1 1.0 + 2 2.0 + 3 -3.0 + dtype: float64 """ + if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): + raise ValueError('invalid downcasting method provided') + is_series = False is_index = False is_scalar = False @@ -102,20 +150,51 @@ def to_numeric(arg, errors='raise'): else: values = arg - if com.is_numeric_dtype(values): - pass - elif com.is_datetime_or_timedelta_dtype(values): - values = values.astype(np.int64) - else: - values = com._ensure_object(values) - coerce_numeric = False if errors in ('ignore', 'raise') else True + try: + if com.is_numeric_dtype(values): + pass + elif com.is_datetime_or_timedelta_dtype(values): + values = values.astype(np.int64) + else: + values = com._ensure_object(values) + coerce_numeric = False if errors in ('ignore', 'raise') else True - try: values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) - except: - if errors == 'raise': - raise + + except Exception: + if errors == 'raise': + raise + + # attempt downcast only if the data has been successfully converted + # to a numerical dtype and if a downcast method has been specified + if downcast is not None and com.is_numeric_dtype(values): + typecodes = None + + if downcast in ('integer', 'signed'): + typecodes = np.typecodes['Integer'] + elif downcast == 'unsigned' and np.min(values) > 0: + typecodes = np.typecodes['UnsignedInteger'] + elif downcast == 'float': + typecodes = np.typecodes['Float'] + + # pandas support goes only to np.float32, + # as float dtypes smaller than that are + # extremely rare and not well supported + float_32_char = np.dtype(np.float32).char + float_32_ind = typecodes.index(float_32_char) + typecodes = typecodes[float_32_ind:] + + if typecodes is not None: + # from smallest to largest + for dtype in typecodes: + if np.dtype(dtype).itemsize < values.dtype.itemsize: + values = com._possibly_downcast_to_dtype( + values, dtype) + + # successful conversion + if values.dtype == dtype: + break if is_series: return pd.Series(values, index=arg.index, name=arg.name)