ENH: add 'downcast' to pd.to_numeric

gfyoung · gfyoung · commit 4758dccce15b · 2016-07-09T17:54:26.000-04:00
Closes gh-13352.
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
@@ -135,4 +135,23 @@ def setup(self):
         self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
 
     def time_dtype_infer_uint32(self):
-        (self.df_uint32['A'] + self.df_uint32['B'])
+        (self.df_uint32['A'] + self.df_uint32['B'])
+
+
+class to_numeric(object):
+    N = 500000
+    # halves use N // 2: "lst * N / 2" would evaluate as (lst * N) / 2
+    param_names = ['data', 'downcast']
+    params = [
+        [(['1'] * (N // 2)) + ([2] * (N // 2)),
+         (['-1'] * (N // 2)) + ([2] * (N // 2)),
+         np.repeat(np.array(['1970-01-01', '1970-01-02'],
+                            dtype='datetime64[D]'), N),
+         (['1.1'] * (N // 2)) + ([2] * (N // 2)),
+         ([1] * (N // 2)) + ([2] * (N // 2)),
+         np.repeat(np.int32(1), N)],
+        [None, 'integer', 'signed', 'unsigned', 'float'],
+    ]
+
+    def time_to_numeric(self, data, downcast):
+        pd.to_numeric(data, downcast=downcast)
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1754,39 +1754,93 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`
 object conversion
 ~~~~~~~~~~~~~~~~~
 
-:meth:`~DataFrame.convert_objects` is a method to try to force conversion of types from the ``object`` dtype to other types.
-To force conversion of specific types that are *number like*, e.g. could be a string that represents a number,
-pass ``convert_numeric=True``. This will force strings and numbers alike to be numbers if possible, otherwise
-they will be set to ``np.nan``.
+pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types.
+The following functions are available for one-dimensional object arrays or scalars:
+
+ 1) :meth:`~pandas.to_datetime` (conversion to datetime objects)
+
+ .. ipython:: python
+
+    import datetime
+    m = ['2016-07-09', datetime.datetime(2016, 3, 2)]
+    pd.to_datetime(m)
+
+ 2) :meth:`~pandas.to_numeric` (conversion to numeric dtypes)
+
+ .. ipython:: python
+
+    m = ['1.1', 2, 3]
+    pd.to_numeric(m)
+
+ 3) :meth:`~pandas.to_timedelta` (conversion to timedelta objects)
+
+ .. ipython:: python
+
+    m = ['5us', pd.Timedelta('1day')]
+    pd.to_timedelta(m)
+
+To force a conversion, we can pass in an ``errors`` argument, which specifies how pandas should deal with elements
+that cannot be converted to the desired dtype or object. By default, ``errors='raise'``, meaning that any errors encountered
+will be raised during the conversion process. However, if ``errors='coerce'``, these errors will be ignored and pandas
+will convert problematic elements to ``pd.NaT`` (for datetime and timedelta) or ``np.nan`` (for numeric). This might be
+useful if you are reading in data which is mostly of the desired dtype (e.g. numeric, datetime), but occasionally has
+non-conforming elements intermixed that you want to represent as missing:
 
 .. ipython:: python
-   :okwarning:
 
-   df3['D'] = '1.'
-   df3['E'] = '1'
-   df3.convert_objects(convert_numeric=True).dtypes
+    import datetime
+    m = ['apple', datetime.datetime(2016, 3, 2)]
+    pd.to_datetime(m, errors='coerce')
 
-   # same, but specific dtype conversion
-   df3['D'] = df3['D'].astype('float16')
-   df3['E'] = df3['E'].astype('int32')
-   df3.dtypes
+    m = ['apple', 2, 3]
+    pd.to_numeric(m, errors='coerce')
+
+    m = ['apple', pd.Timedelta('1day')]
+    pd.to_timedelta(m, errors='coerce')
 
-To force conversion to ``datetime64[ns]``, pass ``convert_dates='coerce'``.
-This will convert any datetime-like object to dates, forcing other values to ``NaT``.
-This might be useful if you are reading in data which is mostly dates,
-but occasionally has non-dates intermixed and you want to represent as missing.
+The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it
+encounters any errors with the conversion to a desired data type:
 
 .. ipython:: python
 
-   import datetime
-   s = pd.Series([datetime.datetime(2001,1,1,0,0),
-                 'foo', 1.0, 1, pd.Timestamp('20010104'),
-                 '20010105'], dtype='O')
-   s
-   pd.to_datetime(s, errors='coerce')
+    import datetime
+    m = ['apple', datetime.datetime(2016, 3, 2)]
+    pd.to_datetime(m, errors='ignore')
+
+    m = ['apple', 2, 3]
+    pd.to_numeric(m, errors='ignore')
+
+    m = ['apple', pd.Timedelta('1day')]
+    pd.to_timedelta(m, errors='ignore')
+
+In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument ``downcast``, which gives the
+option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory:
+
+.. ipython:: python
+
+    m = ['1', 2, 3]
+    pd.to_numeric(m, downcast='integer')   # smallest signed int dtype
+    pd.to_numeric(m, downcast='signed')    # same as 'integer'
+    pd.to_numeric(m, downcast='unsigned')  # smallest unsigned int dtype
+    pd.to_numeric(m, downcast='float')     # smallest float dtype
+
+As these methods apply only to one-dimensional arrays, they cannot be used directly on multi-dimensional objects such
+as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the function over each column:
 
-In addition, :meth:`~DataFrame.convert_objects` will attempt the *soft* conversion of any *object* dtypes, meaning that if all
-the objects in a Series are of the same type, the Series will have that dtype.
+.. ipython:: python
+
+    import datetime
+    df = pd.DataFrame([['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O')
+    df
+    df.apply(pd.to_datetime)
+
+    df = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O')
+    df
+    df.apply(pd.to_numeric)
+
+    df = pd.DataFrame([['5us', pd.Timedelta('1day')]] * 2, dtype='O')
+    df
+    df.apply(pd.to_timedelta)
 
 gotchas
 ~~~~~~~
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -186,6 +186,13 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
 - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`)
+- ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to the smallest specified numerical dtype (:issue:`13352`)
+
+  .. ipython:: python
+
+     s = ['1', 2, 3]
+     pd.to_numeric(s, downcast='unsigned')
+     pd.to_numeric(s, downcast='integer')
 
 - ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here <text.extractall>` (:issue:`10008`, :issue:`13156`)
 - ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`)
diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py
@@ -291,6 +291,83 @@ def test_non_hashable(self):
         with self.assertRaisesRegexp(TypeError, "Invalid object type"):
             pd.to_numeric(s)
 
+    def test_downcast(self):
+        # see gh-13352
+        mixed_data = ['1', 2, 3]
+        int_data = [1, 2, 3]
+        date_data = np.array(['1970-01-02', '1970-01-03',
+                              '1970-01-04'], dtype='datetime64[D]')
+
+        invalid_downcast = 'unsigned-integer'
+        msg = 'invalid downcasting method provided'
+
+        smallest_int_dtype = np.dtype(np.typecodes['Integer'][0])
+        smallest_uint_dtype = np.dtype(np.typecodes['UnsignedInteger'][0])
+
+        # float dtypes below np.float32 are rare, so float32 is the floor
+        float_32_char = np.dtype(np.float32).char
+        smallest_float_dtype = float_32_char
+
+        for data in (mixed_data, int_data, date_data):
+            with self.assertRaisesRegexp(ValueError, msg):
+                pd.to_numeric(data, downcast=invalid_downcast)
+
+            expected = np.array([1, 2, 3], dtype=np.int64)
+
+            res = pd.to_numeric(data)
+            tm.assert_numpy_array_equal(res, expected)
+
+            res = pd.to_numeric(data, downcast=None)
+            tm.assert_numpy_array_equal(res, expected)
+
+            expected = np.array([1, 2, 3], dtype=smallest_int_dtype)
+
+            for signed_downcast in ('integer', 'signed'):
+                res = pd.to_numeric(data, downcast=signed_downcast)
+                tm.assert_numpy_array_equal(res, expected)
+
+            expected = np.array([1, 2, 3], dtype=smallest_uint_dtype)
+            res = pd.to_numeric(data, downcast='unsigned')
+            tm.assert_numpy_array_equal(res, expected)
+
+            expected = np.array([1, 2, 3], dtype=smallest_float_dtype)
+            res = pd.to_numeric(data, downcast='float')
+            tm.assert_numpy_array_equal(res, expected)
+
+        # if the data cannot be cast to a numeric dtype (here
+        # errors='ignore' returns the input unchanged), the
+        # downcast parameter has no effect
+        data = ['foo', 2, 3]
+        expected = np.array(data, dtype=object)
+        res = pd.to_numeric(data, errors='ignore',
+                            downcast='unsigned')
+        tm.assert_numpy_array_equal(res, expected)
+
+        # a negative number cannot be cast to an unsigned
+        # integer, so the result stays int64
+        data = ['-1', 2, 3]
+        expected = np.array([-1, 2, 3], dtype=np.int64)
+        res = pd.to_numeric(data, downcast='unsigned')
+        tm.assert_numpy_array_equal(res, expected)
+
+        # a non-integral float cannot be cast to an integer
+        # (signed or unsigned), so the result stays float64
+        data = ['1.1', 2, 3]
+        expected = np.array([1.1, 2, 3], dtype=np.float64)
+
+        for downcast in ('integer', 'signed', 'unsigned'):
+            res = pd.to_numeric(data, downcast=downcast)
+            tm.assert_numpy_array_equal(res, expected)
+
+        # values above 255 do not fit in np.(u)int8, so expect np.(u)int16
+        data = ['256', 257, 258]
+
+        for downcast, expected_dtype in zip(
+                ['integer', 'signed', 'unsigned'],
+                [np.int16, np.int16, np.uint16]):
+            expected = np.array([256, 257, 258], dtype=expected_dtype)
+            res = pd.to_numeric(data, downcast=downcast)
+            tm.assert_numpy_array_equal(res, expected)
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tools/util.py b/pandas/tools/util.py
@@ -50,7 +50,7 @@ def compose(*funcs):
     return reduce(_compose2, funcs)
 
 
-def to_numeric(arg, errors='raise'):
+def to_numeric(arg, errors='raise', downcast=None):
     """
     Convert argument to a numeric type.
 
@@ -61,6 +61,27 @@ def to_numeric(arg, errors='raise'):
         - If 'raise', then invalid parsing will raise an exception
         - If 'coerce', then invalid parsing will be set as NaN
         - If 'ignore', then invalid parsing will return the input
+    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
+        If not None, and if the data has been successfully cast to a
+        numerical dtype (or if the data was numeric to begin with),
+        downcast that resulting data to the smallest numerical dtype
+        possible according to the following rules:
+
+        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
+        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
+        - 'float': smallest float dtype (min.: np.float32)
+
+        As this behaviour is separate from the core conversion to
+        numeric values, any errors raised during the downcasting
+        will be surfaced regardless of the value of the 'errors' input.
+
+        In addition, downcasting will only occur if the size
+        of the resulting data's dtype is strictly larger than
+        the dtype it is to be cast to, so if none of the dtypes
+        checked satisfy that specification, no downcasting will be
+        performed on the data.
+
+        .. versionadded:: 0.19.0
 
     Returns
     -------
@@ -74,10 +95,37 @@ def to_numeric(arg, errors='raise'):
     >>> import pandas as pd
     >>> s = pd.Series(['1.0', '2', -3])
     >>> pd.to_numeric(s)
+    0    1.0
+    1    2.0
+    2   -3.0
+    dtype: float64
+    >>> pd.to_numeric(s, downcast='float')
+    0    1.0
+    1    2.0
+    2   -3.0
+    dtype: float32
+    >>> pd.to_numeric(s, downcast='signed')
+    0    1
+    1    2
+    2   -3
+    dtype: int8
     >>> s = pd.Series(['apple', '1.0', '2', -3])
     >>> pd.to_numeric(s, errors='ignore')
+    0    apple
+    1      1.0
+    2        2
+    3       -3
+    dtype: object
     >>> pd.to_numeric(s, errors='coerce')
+    0    NaN
+    1    1.0
+    2    2.0
+    3   -3.0
+    dtype: float64
     """
+    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
+        raise ValueError('invalid downcasting method provided')
+
     is_series = False
     is_index = False
     is_scalar = False
@@ -102,20 +150,51 @@ def to_numeric(arg, errors='raise'):
     else:
         values = arg
 
-    if com.is_numeric_dtype(values):
-        pass
-    elif com.is_datetime_or_timedelta_dtype(values):
-        values = values.astype(np.int64)
-    else:
-        values = com._ensure_object(values)
-        coerce_numeric = False if errors in ('ignore', 'raise') else True
+    try:
+        if com.is_numeric_dtype(values):
+            pass
+        elif com.is_datetime_or_timedelta_dtype(values):
+            values = values.astype(np.int64)
+        else:
+            values = com._ensure_object(values)
+            coerce_numeric = False if errors in ('ignore', 'raise') else True
 
-        try:
             values = lib.maybe_convert_numeric(values, set(),
                                                coerce_numeric=coerce_numeric)
-        except:
-            if errors == 'raise':
-                raise
+
+    except Exception:
+        if errors == 'raise':
+            raise
+
+    # downcast only successfully converted numeric data; 'unsigned' also
+    # requires non-negative values (zero included, hence the '>= 0' check)
+    if downcast is not None and com.is_numeric_dtype(values):
+        typecodes = None
+
+        if downcast in ('integer', 'signed'):
+            typecodes = np.typecodes['Integer']
+        elif downcast == 'unsigned' and np.min(values) >= 0:
+            typecodes = np.typecodes['UnsignedInteger']
+        elif downcast == 'float':
+            typecodes = np.typecodes['Float']
+
+            # pandas support goes only to np.float32,
+            # as float dtypes smaller than that are
+            # extremely rare and not well supported
+            float_32_char = np.dtype(np.float32).char
+            float_32_ind = typecodes.index(float_32_char)
+            typecodes = typecodes[float_32_ind:]
+
+        if typecodes is not None:
+            # candidate dtypes are tried from smallest to largest
+            for dtype in typecodes:
+                if np.dtype(dtype).itemsize < values.dtype.itemsize:
+                    values = com._possibly_downcast_to_dtype(
+                        values, dtype)
+
+                    # stop at the first dtype the data actually took
+                    if values.dtype == dtype:
+                        break
 
     if is_series:
         return pd.Series(values, index=arg.index, name=arg.name)