Skip to content

Commit 4758dcc

Browse files
committed
ENH: add 'downcast' to pd.to_numeric
Closes pandas-devgh-13352.
1 parent 5701c69 commit 4758dcc

File tree

5 files changed

+273
-37
lines changed

5 files changed

+273
-37
lines changed

asv_bench/benchmarks/inference.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -135,4 +135,23 @@ def setup(self):
135135
self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
136136

137137
def time_dtype_infer_uint32(self):
138-
(self.df_uint32['A'] + self.df_uint32['B'])
138+
(self.df_uint32['A'] + self.df_uint32['B'])
139+
140+
141+
class to_numeric(object):
    """Benchmark ``pd.to_numeric`` over several inputs and ``downcast`` values.

    ``params`` holds two axes, named by ``param_names``: the data handed to
    ``pd.to_numeric`` and the value of its ``downcast`` keyword.
    """

    N = 500000

    param_names = ['data', 'downcast']
    params = [
        # ``N // 2`` must be parenthesised: ``seq * N / 2`` parses as
        # ``(seq * N) / 2`` and raises TypeError, and a list can only be
        # repeated by an integer.
        [(['1'] * (N // 2)) + ([2] * (N // 2)),
         (['-1'] * (N // 2)) + ([2] * (N // 2)),
         # The date strings have to be passed as a single sequence; given
         # as separate positional arguments the second one is taken as
         # ``dtype`` and clashes with the ``dtype`` keyword.
         np.repeat(np.array(['1970-01-01', '1970-01-02'],
                            dtype='datetime64[D]'), N),
         (['1.1'] * (N // 2)) + ([2] * (N // 2)),
         ([1] * (N // 2)) + ([2] * (N // 2)),
         np.repeat(np.int32(1), N)],
        [None, 'integer', 'signed', 'unsigned', 'float'],
    ]

    def time_to_numeric(self, data, downcast):
        """Run a single ``pd.to_numeric`` call on ``data`` with ``downcast``."""
        pd.to_numeric(data, downcast=downcast)

doc/source/basics.rst

+78-24
Original file line numberDiff line numberDiff line change
@@ -1754,39 +1754,93 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`
17541754
object conversion
17551755
~~~~~~~~~~~~~~~~~
17561756

1757-
:meth:`~DataFrame.convert_objects` is a method to try to force conversion of types from the ``object`` dtype to other types.
1758-
To force conversion of specific types that are *number like*, e.g. could be a string that represents a number,
1759-
pass ``convert_numeric=True``. This will force strings and numbers alike to be numbers if possible, otherwise
1760-
they will be set to ``np.nan``.
1757+
pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types.
1758+
The following functions are available for one dimensional object arrays or scalars:
1759+
1760+
1) :meth:`~pandas.to_datetime` (conversion to datetime objects)
1761+
1762+
.. ipython:: python
1763+
1764+
import datetime
1765+
m = ['2016-07-09', datetime.datetime(2016, 3, 2)]
1766+
pd.to_datetime(m)
1767+
1768+
2) :meth:`~pandas.to_numeric` (conversion to numeric dtypes)
1769+
1770+
.. ipython:: python
1771+
1772+
m = ['1.1', 2, 3]
1773+
pd.to_numeric(m)
1774+
1775+
3) :meth:`~pandas.to_timedelta` (conversion to timedelta objects)
1776+
1777+
.. ipython:: python
1778+
1779+
m = ['5us', pd.Timedelta('1day')]
1780+
pd.to_timedelta(m)
1781+
1782+
To force a conversion, we can pass in an ``errors`` argument, which specifies how pandas should deal with elements
1783+
that cannot be converted to the desired dtype or object. By default, ``errors='raise'``, meaning that any errors encountered
1784+
will be raised during the conversion process. However, if ``errors='coerce'``, these errors will be ignored and pandas
1785+
will convert problematic elements to ``pd.NaT`` (for datetime and timedelta) or ``np.nan`` (for numeric). This might be
1786+
useful if you are reading in data which is mostly of the desired dtype (e.g. numeric, datetime), but occasionally has
1787+
non-conforming elements intermixed that you want to represent as missing:
17611788

17621789
.. ipython:: python
1763-
:okwarning:
17641790
1765-
df3['D'] = '1.'
1766-
df3['E'] = '1'
1767-
df3.convert_objects(convert_numeric=True).dtypes
1791+
import datetime
1792+
m = ['apple', datetime.datetime(2016, 3, 2)]
1793+
pd.to_datetime(m, errors='coerce')
17681794
1769-
# same, but specific dtype conversion
1770-
df3['D'] = df3['D'].astype('float16')
1771-
df3['E'] = df3['E'].astype('int32')
1772-
df3.dtypes
1795+
m = ['apple', 2, 3]
1796+
pd.to_numeric(m, errors='coerce')
1797+
1798+
m = ['apple', pd.Timedelta('1day')]
1799+
pd.to_timedelta(m, errors='coerce')
17731800
1774-
To force conversion to ``datetime64[ns]``, pass ``convert_dates='coerce'``.
1775-
This will convert any datetime-like object to dates, forcing other values to ``NaT``.
1776-
This might be useful if you are reading in data which is mostly dates,
1777-
but occasionally has non-dates intermixed and you want to represent as missing.
1801+
The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it
1802+
encounters any errors with the conversion to a desired data type:
17781803

17791804
.. ipython:: python
17801805
1781-
import datetime
1782-
s = pd.Series([datetime.datetime(2001,1,1,0,0),
1783-
'foo', 1.0, 1, pd.Timestamp('20010104'),
1784-
'20010105'], dtype='O')
1785-
s
1786-
pd.to_datetime(s, errors='coerce')
1806+
import datetime
1807+
m = ['apple', datetime.datetime(2016, 3, 2)]
1808+
pd.to_datetime(m, errors='ignore')
1809+
1810+
m = ['apple', 2, 3]
1811+
pd.to_numeric(m, errors='ignore')
1812+
1813+
m = ['apple', pd.Timedelta('1day')]
1814+
pd.to_timedelta(m, errors='ignore')
1815+
1816+
In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument ``downcast``, which gives the
1817+
option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory:
1818+
1819+
.. ipython:: python
1820+
1821+
m = ['1', 2, 3]
1822+
pd.to_numeric(m, downcast='integer') # smallest signed int dtype
1823+
pd.to_numeric(m, downcast='signed') # same as 'integer'
1824+
pd.to_numeric(m, downcast='unsigned') # smallest unsigned int dtype
1825+
pd.to_numeric(m, downcast='float') # smallest float dtype
1826+
1827+
As these methods apply only to one-dimensional arrays, they cannot be used directly on multi-dimensional objects such
1828+
as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the function over each column:
17871829

1788-
In addition, :meth:`~DataFrame.convert_objects` will attempt the *soft* conversion of any *object* dtypes, meaning that if all
1789-
the objects in a Series are of the same type, the Series will have that dtype.
1830+
.. ipython:: python
1831+
1832+
import datetime
1833+
df = pd.DataFrame([['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O')
1834+
df
1835+
df.apply(pd.to_datetime)
1836+
1837+
df = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O')
1838+
df
1839+
df.apply(pd.to_numeric)
1840+
1841+
df = pd.DataFrame([['5us', pd.Timedelta('1day')]] * 2, dtype='O')
1842+
df
1843+
df.apply(pd.to_timedelta)
17901844
17911845
gotchas
17921846
~~~~~~~

doc/source/whatsnew/v0.19.0.txt

+7
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,13 @@ Other enhancements
186186
^^^^^^^^^^^^^^^^^^
187187

188188
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour is still to raise a ``NonExistentTimeError`` (:issue:`13057`)
189+
- ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data, if possible, to the smallest specified numerical dtype (:issue:`13352`)
190+
191+
.. ipython:: python
192+
193+
s = ['1', 2, 3]
194+
pd.to_numeric(s, downcast='unsigned')
195+
pd.to_numeric(s, downcast='integer')
189196

190197
- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here <text.extractall>` (:issue:`10008`, :issue:`13156`)
191198
- ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`)

pandas/tools/tests/test_util.py

+77
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,83 @@ def test_non_hashable(self):
291291
with self.assertRaisesRegexp(TypeError, "Invalid object type"):
292292
pd.to_numeric(s)
293293

294+
def test_downcast(self):
    """Check the ``downcast`` keyword of ``pd.to_numeric`` (see gh-13352)."""
    # every one of these inputs is expected to convert to [1, 2, 3]
    convertible_inputs = (
        ['1', 2, 3],
        [1, 2, 3],
        np.array(['1970-01-02', '1970-01-03', '1970-01-04'],
                 dtype='datetime64[D]'),
    )

    bad_method = 'unsigned-integer'
    err_msg = 'invalid downcasting method provided'

    int_target = np.dtype(np.typecodes['Integer'][0])
    uint_target = np.dtype(np.typecodes['UnsignedInteger'][0])

    # float dtypes below np.float32 are few and far between,
    # so np.float32 is the expected target of a float downcast
    float_target = np.dtype(np.float32).char

    # (downcast value, dtype of the expected result), in checking order
    valid_cases = (
        ('integer', int_target),
        ('signed', int_target),
        ('unsigned', uint_target),
        ('float', float_target),
    )

    for data in convertible_inputs:
        # an unknown downcast method is rejected
        with self.assertRaisesRegexp(ValueError, err_msg):
            pd.to_numeric(data, downcast=bad_method)

        # with no downcast requested the result is int64
        expected = np.array([1, 2, 3], dtype=np.int64)
        tm.assert_numpy_array_equal(pd.to_numeric(data), expected)
        tm.assert_numpy_array_equal(
            pd.to_numeric(data, downcast=None), expected)

        for method, target in valid_cases:
            expected = np.array([1, 2, 3], dtype=target)
            result = pd.to_numeric(data, downcast=method)
            tm.assert_numpy_array_equal(result, expected)

    # the data cannot be cast to a numeric dtype, so the
    # downcast parameter is expected to have no effect
    data = ['foo', 2, 3]
    expected = np.array(data, dtype=object)
    result = pd.to_numeric(data, errors='ignore', downcast='unsigned')
    tm.assert_numpy_array_equal(result, expected)

    # a negative number rules out an unsigned integer dtype
    data = ['-1', 2, 3]
    expected = np.array([-1, 2, 3], dtype=np.int64)
    result = pd.to_numeric(data, downcast='unsigned')
    tm.assert_numpy_array_equal(result, expected)

    # a float number rules out every integer dtype,
    # whether signed or unsigned
    data = ['1.1', 2, 3]
    expected = np.array([1.1, 2, 3], dtype=np.float64)
    for method in ('integer', 'signed', 'unsigned'):
        result = pd.to_numeric(data, downcast=method)
        tm.assert_numpy_array_equal(result, expected)

    # the smallest fitting integer dtype need not be np.(u)int8
    data = ['256', 257, 258]
    wider_cases = (
        ('integer', np.int16),
        ('signed', np.int16),
        ('unsigned', np.uint16),
    )
    for method, target in wider_cases:
        expected = np.array([256, 257, 258], dtype=target)
        result = pd.to_numeric(data, downcast=method)
        tm.assert_numpy_array_equal(result, expected)
294371

295372
if __name__ == '__main__':
296373
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/tools/util.py

+91-12
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def compose(*funcs):
5050
return reduce(_compose2, funcs)
5151

5252

53-
def to_numeric(arg, errors='raise'):
53+
def to_numeric(arg, errors='raise', downcast=None):
5454
"""
5555
Convert argument to a numeric type.
5656
@@ -61,6 +61,27 @@ def to_numeric(arg, errors='raise'):
6161
- If 'raise', then invalid parsing will raise an exception
6262
- If 'coerce', then invalid parsing will be set as NaN
6363
- If 'ignore', then invalid parsing will return the input
64+
downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
65+
If not None, and if the data has been successfully cast to a
66+
numerical dtype (or if the data was numeric to begin with),
67+
downcast that resulting data to the smallest numerical dtype
68+
possible according to the following rules:
69+
70+
- 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
71+
- 'unsigned': smallest unsigned int dtype (min.: np.uint8)
72+
- 'float': smallest float dtype (min.: np.float32)
73+
74+
As this behaviour is separate from the core conversion to
75+
numeric values, any errors raised during the downcasting
76+
will be surfaced regardless of the value of the 'errors' input.
77+
78+
In addition, downcasting will only occur if the size
79+
of the resulting data's dtype is strictly larger than
80+
the dtype it is to be cast to, so if none of the dtypes
81+
checked satisfy that specification, no downcasting will be
82+
performed on the data.
83+
84+
.. versionadded:: 0.19.0
6485
6586
Returns
6687
-------
@@ -74,10 +95,37 @@ def to_numeric(arg, errors='raise'):
7495
>>> import pandas as pd
7596
>>> s = pd.Series(['1.0', '2', -3])
7697
>>> pd.to_numeric(s)
98+
0 1.0
99+
1 2.0
100+
2 -3.0
101+
dtype: float64
102+
>>> pd.to_numeric(s, downcast='float')
103+
0 1.0
104+
1 2.0
105+
2 -3.0
106+
dtype: float32
107+
>>> pd.to_numeric(s, downcast='signed')
108+
0 1
109+
1 2
110+
2 -3
111+
dtype: int8
77112
>>> s = pd.Series(['apple', '1.0', '2', -3])
78113
>>> pd.to_numeric(s, errors='ignore')
114+
0 apple
115+
1 1.0
116+
2 2
117+
3 -3
118+
dtype: object
79119
>>> pd.to_numeric(s, errors='coerce')
120+
0 NaN
121+
1 1.0
122+
2 2.0
123+
3 -3.0
124+
dtype: float64
80125
"""
126+
if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
127+
raise ValueError('invalid downcasting method provided')
128+
81129
is_series = False
82130
is_index = False
83131
is_scalar = False
@@ -102,20 +150,51 @@ def to_numeric(arg, errors='raise'):
102150
else:
103151
values = arg
104152

105-
if com.is_numeric_dtype(values):
106-
pass
107-
elif com.is_datetime_or_timedelta_dtype(values):
108-
values = values.astype(np.int64)
109-
else:
110-
values = com._ensure_object(values)
111-
coerce_numeric = False if errors in ('ignore', 'raise') else True
153+
try:
154+
if com.is_numeric_dtype(values):
155+
pass
156+
elif com.is_datetime_or_timedelta_dtype(values):
157+
values = values.astype(np.int64)
158+
else:
159+
values = com._ensure_object(values)
160+
coerce_numeric = False if errors in ('ignore', 'raise') else True
112161

113-
try:
114162
values = lib.maybe_convert_numeric(values, set(),
115163
coerce_numeric=coerce_numeric)
116-
except:
117-
if errors == 'raise':
118-
raise
164+
165+
except Exception:
166+
if errors == 'raise':
167+
raise
168+
169+
# attempt downcast only if the data has been successfully converted
170+
# to a numerical dtype and if a downcast method has been specified
171+
if downcast is not None and com.is_numeric_dtype(values):
172+
typecodes = None
173+
174+
if downcast in ('integer', 'signed'):
175+
typecodes = np.typecodes['Integer']
176+
elif downcast == 'unsigned' and np.min(values) > 0:
177+
typecodes = np.typecodes['UnsignedInteger']
178+
elif downcast == 'float':
179+
typecodes = np.typecodes['Float']
180+
181+
# pandas support goes only to np.float32,
182+
# as float dtypes smaller than that are
183+
# extremely rare and not well supported
184+
float_32_char = np.dtype(np.float32).char
185+
float_32_ind = typecodes.index(float_32_char)
186+
typecodes = typecodes[float_32_ind:]
187+
188+
if typecodes is not None:
189+
# from smallest to largest
190+
for dtype in typecodes:
191+
if np.dtype(dtype).itemsize < values.dtype.itemsize:
192+
values = com._possibly_downcast_to_dtype(
193+
values, dtype)
194+
195+
# successful conversion
196+
if values.dtype == dtype:
197+
break
119198

120199
if is_series:
121200
return pd.Series(values, index=arg.index, name=arg.name)

0 commit comments

Comments
 (0)