Skip to content

Commit 9e7666d

Browse files
authored
API: add infer_objects for soft conversions (#16915)
* API: add infer_objects for soft conversions * doc fixups * fixups * doc
1 parent fcb0263 commit 9e7666d

File tree

6 files changed

+153
-4
lines changed

6 files changed

+153
-4
lines changed

doc/source/api.rst

+2
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ Conversion
270270
:toctree: generated/
271271

272272
Series.astype
273+
Series.infer_objects
273274
Series.copy
274275
Series.isnull
275276
Series.notnull
@@ -777,6 +778,7 @@ Conversion
777778

778779
DataFrame.astype
779780
DataFrame.convert_objects
781+
DataFrame.infer_objects
780782
DataFrame.copy
781783
DataFrame.isnull
782784
DataFrame.notnull

doc/source/basics.rst

+22-1
Original file line numberDiff line numberDiff line change
@@ -2024,7 +2024,28 @@ object conversion
20242024
~~~~~~~~~~~~~~~~~
20252025

20262026
pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types.
2027-
The following functions are available for one dimensional object arrays or scalars:
2027+
In cases where the data is already of the correct type, but stored in an ``object`` array, the
2028+
:meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects` methods can be used to soft convert
2029+
to the correct type.
2030+
2031+
.. ipython:: python
2032+
2033+
df = pd.DataFrame([[1, 2],
2034+
['a', 'b'],
2035+
[datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]])
2036+
df = df.T
2037+
df
2038+
df.dtypes
2039+
2040+
Because the data was transposed, the original inference stored all columns as object, which
2041+
``infer_objects`` will correct.
2042+
2043+
.. ipython:: python
2044+
2045+
df.infer_objects().dtypes
2046+
2047+
The following functions are available for one dimensional object arrays or scalars to perform
2048+
hard conversion of objects to a specified type:
20282049

20292050
- :meth:`~pandas.to_numeric` (conversion to numeric dtypes)
20302051

doc/source/whatsnew/v0.21.0.txt

+32
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,38 @@ New features
2525
- Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`,
2626
and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`)
2727

28+
29+
.. _whatsnew_0210.enhancements.infer_objects:
30+
31+
``infer_objects`` type conversion
32+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
33+
34+
The :meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects`
35+
methods have been added to perform dtype inference on object columns, replacing
36+
some of the functionality of the deprecated ``convert_objects``
37+
method. See the documentation :ref:`here <basics.object_conversion>`
38+
for more details. (:issue:`11221`)
39+
40+
This function only performs soft conversions on object columns, converting Python objects
41+
to native types, but not any coercive conversions. For example:
42+
43+
.. ipython:: python
44+
45+
df = pd.DataFrame({'A': [1, 2, 3],
46+
'B': np.array([1, 2, 3], dtype='object'),
47+
'C': ['1', '2', '3']})
48+
df.dtypes
49+
df.infer_objects().dtypes
50+
51+
Note that column ``'C'`` was not converted - only scalar numeric types
52+
will be inferred to a new type. Other types of conversion should be accomplished
53+
using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`).
54+
.. ipython:: python
55+
56+
df = df.infer_objects()
57+
df['C'] = pd.to_numeric(df['C'], errors='coerce')
58+
df.dtypes
59+
2860
.. _whatsnew_0210.enhancements.other:
2961

3062
Other Enhancements

pandas/core/generic.py

+53-3
Original file line numberDiff line numberDiff line change
@@ -3671,16 +3671,66 @@ def convert_objects(self, convert_dates=True, convert_numeric=False,
36713671
converted : same as input object
36723672
"""
36733673
from warnings import warn
3674-
warn("convert_objects is deprecated. Use the data-type specific "
3675-
"converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.",
3676-
FutureWarning, stacklevel=2)
3674+
msg = ("convert_objects is deprecated. To re-infer data dtypes for "
3675+
"object columns, use {klass}.infer_objects()\nFor all "
3676+
"other conversions use the data-type specific converters "
3677+
"pd.to_datetime, pd.to_timedelta and pd.to_numeric."
3678+
).format(klass=self.__class__.__name__)
3679+
warn(msg, FutureWarning, stacklevel=2)
36773680

36783681
return self._constructor(
36793682
self._data.convert(convert_dates=convert_dates,
36803683
convert_numeric=convert_numeric,
36813684
convert_timedeltas=convert_timedeltas,
36823685
copy=copy)).__finalize__(self)
36833686

3687+
def infer_objects(self):
3688+
"""
3689+
Attempt to infer better dtypes for object columns.
3690+
3691+
Attempts soft conversion of object-dtyped
3692+
columns, leaving non-object and unconvertible
3693+
columns unchanged. The inference rules are the
3694+
same as during normal Series/DataFrame construction.
3695+
3696+
.. versionadded:: 0.21.0
3697+
3698+
See Also
3699+
--------
3700+
pandas.to_datetime : Convert argument to datetime.
3701+
pandas.to_timedelta : Convert argument to timedelta.
3702+
pandas.to_numeric : Convert argument to numeric type.
3703+
3704+
Returns
3705+
-------
3706+
converted : same type as input object
3707+
3708+
Examples
3709+
--------
3710+
>>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
3711+
>>> df = df.iloc[1:]
3712+
>>> df
3713+
A
3714+
1 1
3715+
2 2
3716+
3 3
3717+
3718+
>>> df.dtypes
3719+
A object
3720+
dtype: object
3721+
3722+
>>> df.infer_objects().dtypes
3723+
A int64
3724+
dtype: object
3725+
"""
3726+
# numeric=False necessary to only soft convert;
3727+
# python objects will still be converted to
3728+
# native numpy numeric types
3729+
return self._constructor(
3730+
self._data.convert(datetime=True, numeric=False,
3731+
timedelta=True, coerce=False,
3732+
copy=True)).__finalize__(self)
3733+
36843734
# ----------------------------------------------------------------------
36853735
# Filling NA's
36863736

pandas/tests/frame/test_block_internals.py

+26
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,32 @@ def test_convert_objects_no_conversion(self):
495495
mixed2 = mixed1._convert(datetime=True)
496496
assert_frame_equal(mixed1, mixed2)
497497

498+
def test_infer_objects(self):
499+
# GH 11221
500+
df = DataFrame({'a': ['a', 1, 2, 3],
501+
'b': ['b', 2.0, 3.0, 4.1],
502+
'c': ['c', datetime(2016, 1, 1),
503+
datetime(2016, 1, 2),
504+
datetime(2016, 1, 3)],
505+
'd': [1, 2, 3, 'd']},
506+
columns=['a', 'b', 'c', 'd'])
507+
df = df.iloc[1:].infer_objects()
508+
509+
assert df['a'].dtype == 'int64'
510+
assert df['b'].dtype == 'float64'
511+
assert df['c'].dtype == 'M8[ns]'
512+
assert df['d'].dtype == 'object'
513+
514+
expected = DataFrame({'a': [1, 2, 3],
515+
'b': [2.0, 3.0, 4.1],
516+
'c': [datetime(2016, 1, 1),
517+
datetime(2016, 1, 2),
518+
datetime(2016, 1, 3)],
519+
'd': [2, 3, 'd']},
520+
columns=['a', 'b', 'c', 'd'])
521+
# reconstruct frame to verify inference is same
522+
tm.assert_frame_equal(df.reset_index(drop=True), expected)
523+
498524
def test_stale_cached_series_bug_473(self):
499525

500526
# this is chained, but ok

pandas/tests/series/test_dtypes.py

+18
Original file line numberDiff line numberDiff line change
@@ -268,3 +268,21 @@ def test_series_to_categorical(self):
268268
expected = Series(['a', 'b', 'c'], dtype='category')
269269

270270
tm.assert_series_equal(result, expected)
271+
272+
def test_infer_objects_series(self):
273+
# GH 11221
274+
actual = Series(np.array([1, 2, 3], dtype='O')).infer_objects()
275+
expected = Series([1, 2, 3])
276+
tm.assert_series_equal(actual, expected)
277+
278+
actual = Series(np.array([1, 2, 3, None], dtype='O')).infer_objects()
279+
expected = Series([1., 2., 3., np.nan])
280+
tm.assert_series_equal(actual, expected)
281+
282+
# only soft conversions, unconvertible values pass through unchanged
283+
actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O'))
284+
.infer_objects())
285+
expected = Series([1, 2, 3, None, 'a'])
286+
287+
assert actual.dtype == 'object'
288+
tm.assert_series_equal(actual, expected)

0 commit comments

Comments
 (0)