From 3670711cc8f56085783a22ccdc8274de041779df Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 13 Jul 2017 18:37:25 -0500 Subject: [PATCH 1/4] API: add infer_objects for soft conversions --- doc/source/whatsnew/v0.21.0.txt | 29 +++++++++++++ pandas/core/generic.py | 48 ++++++++++++++++++++-- pandas/tests/frame/test_block_internals.py | 22 ++++++++++ pandas/tests/series/test_dtypes.py | 16 ++++++++ 4 files changed, 112 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 8ba57c0fa50be..979418a406575 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -25,6 +25,35 @@ New features - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) + +``infer_objects`` type conversion +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``DataFrame`` and ``Series`` have gained an ``infer_objects`` method to perform dtype inference +on object columns, replacing some of the functionality of the deprecated ``convert_objects`` +function (:issue:`11221`) + +This function only performs soft conversions on object columns, converting python scalars +to native types, but not any coercive conversions. For example + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3], + 'B': np.array([1, 2, 3], dtype='object'), + 'C': ['1', '2', '3']}) + df.dtypes + df.infer_objects().dtypes + +Note that column ``'C'`` was not converted - only scalar numeric types +will be inferred to a new type. Other types of conversion should be accomplished +using :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`) + +.. ipython:: python + + df = df.infer_objects() + df['C'] = pd.to_numeric(df['C'], errors='coerce') + df.dtypes + .. 
_whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5722539b87aec..4a6fbaf72f58a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3634,9 +3634,12 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, converted : same as input object """ from warnings import warn - warn("convert_objects is deprecated. Use the data-type specific " - "converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.", - FutureWarning, stacklevel=2) + msg = ("convert_objects is deprecated. To re-infer data dtypes for " + "object columns, use {klass}.infer_objects()\nFor all " + "other conversions use the data-type specific converters " + "pd.to_datetime, pd.to_timedelta and pd.to_numeric." + ).format(klass=self.__class__.__name__) + warn(msg, FutureWarning, stacklevel=2) return self._constructor( self._data.convert(convert_dates=convert_dates, @@ -3644,6 +3647,45 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, convert_timedeltas=convert_timedeltas, copy=copy)).__finalize__(self) + def infer_objects(self): + """ + Attempt to infer better dtypes for only object columns + Only attempts soft conversions + + See Also + -------- + pandas.to_datetime : Convert argument to datetime. + pandas.to_timedelta : Convert argument to timedelta. 
+ pandas.to_numeric : Convert argument to numeric type + + Returns + ------- + converted : same as input object + + Examples + -------- + >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]}) + >>> df = df.iloc[1:] + >>> df + A + 1 1 + 2 2 + 3 3 + >>> df.dtypes + A object + dtype: object + >>> df.infer_objects().dtypes + A int64 + dtype: object + """ + # numeric=False necessary to only soft convert; + # python objects will still be converted to + # native numpy numeric types + return self._constructor( + self._data.convert(datetime=True, numeric=False, + timedelta=True, coerce=False, + copy=True)).__finalize__(self) + # ---------------------------------------------------------------------- # Filling NA's diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index c1a5b437be5d0..6f87436e0d0fd 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -495,6 +495,28 @@ def test_convert_objects_no_conversion(self): mixed2 = mixed1._convert(datetime=True) assert_frame_equal(mixed1, mixed2) + def test_infer_objects(self): + # GH 11221 + df = DataFrame({'a': ['a', 1, 2, 3], + 'b': ['b', 2.0, 3.0, 4.1], + 'c': ['c', datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3)]}, + columns=['a', 'b', 'c']) + df = df.iloc[1:].infer_objects() + + assert df['a'].dtype == 'int64' + assert df['b'].dtype == 'float64' + assert df['c'].dtype == 'M8[ns]' + + expected = DataFrame({'a': [1, 2, 3], + 'b': [2.0, 3.0, 4.1], + 'c': [datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3)]}, + columns=['a', 'b', 'c']) + tm.assert_frame_equal(df.reset_index(drop=True), expected) + def test_stale_cached_series_bug_473(self): # this is chained, but ok diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 2ec579842e33f..246efb36fa1ac 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -268,3 +268,19 @@ def 
test_series_to_categorical(self): expected = Series(['a', 'b', 'c'], dtype='category') tm.assert_series_equal(result, expected) + + def test_infer_objects_series(self): + # GH 11221 + actual = Series(np.array([1, 2, 3], dtype='O')).infer_objects() + expected = Series([1, 2, 3]) + tm.assert_series_equal(actual, expected) + + actual = Series(np.array([1, 2, 3, None], dtype='O')).infer_objects() + expected = Series([1., 2., 3., np.nan]) + tm.assert_series_equal(actual, expected) + + actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O')) + .infer_objects()) + expected = Series([1, 2, 3, None, 'a']) + assert actual.dtype == 'object' + tm.assert_series_equal(actual, expected) From c0c9ec78cc411f1b3c463c20204bf881de764a1e Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 17 Jul 2017 18:09:26 -0500 Subject: [PATCH 2/4] doc fixups --- doc/source/api.rst | 2 ++ doc/source/basics.rst | 17 ++++++++++++++++- doc/source/whatsnew/v0.21.0.txt | 16 ++++++++++------ pandas/core/generic.py | 12 +++++++++--- pandas/tests/frame/test_block_internals.py | 12 ++++++++---- pandas/tests/series/test_dtypes.py | 1 + 6 files changed, 46 insertions(+), 14 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index d6053791d6f4b..77d095a965221 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -270,6 +270,7 @@ Conversion :toctree: generated/ Series.astype + Series.infer_objects Series.copy Series.isnull Series.notnull @@ -777,6 +778,7 @@ Conversion DataFrame.astype DataFrame.convert_objects + DataFrame.infer_objects DataFrame.copy DataFrame.isnull DataFrame.notnull diff --git a/doc/source/basics.rst b/doc/source/basics.rst index d8b1602fb104d..a1287c8d30003 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -2024,7 +2024,22 @@ object conversion ~~~~~~~~~~~~~~~~~ pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types. 
-The following functions are available for one dimensional object arrays or scalars: +In cases where the data is already of the correct type, but stored in an ``object`` array, the +:meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects` can be used to soft convert +to the correct type. + + .. ipython:: python + + df = pd.DataFrame([[1, 2], + ['a', 'b'], + [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]]) + df = df.T + df + df.dtypes + df.infer_objects().dtypes + +The following functions are available for one dimensional object arrays or scalars to perform +hard conversion of objects to a specified type: - :meth:`~pandas.to_numeric` (conversion to numeric dtypes) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 979418a406575..b83584de2b8a7 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -26,15 +26,18 @@ New features and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) +.. _whatsnew_0210.enhancements.infer_objects: + ``infer_objects`` type conversion ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``DataFrame`` and ``Series`` have gained an ``infer_objects`` method to perform dtype inference -on object columns, replacing some of the functionality of the deprecated ``convert_objects`` -function (:issue:`11221`) +The `:meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects` +have been added to perform dtype inference on object columns, replacing +some of the functionality of the deprecated ``convert_objects`` +method (:issue:`11221`) -This function only performs soft conversions on object columns, converting python scalars -to native types, but not any coercive conversions. For example +This function only performs soft conversions on object columns, converting Python objects +to native types, but not any coercive conversions. For example: .. 
ipython:: python @@ -46,7 +49,8 @@ to native types, but not any coercive conversions. For example Note that column ``'C'`` was not converted - only scalar numeric types will be inferred to a new type. Other types of conversion should be accomplished -using :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`) +using :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`). See +the documentation :ref:`here ` for more details. .. ipython:: python diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4a6fbaf72f58a..66c667dc537be 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3649,8 +3649,14 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, def infer_objects(self): """ - Attempt to infer better dtypes for only object columns - Only attempts soft conversions + Attempt to infer better dtypes for object columns + + Attempts soft conversion of object dtyped + columns, leaving non-object and unconvertible + columns unchanged. The inference rules are the + same as during normal Series/DataFrame construction. + + .. 
versionadded:: 0.21.0 See Also -------- @@ -3660,7 +3666,7 @@ def infer_objects(self): Returns ------- - converted : same as input object + converted : same type as input object Examples -------- diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 6f87436e0d0fd..f66070fd66813 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -501,20 +501,24 @@ def test_infer_objects(self): 'b': ['b', 2.0, 3.0, 4.1], 'c': ['c', datetime(2016, 1, 1), datetime(2016, 1, 2), - datetime(2016, 1, 3)]}, - columns=['a', 'b', 'c']) + datetime(2016, 1, 3)], + 'd': [1, 2, 3, 'd']}, + columns=['a', 'b', 'c', 'd']) df = df.iloc[1:].infer_objects() assert df['a'].dtype == 'int64' assert df['b'].dtype == 'float64' assert df['c'].dtype == 'M8[ns]' + assert df['d'].dtype == 'object' expected = DataFrame({'a': [1, 2, 3], 'b': [2.0, 3.0, 4.1], 'c': [datetime(2016, 1, 1), datetime(2016, 1, 2), - datetime(2016, 1, 3)]}, - columns=['a', 'b', 'c']) + datetime(2016, 1, 3)], + 'd': [2, 3, 'd']}, + columns=['a', 'b', 'c', 'd']) + # reconstruct frame to verify inference is same tm.assert_frame_equal(df.reset_index(drop=True), expected) def test_stale_cached_series_bug_473(self): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 246efb36fa1ac..638475dab7288 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -279,6 +279,7 @@ def test_infer_objects_series(self): expected = Series([1., 2., 3., np.nan]) tm.assert_series_equal(actual, expected) + # only soft conversions, unconvertible values pass through unchanged actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O')) .infer_objects()) expected = Series([1, 2, 3, None, 'a']) From b5442d3b6e2340a7663d0d986ec0d8fa78b9fd1b Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 17 Jul 2017 20:13:09 -0500 Subject: [PATCH 3/4] fixups --- pandas/core/generic.py | 4 ++-- 
pandas/tests/series/test_dtypes.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 66c667dc537be..cfb88dd1a32d8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3649,9 +3649,9 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, def infer_objects(self): """ - Attempt to infer better dtypes for object columns + Attempt to infer better dtypes for object columns. - Attempts soft conversion of object dtyped + Attempts soft conversion of object-dtyped columns, leaving non-object and unconvertible columns unchanged. The inference rules are the same as during normal Series/DataFrame construction. diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 638475dab7288..c214280ee8386 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -283,5 +283,6 @@ def test_infer_objects_series(self): actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O')) .infer_objects()) expected = Series([1, 2, 3, None, 'a']) + assert actual.dtype == 'object' tm.assert_series_equal(actual, expected) From 4268007aa21b8e28078c5f73790c421be7bf834a Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 17 Jul 2017 21:00:41 -0500 Subject: [PATCH 4/4] doc --- doc/source/basics.rst | 6 ++++++ doc/source/whatsnew/v0.21.0.txt | 9 ++++----- pandas/core/generic.py | 2 ++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index a1287c8d30003..4211b15203721 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -2036,6 +2036,12 @@ to the correct type. df = df.T df df.dtypes + +Because the data was transposed, the original inference stored all columns as object, which +``infer_objects`` will correct. + + .. 
ipython:: python + df.infer_objects().dtypes The following functions are available for one dimensional object arrays or scalars to perform diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index b83584de2b8a7..5f2bb03ecf98a 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -32,9 +32,10 @@ New features ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The `:meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects` -have been added to perform dtype inference on object columns, replacing +methods have been added to perform dtype inference on object columns, replacing some of the functionality of the deprecated ``convert_objects`` -method (:issue:`11221`) +method. See the documentation :ref:`here ` +for more details. (:issue:`11221`) This function only performs soft conversions on object columns, converting Python objects to native types, but not any coercive conversions. For example: @@ -49,9 +50,7 @@ to native types, but not any coercive conversions. For example: Note that column ``'C'`` was not converted - only scalar numeric types will be inferred to a new type. Other types of conversion should be accomplished -using :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`). See -the documentation :ref:`here ` for more details. - +using :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`). .. ipython:: python df = df.infer_objects() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cfb88dd1a32d8..7a064440b5273 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3677,9 +3677,11 @@ def infer_objects(self): 1 1 2 2 3 3 + >>> df.dtypes A object dtype: object + >>> df.infer_objects().dtypes A int64 dtype: object