diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 01aa6c60e3b2f..dd2af6e2799c3 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -43,6 +43,7 @@ Conversion :toctree: api/ DataFrame.astype + DataFrame.convert_dtypes DataFrame.infer_objects DataFrame.copy DataFrame.isna diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 4ad6a7b014532..1a69fa076dbf0 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -46,6 +46,7 @@ Conversion :toctree: api/ Series.astype + Series.convert_dtypes Series.infer_objects Series.copy Series.bool diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 0f55980b3d015..85f063f133dd9 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -806,7 +806,8 @@ dtype, it will use ``pd.NA``: Currently, pandas does not yet use those data types by default (when creating a DataFrame or Series, or when reading in data), so you need to specify -the dtype explicitly. +the dtype explicitly. An easy way to convert to those dtypes is explained +:ref:`here <missing_data.NA.conversion>`. Propagation in arithmetic and comparison operations --------------------------------------------------- @@ -942,3 +943,29 @@ work with ``NA``, and generally return ``NA``: in the future. See :ref:`dsintro.numpy_interop` for more on ufuncs. + +.. _missing_data.NA.conversion: + +Conversion +---------- + +If you have a DataFrame or Series using traditional types that have missing data +represented using ``np.nan``, there are convenience methods +:meth:`~Series.convert_dtypes` in Series and :meth:`~DataFrame.convert_dtypes` +in DataFrame that can convert data to use the newer dtypes for integers, strings and +booleans listed :ref:`here <basics.dtypes>`. This is especially helpful after reading +in data sets when letting the readers such as :meth:`read_csv` and :meth:`read_excel` +infer default dtypes. 
+ +In this example, while the dtypes of all columns are changed, we show the results for +the first 10 columns. + +.. ipython:: python + + bb = pd.read_csv('data/baseball.csv', index_col='id') + bb[bb.columns[:10]].dtypes + +.. ipython:: python + + bbn = bb.convert_dtypes() + bbn[bbn.columns[:10]].dtypes diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 8c43b53f5cdfd..a820ef132957a 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -157,6 +157,36 @@ You can use the alias ``"boolean"`` as well. s = pd.Series([True, False, None], dtype="boolean") s +.. _whatsnew_100.convert_dtypes: + +``convert_dtypes`` method to ease use of supported extension dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In order to encourage use of the extension dtypes ``StringDtype``, +``BooleanDtype``, ``Int64Dtype``, ``Int32Dtype``, etc., that support ``pd.NA``, the +methods :meth:`DataFrame.convert_dtypes` and :meth:`Series.convert_dtypes` +have been introduced. (:issue:`29752`) (:issue:`30929`) + +Example: + +.. ipython:: python + + df = pd.DataFrame({'x': ['abc', None, 'def'], + 'y': [1, 2, np.nan], + 'z': [True, False, True]}) + df + df.dtypes + +.. ipython:: python + + converted = df.convert_dtypes() + converted + converted.dtypes + +This is especially useful after reading in data using readers such as :func:`read_csv` +and :func:`read_excel`. +See :ref:`here <missing_data.NA.conversion>` for a description. + .. 
_whatsnew_100.numba_rolling_apply: Using Numba in ``rolling.apply`` diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1dbdb8dbba48b..fa80e5c7b3700 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -7,6 +7,7 @@ from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT from pandas._libs.tslibs.timezones import tz_compare +from pandas._typing import Dtype from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( @@ -34,6 +35,7 @@ is_float_dtype, is_integer, is_integer_dtype, + is_numeric_dtype, is_object_dtype, is_scalar, is_string_dtype, @@ -1018,6 +1020,80 @@ def soft_convert_objects( return values +def convert_dtypes( + input_array, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, +) -> Dtype: + """ + Convert objects to best possible type, and optionally, + to types supporting ``pd.NA``. + + Parameters + ---------- + input_array : ExtensionArray or PandasArray + convert_string : bool, default True + Whether object dtypes should be converted to ``StringDtype()``. + convert_integer : bool, default True + Whether, if possible, conversion can be done to integer extension types. + convert_boolean : bool, default True + Whether object dtypes should be converted to ``BooleanDtype()``. + + Returns + ------- + dtype + new dtype + """ + + if convert_string or convert_integer or convert_boolean: + try: + inferred_dtype = lib.infer_dtype(input_array) + except ValueError: + # Required to catch due to Period. 
Can remove once GH 23553 is fixed + inferred_dtype = input_array.dtype + + if not convert_string and is_string_dtype(inferred_dtype): + inferred_dtype = input_array.dtype + + if convert_integer: + target_int_dtype = "Int64" + + if isinstance(inferred_dtype, str) and ( + inferred_dtype == "mixed-integer" + or inferred_dtype == "mixed-integer-float" + ): + inferred_dtype = target_int_dtype + if is_integer_dtype(input_array.dtype) and not is_extension_array_dtype( + input_array.dtype + ): + from pandas.core.arrays.integer import _dtypes + + inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) + if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( + input_array.dtype + ): + inferred_dtype = target_int_dtype + + else: + if is_integer_dtype(inferred_dtype): + inferred_dtype = input_array.dtype + + if convert_boolean: + if is_bool_dtype(input_array.dtype) and not is_extension_array_dtype( + input_array.dtype + ): + inferred_dtype = "boolean" + else: + if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": + inferred_dtype = input_array.dtype + + else: + inferred_dtype = input_array.dtype + + return inferred_dtype + + def maybe_castable(arr) -> bool: # return False to force a non-fastpath diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0116207675889..93566b65d95eb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5879,6 +5879,7 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: to_datetime : Convert argument to datetime. to_timedelta : Convert argument to timedelta. to_numeric : Convert argument to numeric type. + convert_dtypes : Convert argument to best possible dtype. 
Examples -------- @@ -5907,6 +5908,142 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: ) ).__finalize__(self) + def convert_dtypes( + self: FrameOrSeries, + infer_objects: bool_t = True, + convert_string: bool_t = True, + convert_integer: bool_t = True, + convert_boolean: bool_t = True, + ) -> FrameOrSeries: + """ + Convert columns to best possible dtypes using dtypes supporting ``pd.NA``. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + infer_objects : bool, default True + Whether object dtypes should be converted to the best possible types. + convert_string : bool, default True + Whether object dtypes should be converted to ``StringDtype()``. + convert_integer : bool, default True + Whether, if possible, conversion can be done to integer extension types. + convert_boolean : bool, default True + Whether object dtypes should be converted to ``BooleanDtype()``. + + Returns + ------- + Series or DataFrame + Copy of input object with new dtype. + + See Also + -------- + infer_objects : Infer dtypes of objects. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to a numeric type. + + Notes + ----- + + By default, ``convert_dtypes`` will attempt to convert a Series (or each + Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options + ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is + possible to turn off individual conversions to ``StringDtype``, the integer + extension types or ``BooleanDtype``, respectively. + + For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference + rules as during normal Series/DataFrame construction. Then, if possible, + convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension + type, otherwise leave as ``object``. + + If the dtype is integer, convert to an appropriate integer extension type. 
+ + If the dtype is numeric, and consists of all integers, convert to an + appropriate integer extension type. + + In the future, as new dtypes are added that support ``pd.NA``, the results + of this method will change to support those new dtypes. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), + ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")), + ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")), + ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")), + ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), + ... } + ... ) + + Start with a DataFrame with default dtypes. + + >>> df + a b c d e f + 0 1 x True h 10.0 NaN + 1 2 y False i NaN 100.5 + 2 3 z NaN NaN 20.0 200.0 + + >>> df.dtypes + a int32 + b object + c object + d object + e float64 + f float64 + dtype: object + + Convert the DataFrame to use best possible dtypes. + + >>> dfn = df.convert_dtypes() + >>> dfn + a b c d e f + 0 1 x True h 10 NaN + 1 2 y False i <NA> 100.5 + 2 3 z <NA> <NA> 20 200.0 + + >>> dfn.dtypes + a Int32 + b string + c boolean + d string + e Int64 + f float64 + dtype: object + + Start with a Series of strings and missing data represented by ``np.nan``. + + >>> s = pd.Series(["a", "b", np.nan]) + >>> s + 0 a + 1 b + 2 NaN + dtype: object + + Obtain a Series with dtype ``StringDtype``. 
+ + >>> s.convert_dtypes() + 0 a + 1 b + 2 <NA> + dtype: string + """ + if self.ndim == 1: + return self._convert_dtypes( + infer_objects, convert_string, convert_integer, convert_boolean + ) + else: + results = [ + col._convert_dtypes( + infer_objects, convert_string, convert_integer, convert_boolean + ) + for col_name, col in self.items() + ] + result = pd.concat(results, axis=1, copy=False) + return result + # ---------------------------------------------------------------------- # Filling NA's diff --git a/pandas/core/series.py b/pandas/core/series.py index 270b97af9bdfb..9f8f94fe93ad8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -16,6 +16,7 @@ from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile +from pandas.core.dtypes.cast import convert_dtypes from pandas.core.dtypes.common import ( _is_unorderable_exception, ensure_platform_int, @@ -4352,6 +4353,34 @@ def between(self, left, right, inclusive=True): return lmask & rmask + # ---------------------------------------------------------------------- + # Convert to types that support pd.NA + + def _convert_dtypes( + self: ABCSeries, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + ) -> "Series": + input_series = self + if infer_objects: + input_series = input_series.infer_objects() + if is_object_dtype(input_series): + input_series = input_series.copy() + + if convert_string or convert_integer or convert_boolean: + inferred_dtype = convert_dtypes( + input_series._values, convert_string, convert_integer, convert_boolean + ) + try: + result = input_series.astype(inferred_dtype) + except TypeError: + result = input_series.copy() + else: + result = input_series.copy() + return result + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isna(self): return super().isna() diff --git a/pandas/core/tools/datetimes.py 
b/pandas/core/tools/datetimes.py index cfa42d764ee44..8c2be7092c37d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -629,6 +629,7 @@ def to_datetime( -------- DataFrame.astype : Cast argument to a specified dtype. to_timedelta : Convert argument to timedelta. + convert_dtypes : Convert dtypes. Examples -------- diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index e59ed247bd87b..4939cbfc9cc96 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -70,6 +70,7 @@ def to_numeric(arg, errors="raise", downcast=None): to_datetime : Convert argument to datetime. to_timedelta : Convert argument to timedelta. numpy.ndarray.astype : Cast a numpy array to a specified type. + convert_dtypes : Convert dtypes. Examples -------- diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 3e185feaea38e..3f0cfce39f6f9 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -49,6 +49,7 @@ def to_timedelta(arg, unit="ns", errors="raise"): -------- DataFrame.astype : Cast argument to a specified dtype. to_datetime : Convert argument to datetime. + convert_dtypes : Convert dtypes. 
Examples -------- diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 06bb040224455..2d3db3a1eff51 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1072,6 +1072,27 @@ def test_str_to_small_float_conversion_type(self): expected = pd.DataFrame(col_data, columns=["A"], dtype=float) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] + ) + def test_convert_dtypes(self, convert_integer, expected): + # Specific types are tested in tests/series/test_dtypes.py + # Just check that it works for DataFrame here + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), + } + ) + result = df.convert_dtypes(True, True, convert_integer, False) + expected = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=expected), + "b": pd.Series(["x", "y", "z"], dtype="string"), + } + ) + tm.assert_frame_equal(result, expected) + class TestDataFrameDatetimeWithTZ: def test_interleave(self, timezone_frame): diff --git a/pandas/tests/series/test_convert_dtypes.py b/pandas/tests/series/test_convert_dtypes.py new file mode 100644 index 0000000000000..923b5a94c5f41 --- /dev/null +++ b/pandas/tests/series/test_convert_dtypes.py @@ -0,0 +1,248 @@ +from itertools import product + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestSeriesConvertDtypes: + # The answerdict has keys that have 4 tuples, corresponding to the arguments + # infer_objects, convert_string, convert_integer, convert_boolean + # This allows all 16 possible combinations to be tested. 
Since common + # combinations expect the same answer, this provides an easy way to list + # all the possibilities + @pytest.mark.parametrize( + "data, maindtype, answerdict", + [ + ( + [1, 2, 3], + np.dtype("int32"), + { + ((True, False), (True, False), (True,), (True, False)): "Int32", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "int32" + ), + }, + ), + ( + [1, 2, 3], + np.dtype("int64"), + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "int64" + ), + }, + ), + ( + ["x", "y", "z"], + np.dtype("O"), + { + ( + (True, False), + (True,), + (True, False), + (True, False), + ): pd.StringDtype(), + ((True, False), (False,), (True, False), (True, False)): np.dtype( + "O" + ), + }, + ), + ( + [True, False, np.nan], + np.dtype("O"), + { + ( + (True, False), + (True, False), + (True, False), + (True,), + ): pd.BooleanDtype(), + ((True, False), (True, False), (True, False), (False,)): np.dtype( + "O" + ), + }, + ), + ( + ["h", "i", np.nan], + np.dtype("O"), + { + ( + (True, False), + (True,), + (True, False), + (True, False), + ): pd.StringDtype(), + ((True, False), (False,), (True, False), (True, False)): np.dtype( + "O" + ), + }, + ), + ( + [10, np.nan, 20], + np.dtype("float"), + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "float" + ), + }, + ), + ( + [np.nan, 100.5, 200], + np.dtype("float"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("float"), + }, + ), + ( + [3, 4, 5], + "Int8", + {((True, False), (True, False), (True, False), (True, False)): "Int8"}, + ), + ( + [[1, 2], [3, 4], [5]], + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("O"), + }, + ), + ( + [4, 5, 6], + np.dtype("uint32"), + { + ((True, False), (True, False), (True,), (True, False)): "UInt32", + 
((True, False), (True, False), (False,), (True, False)): np.dtype( + "uint32" + ), + }, + ), + ( + [-10, 12, 13], + np.dtype("i1"), + { + ((True, False), (True, False), (True,), (True, False)): "Int8", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "i1" + ), + }, + ), + ( + [1, 2.0], + object, + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True,), (True, False), (False,), (True, False)): np.dtype( + "float" + ), + ((False,), (True, False), (False,), (True, False)): np.dtype( + "object" + ), + }, + ), + ( + ["a", "b"], + pd.CategoricalDtype(), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.CategoricalDtype(), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.DatetimeTZDtype(tz="UTC"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.DatetimeTZDtype(tz="UTC"), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + "datetime64[ns]", + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("datetime64[ns]"), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + object, + { + ((True,), (True, False), (True, False), (True, False),): np.dtype( + "datetime64[ns]" + ), + ((False,), (True, False), (True, False), (True, False),): np.dtype( + "O" + ), + }, + ), + ( + pd.period_range("1/1/2011", freq="M", periods=3), + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.PeriodDtype("M"), + }, + ), + ( + pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.IntervalDtype("int64"), + }, + ), + ], + ) + @pytest.mark.parametrize("params", product(*[(True, False)] * 4)) + def test_convert_dtypes(self, data, maindtype, params, answerdict): + if maindtype is not None: + series = pd.Series(data, dtype=maindtype) + else: + 
series = pd.Series(data) + answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)} + + ns = series.convert_dtypes(*params) + expected_dtype = answers[tuple(params)] + expected = pd.Series(series.values, dtype=expected_dtype) + tm.assert_series_equal(ns, expected) + + # Test that it is a copy + copy = series.copy(deep=True) + ns[ns.notna()] = np.nan + + # Make sure original not changed + tm.assert_series_equal(series, copy)