Skip to content

Commit b463f4a

Browse files
ENH: Implement convert_dtypes (pandas-dev#30929) (pandas-dev#31282)
Co-authored-by: Irv Lustig <[email protected]>
1 parent e93bc80 commit b463f4a

File tree

12 files changed

+574
-1
lines changed

12 files changed

+574
-1
lines changed

doc/source/reference/frame.rst

+1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ Conversion
4343
:toctree: api/
4444

4545
DataFrame.astype
46+
DataFrame.convert_dtypes
4647
DataFrame.infer_objects
4748
DataFrame.copy
4849
DataFrame.isna

doc/source/reference/series.rst

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ Conversion
4646
:toctree: api/
4747

4848
Series.astype
49+
Series.convert_dtypes
4950
Series.infer_objects
5051
Series.copy
5152
Series.bool

doc/source/user_guide/missing_data.rst

+28-1
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,8 @@ dtype, it will use ``pd.NA``:
806806
807807
Currently, pandas does not yet use those data types by default (when creating
808808
a DataFrame or Series, or when reading in data), so you need to specify
809-
the dtype explicitly.
809+
the dtype explicitly. An easy way to convert to those dtypes is explained
810+
:ref:`here <missing_data.NA.conversion>`.
810811

811812
Propagation in arithmetic and comparison operations
812813
---------------------------------------------------
@@ -942,3 +943,29 @@ work with ``NA``, and generally return ``NA``:
942943
in the future.
943944

944945
See :ref:`dsintro.numpy_interop` for more on ufuncs.
946+
947+
.. _missing_data.NA.conversion:
948+
949+
Conversion
950+
----------
951+
952+
If you have a DataFrame or Series using traditional types that have missing data
953+
represented using ``np.nan``, there are convenience methods
954+
:meth:`~Series.convert_dtypes` in Series and :meth:`~DataFrame.convert_dtypes`
955+
in DataFrame that can convert data to use the newer dtypes for integers, strings and
956+
booleans listed :ref:`here <basics.dtypes>`. This is especially helpful after reading
957+
in data sets when letting the readers such as :func:`read_csv` and :func:`read_excel`
958+
infer default dtypes.
959+
960+
In this example, while the dtypes of all columns are changed, we show the results for
961+
the first 10 columns.
962+
963+
.. ipython:: python
964+
965+
bb = pd.read_csv('data/baseball.csv', index_col='id')
966+
bb[bb.columns[:10]].dtypes
967+
968+
.. ipython:: python
969+
970+
bbn = bb.convert_dtypes()
971+
bbn[bbn.columns[:10]].dtypes

doc/source/whatsnew/v1.0.0.rst

+30
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,36 @@ You can use the alias ``"boolean"`` as well.
157157
s = pd.Series([True, False, None], dtype="boolean")
158158
s
159159
160+
.. _whatsnew_100.convert_dtypes:
161+
162+
``convert_dtypes`` method to ease use of supported extension dtypes
163+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
164+
165+
In order to encourage use of the extension dtypes ``StringDtype``,
166+
``BooleanDtype``, ``Int64Dtype``, ``Int32Dtype``, etc., that support ``pd.NA``, the
167+
methods :meth:`DataFrame.convert_dtypes` and :meth:`Series.convert_dtypes`
168+
have been introduced. (:issue:`29752`) (:issue:`30929`)
169+
170+
Example:
171+
172+
.. ipython:: python
173+
174+
df = pd.DataFrame({'x': ['abc', None, 'def'],
175+
'y': [1, 2, np.nan],
176+
'z': [True, False, True]})
177+
df
178+
df.dtypes
179+
180+
.. ipython:: python
181+
182+
converted = df.convert_dtypes()
183+
converted
184+
converted.dtypes
185+
186+
This is especially useful after reading in data using readers such as :func:`read_csv`
187+
and :func:`read_excel`.
188+
See :ref:`here <missing_data.NA.conversion>` for a description.
189+
160190
.. _whatsnew_100.numba_rolling_apply:
161191

162192
Using Numba in ``rolling.apply``

pandas/core/dtypes/cast.py

+76
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pandas._libs import lib, tslib, tslibs
88
from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT
99
from pandas._libs.tslibs.timezones import tz_compare
10+
from pandas._typing import Dtype
1011
from pandas.util._validators import validate_bool_kwarg
1112

1213
from pandas.core.dtypes.common import (
@@ -34,6 +35,7 @@
3435
is_float_dtype,
3536
is_integer,
3637
is_integer_dtype,
38+
is_numeric_dtype,
3739
is_object_dtype,
3840
is_scalar,
3941
is_string_dtype,
@@ -1018,6 +1020,80 @@ def soft_convert_objects(
10181020
return values
10191021

10201022

1023+
def convert_dtypes(
1024+
input_array,
1025+
convert_string: bool = True,
1026+
convert_integer: bool = True,
1027+
convert_boolean: bool = True,
1028+
) -> Dtype:
1029+
"""
1030+
Determine the best possible dtype for an array, optionally
1031+
a dtype supporting ``pd.NA``. The array itself is not converted.
1032+
1033+
Parameters
1034+
----------
1035+
input_array : ExtensionArray or PandasArray
1036+
convert_string : bool, default True
1037+
Whether object dtypes should be converted to ``StringDtype()``.
1038+
convert_integer : bool, default True
1039+
Whether, if possible, conversion can be done to integer extension types.
1040+
convert_boolean : bool, default True
1041+
Whether object dtypes should be converted to ``BooleanDtype()``.
1042+
1043+
Returns
1044+
-------
1045+
dtype
1046+
new dtype
1047+
"""
1048+
1049+
if convert_string or convert_integer or convert_boolean:
1050+
try:
1051+
inferred_dtype = lib.infer_dtype(input_array)
1052+
except ValueError:
1053+
# Required to catch due to Period. Can remove once GH 23553 is fixed
1054+
inferred_dtype = input_array.dtype
1055+
1056+
if not convert_string and is_string_dtype(inferred_dtype):
1057+
inferred_dtype = input_array.dtype
1058+
1059+
if convert_integer:
1060+
target_int_dtype = "Int64"
1061+
1062+
if isinstance(inferred_dtype, str) and (
1063+
inferred_dtype == "mixed-integer"
1064+
or inferred_dtype == "mixed-integer-float"
1065+
):
1066+
inferred_dtype = target_int_dtype
1067+
if is_integer_dtype(input_array.dtype) and not is_extension_array_dtype(
1068+
input_array.dtype
1069+
):
1070+
from pandas.core.arrays.integer import _dtypes
1071+
1072+
inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype)
1073+
if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
1074+
input_array.dtype
1075+
):
1076+
inferred_dtype = target_int_dtype
1077+
1078+
else:
1079+
if is_integer_dtype(inferred_dtype):
1080+
inferred_dtype = input_array.dtype
1081+
1082+
if convert_boolean:
1083+
if is_bool_dtype(input_array.dtype) and not is_extension_array_dtype(
1084+
input_array.dtype
1085+
):
1086+
inferred_dtype = "boolean"
1087+
else:
1088+
if isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
1089+
inferred_dtype = input_array.dtype
1090+
1091+
else:
1092+
inferred_dtype = input_array.dtype
1093+
1094+
return inferred_dtype
1095+
1096+
10211097
def maybe_castable(arr) -> bool:
10221098
# return False to force a non-fastpath
10231099

pandas/core/generic.py

+137
Original file line numberDiff line numberDiff line change
@@ -5879,6 +5879,7 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
58795879
to_datetime : Convert argument to datetime.
58805880
to_timedelta : Convert argument to timedelta.
58815881
to_numeric : Convert argument to numeric type.
5882+
convert_dtypes : Convert argument to best possible dtype.
58825883
58835884
Examples
58845885
--------
@@ -5907,6 +5908,142 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
59075908
)
59085909
).__finalize__(self)
59095910

5911+
def convert_dtypes(
5912+
self: FrameOrSeries,
5913+
infer_objects: bool_t = True,
5914+
convert_string: bool_t = True,
5915+
convert_integer: bool_t = True,
5916+
convert_boolean: bool_t = True,
5917+
) -> FrameOrSeries:
5918+
"""
5919+
Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
5920+
5921+
.. versionadded:: 1.0.0
5922+
5923+
Parameters
5924+
----------
5925+
infer_objects : bool, default True
5926+
Whether object dtypes should be converted to the best possible types.
5927+
convert_string : bool, default True
5928+
Whether object dtypes should be converted to ``StringDtype()``.
5929+
convert_integer : bool, default True
5930+
Whether, if possible, conversion can be done to integer extension types.
5931+
convert_boolean : bool, default True
5932+
Whether object dtypes should be converted to ``BooleanDtype()``.
5933+
5934+
Returns
5935+
-------
5936+
Series or DataFrame
5937+
Copy of input object with new dtype.
5938+
5939+
See Also
5940+
--------
5941+
infer_objects : Infer dtypes of objects.
5942+
to_datetime : Convert argument to datetime.
5943+
to_timedelta : Convert argument to timedelta.
5944+
to_numeric : Convert argument to a numeric type.
5945+
5946+
Notes
5947+
-----
5948+
5949+
By default, ``convert_dtypes`` will attempt to convert a Series (or each
5950+
Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
5951+
``convert_string``, ``convert_integer``, and ``convert_boolean``, it is
5952+
possible to turn off individual conversions to ``StringDtype``, the integer
5953+
extension types or ``BooleanDtype``, respectively.
5954+
5955+
For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
5956+
rules as during normal Series/DataFrame construction. Then, if possible,
5957+
convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension
5958+
type, otherwise leave as ``object``.
5959+
5960+
If the dtype is integer, convert to an appropriate integer extension type.
5961+
5962+
If the dtype is numeric, and consists of all integers, convert to an
5963+
appropriate integer extension type.
5964+
5965+
In the future, as new dtypes are added that support ``pd.NA``, the results
5966+
of this method will change to support those new dtypes.
5967+
5968+
Examples
5969+
--------
5970+
>>> df = pd.DataFrame(
5971+
... {
5972+
... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
5973+
... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
5974+
... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
5975+
... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
5976+
... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
5977+
... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
5978+
... }
5979+
... )
5980+
5981+
Start with a DataFrame with default dtypes.
5982+
5983+
>>> df
5984+
a b c d e f
5985+
0 1 x True h 10.0 NaN
5986+
1 2 y False i NaN 100.5
5987+
2 3 z NaN NaN 20.0 200.0
5988+
5989+
>>> df.dtypes
5990+
a int32
5991+
b object
5992+
c object
5993+
d object
5994+
e float64
5995+
f float64
5996+
dtype: object
5997+
5998+
Convert the DataFrame to use best possible dtypes.
5999+
6000+
>>> dfn = df.convert_dtypes()
6001+
>>> dfn
6002+
a b c d e f
6003+
0 1 x True h 10 NaN
6004+
1 2 y False i <NA> 100.5
6005+
2 3 z <NA> <NA> 20 200.0
6006+
6007+
>>> dfn.dtypes
6008+
a Int32
6009+
b string
6010+
c boolean
6011+
d string
6012+
e Int64
6013+
f float64
6014+
dtype: object
6015+
6016+
Start with a Series of strings and missing data represented by ``np.nan``.
6017+
6018+
>>> s = pd.Series(["a", "b", np.nan])
6019+
>>> s
6020+
0 a
6021+
1 b
6022+
2 NaN
6023+
dtype: object
6024+
6025+
Obtain a Series with dtype ``StringDtype``.
6026+
6027+
>>> s.convert_dtypes()
6028+
0 a
6029+
1 b
6030+
2 <NA>
6031+
dtype: string
6032+
"""
6033+
if self.ndim == 1:
6034+
return self._convert_dtypes(
6035+
infer_objects, convert_string, convert_integer, convert_boolean
6036+
)
6037+
else:
6038+
results = [
6039+
col._convert_dtypes(
6040+
infer_objects, convert_string, convert_integer, convert_boolean
6041+
)
6042+
for col_name, col in self.items()
6043+
]
6044+
result = pd.concat(results, axis=1, copy=False)
6045+
return result
6046+
59106047
# ----------------------------------------------------------------------
59116048
# Filling NA's
59126049

pandas/core/series.py

+29
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from pandas.util._decorators import Appender, Substitution
1717
from pandas.util._validators import validate_bool_kwarg, validate_percentile
1818

19+
from pandas.core.dtypes.cast import convert_dtypes
1920
from pandas.core.dtypes.common import (
2021
_is_unorderable_exception,
2122
ensure_platform_int,
@@ -4352,6 +4353,34 @@ def between(self, left, right, inclusive=True):
43524353

43534354
return lmask & rmask
43544355

4356+
# ----------------------------------------------------------------------
4357+
# Convert to types that support pd.NA
4358+
4359+
def _convert_dtypes(
4360+
self: ABCSeries,
4361+
infer_objects: bool = True,
4362+
convert_string: bool = True,
4363+
convert_integer: bool = True,
4364+
convert_boolean: bool = True,
4365+
) -> "Series":
4366+
input_series = self
4367+
if infer_objects:
4368+
input_series = input_series.infer_objects()
4369+
if is_object_dtype(input_series):
4370+
input_series = input_series.copy()
4371+
4372+
if convert_string or convert_integer or convert_boolean:
4373+
inferred_dtype = convert_dtypes(
4374+
input_series._values, convert_string, convert_integer, convert_boolean
4375+
)
4376+
try:
4377+
result = input_series.astype(inferred_dtype)
4378+
except TypeError:
4379+
result = input_series.copy()
4380+
else:
4381+
result = input_series.copy()
4382+
return result
4383+
43554384
@Appender(generic._shared_docs["isna"] % _shared_doc_kwargs)
43564385
def isna(self):
43574386
return super().isna()

pandas/core/tools/datetimes.py

+1
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,7 @@ def to_datetime(
629629
--------
630630
DataFrame.astype : Cast argument to a specified dtype.
631631
to_timedelta : Convert argument to timedelta.
632+
convert_dtypes : Convert to the best possible dtypes supporting ``pd.NA``.
632633
633634
Examples
634635
--------

pandas/core/tools/numeric.py

+1
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def to_numeric(arg, errors="raise", downcast=None):
7070
to_datetime : Convert argument to datetime.
7171
to_timedelta : Convert argument to timedelta.
7272
numpy.ndarray.astype : Cast a numpy array to a specified type.
73+
convert_dtypes : Convert to the best possible dtypes supporting ``pd.NA``.
7374
7475
Examples
7576
--------

0 commit comments

Comments
 (0)