BUG: to_dict not converting NA to None (#50796)

phofl · web-flow · commit b888dadc581a · 2023-01-18T09:41:22.000+01:00
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -1020,6 +1020,7 @@ I/O
 - Bug in :meth:`DataFrame.to_string` ignoring float formatter for extension arrays (:issue:`39336`)
 - Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
 - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
+- Bug in :meth:`DataFrame.to_dict` not converting ``NA`` to ``None`` (:issue:`50795`)
 - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
 
 Period
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -20,6 +20,10 @@
 import numpy as np
 
 from pandas._libs import lib
+from pandas._libs.missing import (
+    NA,
+    NAType,
+)
 from pandas._libs.tslibs import (
     NaT,
     OutOfBoundsDatetime,
@@ -176,7 +180,7 @@ def maybe_box_datetimelike(value: Scalar, dtype: Dtype | None = None) -> Scalar:
     return value
 
 
-def maybe_box_native(value: Scalar) -> Scalar:
+def maybe_box_native(value: Scalar | None | NAType) -> Scalar | None | NAType:
     """
     If passed a scalar cast the scalar to a python native type.
 
@@ -202,6 +206,8 @@ def maybe_box_native(value: Scalar) -> Scalar:
         value = bool(value)
     elif isinstance(value, (np.datetime64, np.timedelta64)):
         value = maybe_box_datetimelike(value)
+    elif value is NA:
+        value = None
     return value
 
 
diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py
@@ -6,7 +6,10 @@
 from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.cast import maybe_box_native
-from pandas.core.dtypes.common import is_object_dtype
+from pandas.core.dtypes.common import (
+    is_extension_array_dtype,
+    is_object_dtype,
+)
 
 from pandas import DataFrame
 from pandas.core import common as com
@@ -88,16 +91,18 @@ def to_dict(
         # GH46470 Return quickly if orient series to avoid creating dtype objects
         return into_c((k, v) for k, v in df.items())
 
-    object_dtype_indices = [
-        i for i, col_dtype in enumerate(df.dtypes.values) if is_object_dtype(col_dtype)
+    box_native_indices = [
+        i
+        for i, col_dtype in enumerate(df.dtypes.values)
+        if is_object_dtype(col_dtype) or is_extension_array_dtype(col_dtype)
     ]
-    are_all_object_dtype_cols = len(object_dtype_indices) == len(df.dtypes)
+    are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)
 
     if orient == "dict":
         return into_c((k, v.to_dict(into)) for k, v in df.items())
 
     elif orient == "list":
-        object_dtype_indices_as_set = set(object_dtype_indices)
+        object_dtype_indices_as_set = set(box_native_indices)
         return into_c(
             (
                 k,
@@ -110,7 +115,7 @@ def to_dict(
 
     elif orient == "split":
         data = df._create_data_for_split_and_tight_to_dict(
-            are_all_object_dtype_cols, object_dtype_indices
+            are_all_object_dtype_cols, box_native_indices
         )
 
         return into_c(
@@ -123,7 +128,7 @@ def to_dict(
 
     elif orient == "tight":
         data = df._create_data_for_split_and_tight_to_dict(
-            are_all_object_dtype_cols, object_dtype_indices
+            are_all_object_dtype_cols, box_native_indices
         )
 
         return into_c(
@@ -155,8 +160,8 @@ def to_dict(
             data = [
                 into_c(zip(columns, t)) for t in df.itertuples(index=False, name=None)
             ]
-            if object_dtype_indices:
-                object_dtype_indices_as_set = set(object_dtype_indices)
+            if box_native_indices:
+                object_dtype_indices_as_set = set(box_native_indices)
                 object_dtype_cols = {
                     col
                     for i, col in enumerate(df.columns)
@@ -176,8 +181,8 @@ def to_dict(
                 (t[0], dict(zip(df.columns, map(maybe_box_native, t[1:]))))
                 for t in df.itertuples(name=None)
             )
-        elif object_dtype_indices:
-            object_dtype_indices_as_set = set(object_dtype_indices)
+        elif box_native_indices:
+            object_dtype_indices_as_set = set(box_native_indices)
             is_object_dtype_by_index = [
                 i in object_dtype_indices_as_set for i in range(len(df.columns))
             ]
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -89,6 +89,7 @@
 from pandas.core.dtypes.common import (
     ensure_platform_int,
     is_dict_like,
+    is_extension_array_dtype,
     is_integer,
     is_iterator,
     is_list_like,
@@ -1832,7 +1833,7 @@ def to_dict(self, into: type[dict] = dict) -> dict:
         # GH16122
         into_c = com.standardize_mapping(into)
 
-        if is_object_dtype(self):
+        if is_object_dtype(self) or is_extension_array_dtype(self):
             return into_c((k, maybe_box_native(v)) for k, v in self.items())
         else:
             # Not an object dtype => all types will be the same so let the default
diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py
@@ -9,6 +9,7 @@
 import pytz
 
 from pandas import (
+    NA,
     DataFrame,
     Index,
     MultiIndex,
@@ -458,3 +459,29 @@ def test_to_dict_index_false(self, orient, expected):
         df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
         result = df.to_dict(orient=orient, index=False)
         tm.assert_dict_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "orient, expected",
+        [
+            ("dict", {"a": {0: 1, 1: None}}),
+            ("list", {"a": [1, None]}),
+            ("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}),
+            (
+                "tight",
+                {
+                    "index": [0, 1],
+                    "columns": ["a"],
+                    "data": [[1], [None]],
+                    "index_names": [None],
+                    "column_names": [None],
+                },
+            ),
+            ("records", [{"a": 1}, {"a": None}]),
+            ("index", {0: {"a": 1}, 1: {"a": None}}),
+        ],
+    )
+    def test_to_dict_na_to_none(self, orient, expected):
+        # GH#50795 - NA in a nullable Int64 column must be returned as None
+        df = DataFrame({"a": [1, NA]}, dtype="Int64")
+        result = df.to_dict(orient=orient)
+        assert result == expected