Skip to content

BUG: to_dict not converting NA to None #50796

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 18, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,7 @@ I/O
- Bug in :meth:`DataFrame.to_string` ignoring float formatter for extension arrays (:issue:`39336`)
- Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
- Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
- Bug in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` not converting ``NA`` to ``None`` (:issue:`50795`)
- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)

Period
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import numpy as np

from pandas._libs import lib
from pandas._libs.missing import NA
from pandas._libs.tslibs import (
NaT,
OutOfBoundsDatetime,
Expand Down Expand Up @@ -202,6 +203,8 @@ def maybe_box_native(value: Scalar) -> Scalar:
value = bool(value)
elif isinstance(value, (np.datetime64, np.timedelta64)):
value = maybe_box_datetimelike(value)
elif value is NA:
value = None
return value


Expand Down
27 changes: 16 additions & 11 deletions pandas/core/methods/to_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import maybe_box_native
from pandas.core.dtypes.common import is_object_dtype
from pandas.core.dtypes.common import (
is_extension_array_dtype,
is_object_dtype,
)

from pandas import DataFrame
from pandas.core import common as com
Expand Down Expand Up @@ -88,16 +91,18 @@ def to_dict(
# GH46470 Return quickly if orient series to avoid creating dtype objects
return into_c((k, v) for k, v in df.items())

object_dtype_indices = [
i for i, col_dtype in enumerate(df.dtypes.values) if is_object_dtype(col_dtype)
box_native_indices = [
i
for i, col_dtype in enumerate(df.dtypes.values)
if is_object_dtype(col_dtype) or is_extension_array_dtype(col_dtype)
]
are_all_object_dtype_cols = len(object_dtype_indices) == len(df.dtypes)
are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)

if orient == "dict":
return into_c((k, v.to_dict(into)) for k, v in df.items())

elif orient == "list":
object_dtype_indices_as_set = set(object_dtype_indices)
object_dtype_indices_as_set = set(box_native_indices)
return into_c(
(
k,
Expand All @@ -110,7 +115,7 @@ def to_dict(

elif orient == "split":
data = df._create_data_for_split_and_tight_to_dict(
are_all_object_dtype_cols, object_dtype_indices
are_all_object_dtype_cols, box_native_indices
)

return into_c(
Expand All @@ -123,7 +128,7 @@ def to_dict(

elif orient == "tight":
data = df._create_data_for_split_and_tight_to_dict(
are_all_object_dtype_cols, object_dtype_indices
are_all_object_dtype_cols, box_native_indices
)

return into_c(
Expand Down Expand Up @@ -155,8 +160,8 @@ def to_dict(
data = [
into_c(zip(columns, t)) for t in df.itertuples(index=False, name=None)
]
if object_dtype_indices:
object_dtype_indices_as_set = set(object_dtype_indices)
if box_native_indices:
object_dtype_indices_as_set = set(box_native_indices)
object_dtype_cols = {
col
for i, col in enumerate(df.columns)
Expand All @@ -176,8 +181,8 @@ def to_dict(
(t[0], dict(zip(df.columns, map(maybe_box_native, t[1:]))))
for t in df.itertuples(name=None)
)
elif object_dtype_indices:
object_dtype_indices_as_set = set(object_dtype_indices)
elif box_native_indices:
object_dtype_indices_as_set = set(box_native_indices)
is_object_dtype_by_index = [
i in object_dtype_indices_as_set for i in range(len(df.columns))
]
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
from pandas.core.dtypes.common import (
ensure_platform_int,
is_dict_like,
is_extension_array_dtype,
is_integer,
is_iterator,
is_list_like,
Expand Down Expand Up @@ -1832,7 +1833,7 @@ def to_dict(self, into: type[dict] = dict) -> dict:
# GH16122
into_c = com.standardize_mapping(into)

if is_object_dtype(self):
if is_object_dtype(self) or is_extension_array_dtype(self):
return into_c((k, maybe_box_native(v)) for k, v in self.items())
else:
# Not an object or extension dtype => all types will be the same so let the default
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/frame/methods/test_to_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pytz

from pandas import (
NA,
DataFrame,
Index,
MultiIndex,
Expand Down Expand Up @@ -458,3 +459,29 @@ def test_to_dict_index_false(self, orient, expected):
df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
result = df.to_dict(orient=orient, index=False)
tm.assert_dict_equal(result, expected)

# Regression test: for every ``orient`` listed below, the NA entry of the
# column must appear as ``None`` in the structure returned by ``to_dict``.
@pytest.mark.parametrize(
"orient, expected",
[
("dict", {"a": {0: 1, 1: None}}),
("list", {"a": [1, None]}),
("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}),
(
"tight",
{
"index": [0, 1],
"columns": ["a"],
"data": [[1], [None]],
"index_names": [None],
"column_names": [None],
},
),
("records", [{"a": 1}, {"a": None}]),
("index", {0: {"a": 1}, 1: {"a": None}}),
],
)
def test_to_dict_na_to_none(self, orient, expected):
# GH#50795
# Single "Int64" column holding [1, NA]; the expected values above carry
# None in the position where the frame holds NA.
df = DataFrame({"a": [1, NA]}, dtype="Int64")
result = df.to_dict(orient=orient)
# Plain equality is enough here: the expected structures contain only
# built-in Python containers, ints and None.
assert result == expected