pandas/tests/frame/methods/test_to_dict.py

from collections import (
    OrderedDict,
    defaultdict,
)
from datetime import datetime

import numpy as np
import pytest
import pytz

from pandas import (
    NA,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    Timestamp,
)
import pandas._testing as tm


class TestDataFrameToDict:
    def test_to_dict_timestamp(self):

        # GH#11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only

        tsmp = Timestamp("20130101")
        test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
        test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})

        expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
        expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]

        assert test_data.to_dict(orient="records") == expected_records
        assert test_data_mixed.to_dict(orient="records") == expected_records_mixed

        expected_series = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([tsmp, tsmp], name="B"),
        }
        expected_series_mixed = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([1, 2], name="B"),
        }

        tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series)
        tm.assert_dict_equal(
            test_data_mixed.to_dict(orient="series"), expected_series_mixed
        )

        expected_split = {
            "index": [0, 1],
            "data": [[tsmp, tsmp], [tsmp, tsmp]],
            "columns": ["A", "B"],
        }
        expected_split_mixed = {
            "index": [0, 1],
            "data": [[tsmp, 1], [tsmp, 2]],
            "columns": ["A", "B"],
        }

        tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
        tm.assert_dict_equal(
            test_data_mixed.to_dict(orient="split"), expected_split_mixed
        )

    def test_to_dict_index_not_unique_with_index_orient(self):
        # GH#22801
        # Data loss when indexes are not unique. Raise ValueError.
        df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
        msg = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="index")

    def test_to_dict_invalid_orient(self):
        df = DataFrame({"A": [0, 1]})
        msg = "orient 'xinvalid' not understood"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="xinvalid")

    @pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"])
    def test_to_dict_short_orient_raises(self, orient):
        # GH#32515
        df = DataFrame({"A": [0, 1]})
        with pytest.raises(ValueError, match="not understood"):
            df.to_dict(orient=orient)

    @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        # orient= should only take the listed options
        # see GH#32515
        test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}

        # GH#16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("list", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][int(k2) - 1]

        recons_data = DataFrame(test_data).to_dict("series", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("split", mapping)
        expected_split = {
            "columns": ["A", "B"],
            "index": ["1", "2", "3"],
            "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
        }
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("records", mapping)
        expected_records = [
            {"A": 1.0, "B": "1"},
            {"A": 2.0, "B": "2"},
            {"A": np.nan, "B": "3"},
        ]
        assert isinstance(recons_data, list)
        assert len(recons_data) == 3
        for left, right in zip(recons_data, expected_records):
            tm.assert_dict_equal(left, right)

        # GH#10844
        recons_data = DataFrame(test_data).to_dict("index")

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

        df = DataFrame(test_data)
        df["duped"] = df[df.columns[0]]
        recons_data = df.to_dict("index")
        comp_data = test_data.copy()
        comp_data["duped"] = comp_data[df.columns[0]]
        for k, v in comp_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

    @pytest.mark.parametrize("mapping", [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        # GH#16122
        df = DataFrame(np.random.randn(3, 3))
        msg = "|".join(
            [
                "unsupported type: <class 'list'>",
                r"to_dict\(\) only accepts initialized defaultdicts",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            df.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        # GH#16927: When converting to a dict, if a column has a non-unique name
        # it will be dropped, throwing a warning.
        df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
        with tm.assert_produces_warning(UserWarning):
            df.to_dict()

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize(
        "orient,item_getter",
        [
            ("dict", lambda d, col, idx: d[col][idx]),
            ("records", lambda d, col, idx: d[idx][col]),
            ("list", lambda d, col, idx: d[col][idx]),
            ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
            ("index", lambda d, col, idx: d[idx][col]),
        ],
    )
    def test_to_dict_box_scalars(self, orient, item_getter):
        # GH#14216, GH#23753
        # make sure that we are boxing properly
        df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
        result = df.to_dict(orient=orient)
        assert isinstance(item_getter(result, "a", 0), int)
        assert isinstance(item_getter(result, "b", 0), float)

    def test_to_dict_tz(self):
        # GH#18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        data = [
            (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
            (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),),
        ]
        df = DataFrame(list(data), columns=["d"])

        result = df.to_dict(orient="records")
        expected = [
            {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)},
            {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)},
        ]
        tm.assert_dict_equal(result[0], expected[0])
        tm.assert_dict_equal(result[1], expected[1])

    @pytest.mark.parametrize(
        "into, expected",
        [
            (
                dict,
                {
                    0: {"int_col": 1, "float_col": 1.0},
                    1: {"int_col": 2, "float_col": 2.0},
                    2: {"int_col": 3, "float_col": 3.0},
                },
            ),
            (
                OrderedDict,
                OrderedDict(
                    [
                        (0, {"int_col": 1, "float_col": 1.0}),
                        (1, {"int_col": 2, "float_col": 2.0}),
                        (2, {"int_col": 3, "float_col": 3.0}),
                    ]
                ),
            ),
            (
                defaultdict(dict),
                defaultdict(
                    dict,
                    {
                        0: {"int_col": 1, "float_col": 1.0},
                        1: {"int_col": 2, "float_col": 2.0},
                        2: {"int_col": 3, "float_col": 3.0},
                    },
                ),
            ),
        ],
    )
    def test_to_dict_index_dtypes(self, into, expected):
        # GH#18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float

        df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})

        result = df.to_dict(orient="index", into=into)
        cols = ["int_col", "float_col"]
        result = DataFrame.from_dict(result, orient="index")[cols]
        expected = DataFrame.from_dict(expected, orient="index")[cols]
        tm.assert_frame_equal(result, expected)

    def test_to_dict_numeric_names(self):
        # GH#24940
        df = DataFrame({str(i): [i] for i in range(5)})
        result = set(df.to_dict("records")[0].keys())
        expected = set(df.columns)
        assert result == expected

    def test_to_dict_wide(self):
        # GH#24939
        df = DataFrame({(f"A_{i:d}"): [i] for i in range(256)})
        result = df.to_dict("records")[0]
        expected = {f"A_{i:d}": i for i in range(256)}
        assert result == expected

    @pytest.mark.parametrize(
        "data,dtype",
        (
            ([True, True, False], bool),
            [
                [
                    datetime(2018, 1, 1),
                    datetime(2019, 2, 2),
                    datetime(2020, 3, 3),
                ],
                Timestamp,
            ],
            [[1.0, 2.0, 3.0], float],
            [[1, 2, 3], int],
            [["X", "Y", "Z"], str],
        ),
    )
    def test_to_dict_orient_dtype(self, data, dtype):
        # GH22620 & GH21256

        df = DataFrame({"a": data})
        d = df.to_dict(orient="records")
        assert all(type(record["a"]) is dtype for record in d)

    @pytest.mark.parametrize(
        "data,expected_dtype",
        (
            [np.uint64(2), int],
            [np.int64(-9), int],
            [np.float64(1.1), float],
            [np.bool_(True), bool],
            [np.datetime64("2005-02-25"), Timestamp],
        ),
    )
    def test_to_dict_scalar_constructor_orient_dtype(self, data, expected_dtype):
        # GH22620 & GH21256

        df = DataFrame({"a": data}, index=[0])
        d = df.to_dict(orient="records")
        result = type(d[0]["a"])
        assert result is expected_dtype

    def test_to_dict_mixed_numeric_frame(self):
        # GH 12859
        df = DataFrame({"a": [1.0], "b": [9.0]})
        result = df.reset_index().to_dict("records")
        expected = [{"index": 0, "a": 1.0, "b": 9.0}]
        assert result == expected

    @pytest.mark.parametrize(
        "index",
        [
            None,
            Index(["aa", "bb"]),
            Index(["aa", "bb"], name="cc"),
            MultiIndex.from_tuples([("a", "b"), ("a", "c")]),
            MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]),
        ],
    )
    @pytest.mark.parametrize(
        "columns",
        [
            ["x", "y"],
            Index(["x", "y"]),
            Index(["x", "y"], name="z"),
            MultiIndex.from_tuples([("x", 1), ("y", 2)]),
            MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]),
        ],
    )
    def test_to_dict_orient_tight(self, index, columns):
        df = DataFrame.from_records(
            [[1, 3], [2, 4]],
            columns=columns,
            index=index,
        )
        roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight")

        tm.assert_frame_equal(df, roundtrip)

    @pytest.mark.parametrize(
        "orient",
        ["dict", "list", "split", "records", "index", "tight"],
    )
    @pytest.mark.parametrize(
        "data,expected_types",
        (
            (
                {
                    "a": [np.int64(1), 1, np.int64(3)],
                    "b": [np.float64(1.0), 2.0, np.float64(3.0)],
                    "c": [np.float64(1.0), 2, np.int64(3)],
                    "d": [np.float64(1.0), "a", np.int64(3)],
                    "e": [np.float64(1.0), ["a"], np.int64(3)],
                    "f": [np.float64(1.0), ("a",), np.int64(3)],
                },
                {
                    "a": [int, int, int],
                    "b": [float, float, float],
                    "c": [float, float, float],
                    "d": [float, str, int],
                    "e": [float, list, int],
                    "f": [float, tuple, int],
                },
            ),
            (
                {
                    "a": [1, 2, 3],
                    "b": [1.1, 2.2, 3.3],
                },
                {
                    "a": [int, int, int],
                    "b": [float, float, float],
                },
            ),
            (  # Make sure we have one df which is all object type cols
                {
                    "a": [1, "hello", 3],
                    "b": [1.1, "world", 3.3],
                },
                {
                    "a": [int, str, int],
                    "b": [float, str, float],
                },
            ),
        ),
    )
    def test_to_dict_returns_native_types(self, orient, data, expected_types):
        # GH 46751
        # Tests we get back native types for all orient types
        df = DataFrame(data)
        result = df.to_dict(orient)
        if orient == "dict":
            assertion_iterator = (
                (i, key, value)
                for key, index_value_map in result.items()
                for i, value in index_value_map.items()
            )
        elif orient == "list":
            assertion_iterator = (
                (i, key, value)
                for key, values in result.items()
                for i, value in enumerate(values)
            )
        elif orient in {"split", "tight"}:
            assertion_iterator = (
                (i, key, result["data"][i][j])
                for i in result["index"]
                for j, key in enumerate(result["columns"])
            )
        elif orient == "records":
            assertion_iterator = (
                (i, key, value)
                for i, record in enumerate(result)
                for key, value in record.items()
            )
        elif orient == "index":
            assertion_iterator = (
                (i, key, value)
                for i, record in result.items()
                for key, value in record.items()
            )

        for i, key, value in assertion_iterator:
            assert value == data[key][i]
            assert type(value) is expected_types[key][i]

    @pytest.mark.parametrize("orient", ["dict", "list", "series", "records", "index"])
    def test_to_dict_index_false_error(self, orient):
        # GH#46398
        df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
        msg = "'index=False' is only valid when 'orient' is 'split' or 'tight'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient=orient, index=False)

    @pytest.mark.parametrize(
        "orient, expected",
        [
            ("split", {"columns": ["col1", "col2"], "data": [[1, 3], [2, 4]]}),
            (
                "tight",
                {
                    "columns": ["col1", "col2"],
                    "data": [[1, 3], [2, 4]],
                    "column_names": [None],
                },
            ),
        ],
    )
    def test_to_dict_index_false(self, orient, expected):
        # GH#46398
        df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
        result = df.to_dict(orient=orient, index=False)
        tm.assert_dict_equal(result, expected)

    @pytest.mark.parametrize(
        "orient, expected",
        [
            ("dict", {"a": {0: 1, 1: None}}),
            ("list", {"a": [1, None]}),
            ("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}),
            (
                "tight",
                {
                    "index": [0, 1],
                    "columns": ["a"],
                    "data": [[1], [None]],
                    "index_names": [None],
                    "column_names": [None],
                },
            ),
            ("records", [{"a": 1}, {"a": None}]),
            ("index", {0: {"a": 1}, 1: {"a": None}}),
        ],
    )
    def test_to_dict_na_to_none(self, orient, expected):
        # GH#50795
        df = DataFrame({"a": [1, NA]}, dtype="Int64")
        result = df.to_dict(orient=orient)
        assert result == expected

    def test_to_dict_masked_native_python(self):
        # GH#34665
        df = DataFrame({"a": Series([1, 2], dtype="Int64"), "B": 1})
        result = df.to_dict(orient="records")
        assert type(result[0]["a"]) is int

        df = DataFrame({"a": Series([1, NA], dtype="Int64"), "B": 1})
        result = df.to_dict(orient="records")
        assert type(result[0]["a"]) is int