diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 9c17a22bf6d52..65719a11243f8 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Pandas could not be built on PyPy (:issue:`42355`) - :class:`DataFrame` constructed with with an older version of pandas could not be unpickled (:issue:`42345`) +- Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42338`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index d39c5ac5d2967..5be50f16af003 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -51,6 +51,7 @@ def is_string_array(values: np.ndarray, skipna: bool = False): ... def is_float_array(values: np.ndarray, skipna: bool = False): ... def is_integer_array(values: np.ndarray, skipna: bool = False): ... def is_bool_array(values: np.ndarray, skipna: bool = False): ... +def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ... def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c48b8ed5aec9c..4ab2497be94d5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2955,6 +2955,28 @@ def to_object_array_tuples(rows: object) -> np.ndarray: return result +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray: + cdef: + Py_ssize_t i, n = len(keys) + object val + ndarray[object] output = np.empty(n, dtype='O') + + if n == 0: + # kludge, for Series + return np.empty(0, dtype='f8') + + for i in range(n): + val = keys[i] + if val in mapping: + output[i] = mapping[val] + else: + output[i] = default + + return maybe_convert_objects(output) + + def is_bool_list(obj: list) -> bool: """ Check if this list contains only bool or np.bool_ objects. diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9f0fefc9a10d0..15a18d5027274 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -781,6 +781,21 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, return dtype, val +def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]: + """ + Convert datetimelike-keyed dicts to a Timestamp-keyed dict. + + Parameters + ---------- + d: dict-like object + + Returns + ------- + dict + """ + return {maybe_box_datetimelike(key): value for key, value in d.items()} + + def infer_dtype_from_array( arr, pandas_dtype: bool = False ) -> tuple[DtypeObj, ArrayLike]: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 7bef7ae9b39d7..5e327adfb8905 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, + dict_compat, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -59,7 +60,6 @@ TimedeltaArray, ) from pandas.core.construction import ( - create_series_with_explicit_dtype, ensure_wrapped_if_datetimelike, extract_array, range_to_ndarray, @@ -67,7 +67,9 @@ ) from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( + DatetimeIndex, Index, + TimedeltaIndex, ensure_index, get_objs_combined_axis, union_indexes, @@ -556,6 +558,7 @@ def convert(v): def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: + oindex = None homogenized = [] for val in data: @@ -570,9 +573,18 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: val = val._values else: if isinstance(val, dict): - # see test_constructor_subclass_dict - # test_constructor_dict_datetime64_index - val = create_series_with_explicit_dtype(val, index=index)._values + # GH#41785 this _should_ be equivalent to (but faster than) + # val = create_series_with_explicit_dtype(val, index=index)._values + if oindex is None: + oindex = index.astype("O") + + if isinstance(index, (DatetimeIndex, TimedeltaIndex)): + # see test_constructor_dict_datetime64_index + val = dict_compat(val) + else: + # see test_constructor_subclass_dict + val = dict(val) + val = lib.fast_multiget(val, oindex._values, default=np.nan) val = sanitize_array( val, index, dtype=dtype, copy=False, raise_cast_failure=False diff --git a/pandas/tests/dtypes/cast/test_dict_compat.py b/pandas/tests/dtypes/cast/test_dict_compat.py new file mode 100644 index 0000000000000..13dc82d779f95 --- /dev/null +++ b/pandas/tests/dtypes/cast/test_dict_compat.py @@ -0,0 +1,14 @@ +import numpy as np + +from pandas.core.dtypes.cast import dict_compat + +from pandas import Timestamp + + +def test_dict_compat(): + data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} + data_unchanged = {1: 2, 3: 4, 5: 6} + expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} + assert dict_compat(data_datetime64) == expected + assert dict_compat(expected) == expected + assert dict_compat(data_unchanged) == data_unchanged