Skip to content

Commit d48ea59

Browse files
jbrockmendel and meeseeksmachine
authored and committed
Backport PR pandas-dev#42338: PERF/REGR: revert pandas-dev#41785
1 parent ebccc14 commit d48ea59

File tree

6 files changed

+69
-4
lines changed

6 files changed

+69
-4
lines changed

doc/source/whatsnew/v1.3.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Pandas could not be built on PyPy (:issue:`42355`)
1818
- :class:`DataFrame` constructed with an older version of pandas could not be unpickled (:issue:`42345`)
19+
- Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42338`)
1920
-
2021

2122
.. ---------------------------------------------------------------------------

pandas/_libs/lib.pyi

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def is_string_array(values: np.ndarray, skipna: bool = False): ...
5151
def is_float_array(values: np.ndarray, skipna: bool = False): ...
5252
def is_integer_array(values: np.ndarray, skipna: bool = False): ...
5353
def is_bool_array(values: np.ndarray, skipna: bool = False): ...
54+
def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ...
5455
def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ...
5556
def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ...
5657
def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ...

pandas/_libs/lib.pyx

+22
Original file line numberDiff line numberDiff line change
@@ -2979,6 +2979,28 @@ def to_object_array_tuples(rows: object) -> np.ndarray:
29792979
return result
29802980

29812981

2982+
@cython.wraparound(False)
@cython.boundscheck(False)
def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray:
    """
    Look up each element of ``keys`` in ``mapping``.

    Parameters
    ----------
    mapping : dict
        Dictionary to read values from.
    keys : ndarray
        Keys to look up, in order.
    default : object, default np.nan
        Value stored for any key that is not present in ``mapping``.

    Returns
    -------
    np.ndarray
        An empty float64 array when ``keys`` is empty; otherwise the
        object array of looked-up values after it has been passed through
        ``maybe_convert_objects``.
    """
    cdef:
        Py_ssize_t pos, count = len(keys)
        object key
        ndarray[object] looked_up = np.empty(count, dtype='O')

    if count == 0:
        # kludge, for Series
        return np.empty(0, dtype='f8')

    for pos in range(count):
        key = keys[pos]
        # Membership is tested first so a missing key never raises.
        looked_up[pos] = mapping[key] if key in mapping else default

    return maybe_convert_objects(looked_up)
3002+
3003+
29823004
def is_bool_list(obj: list) -> bool:
29833005
"""
29843006
Check if this list contains only bool or np.bool_ objects.

pandas/core/dtypes/cast.py

+15
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,21 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj,
781781
return dtype, val
782782

783783

784+
def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
    """
    Build a new dict whose keys have been boxed as datetime-likes.

    Parameters
    ----------
    d : dict-like object
        Mapping whose keys are each passed through
        ``maybe_box_datetimelike``; the values are carried over unchanged.

    Returns
    -------
    dict
        A new dict mapping each converted key to its original value.
    """
    boxed = {}
    for key, value in d.items():
        boxed[maybe_box_datetimelike(key)] = value
    return boxed
797+
798+
784799
def infer_dtype_from_array(
785800
arr, pandas_dtype: bool = False
786801
) -> tuple[DtypeObj, ArrayLike]:

pandas/core/internals/construction.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
from pandas.core.dtypes.cast import (
2828
construct_1d_arraylike_from_scalar,
29+
dict_compat,
2930
maybe_cast_to_datetime,
3031
maybe_convert_platform,
3132
maybe_infer_to_datetimelike,
@@ -59,15 +60,16 @@
5960
TimedeltaArray,
6061
)
6162
from pandas.core.construction import (
62-
create_series_with_explicit_dtype,
6363
ensure_wrapped_if_datetimelike,
6464
extract_array,
6565
range_to_ndarray,
6666
sanitize_array,
6767
)
6868
from pandas.core.indexes import base as ibase
6969
from pandas.core.indexes.api import (
70+
DatetimeIndex,
7071
Index,
72+
TimedeltaIndex,
7173
ensure_index,
7274
get_objs_combined_axis,
7375
union_indexes,
@@ -556,6 +558,7 @@ def convert(v):
556558

557559

558560
def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
561+
oindex = None
559562
homogenized = []
560563

561564
for val in data:
@@ -570,9 +573,18 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
570573
val = val._values
571574
else:
572575
if isinstance(val, dict):
573-
# see test_constructor_subclass_dict
574-
# test_constructor_dict_datetime64_index
575-
val = create_series_with_explicit_dtype(val, index=index)._values
576+
# GH#41785 this _should_ be equivalent to (but faster than)
577+
# val = create_series_with_explicit_dtype(val, index=index)._values
578+
if oindex is None:
579+
oindex = index.astype("O")
580+
581+
if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
582+
# see test_constructor_dict_datetime64_index
583+
val = dict_compat(val)
584+
else:
585+
# see test_constructor_subclass_dict
586+
val = dict(val)
587+
val = lib.fast_multiget(val, oindex._values, default=np.nan)
576588

577589
val = sanitize_array(
578590
val, index, dtype=dtype, copy=False, raise_cast_failure=False
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import numpy as np
2+
3+
from pandas.core.dtypes.cast import dict_compat
4+
5+
from pandas import Timestamp
6+
7+
8+
def test_dict_compat():
    # Each case pairs an input dict with the dict that dict_compat is
    # expected to return for it.
    timestamp_keyed = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2}
    datetime64_keyed = {
        np.datetime64("1990-03-15"): 1,
        np.datetime64("2015-03-15"): 2,
    }
    int_keyed = {1: 2, 3: 4, 5: 6}

    cases = [
        # np.datetime64 keys come back as Timestamp keys
        (datetime64_keyed, timestamp_keyed),
        # Timestamp keys are left as they are
        (timestamp_keyed, timestamp_keyed),
        # non-datetime keys are left as they are
        (int_keyed, int_keyed),
    ]
    for given, wanted in cases:
        assert dict_compat(given) == wanted

0 commit comments

Comments
 (0)