From 6d44e556d4144f8bc1b20197cb2a7550d8f8d8e0 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Sat, 11 Apr 2020 19:36:23 +0300 Subject: [PATCH 01/12] PERF: Cythonize `from_nested_dict` --- pandas/_libs/lib.pyx | 11 +++++++++++ pandas/core/frame.py | 11 +---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6147d6d9c1658..79d580e5c47b5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,3 +1,4 @@ +import collections from collections import abc from decimal import Decimal from fractions import Fraction @@ -2526,3 +2527,13 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): output[i] = default return maybe_convert_objects(output) + +def from_nested_dict(object data) -> object: + cdef: + object new_data = collections.defaultdict(dict) + object index, column, value, nested_dict + + for index, nested_dict in data.items(): + for column, value in nested_dict.items(): + new_data[column][index] = value + return new_data diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d19f1a263f71a..e64ced2101c5c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1266,7 +1266,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra if len(data) > 0: # TODO speed up Series case if isinstance(list(data.values())[0], (Series, dict)): - data = _from_nested_dict(data) + data = lib.from_nested_dict(data) else: data, index = list(data.values()), list(data.keys()) elif orient == "columns": @@ -8817,12 +8817,3 @@ def isin(self, values) -> "DataFrame": ops.add_flex_arithmetic_methods(DataFrame) ops.add_special_arithmetic_methods(DataFrame) - - -def _from_nested_dict(data): - # TODO: this should be seriously cythonized - new_data = collections.defaultdict(dict) - for index, s in data.items(): - for col, v in s.items(): - new_data[col][index] = v - return new_data From 61c841dd5c6722d02960c52ea06f19792760d38c Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Sat, 11 Apr 2020 23:40:16 +0300 Subject: [PATCH 02/12] List issues --- pandas/_libs/lib.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 79d580e5c47b5..25d74da30abfd 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2528,6 +2528,7 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): return maybe_convert_objects(output) + def from_nested_dict(object data) -> object: cdef: object new_data = collections.defaultdict(dict) From 1c7ffbbe52eee705c61167df0f003e89b381b596 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Sat, 11 Apr 2020 23:40:44 +0300 Subject: [PATCH 03/12] Added wrappers --- pandas/_libs/lib.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 25d74da30abfd..73c6d5fc80fd4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2529,6 +2529,8 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): return maybe_convert_objects(output) +@cython.wraparound(False) +@cython.boundscheck(False) def from_nested_dict(object data) -> object: cdef: object new_data = collections.defaultdict(dict) From 94977b2b8f426ec65a4c6b9eea62d60ce4f624cf Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Sun, 12 Apr 2020 00:34:07 +0300 Subject: [PATCH 04/12] Converting the object dict inside the function xref: https://github.com/pandas-dev/pandas/pull/33485#issuecomment-612492274 --- pandas/_libs/lib.pyx | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 73c6d5fc80fd4..d480979f96d44 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2531,12 +2531,17 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): @cython.wraparound(False) @cython.boundscheck(False) -def from_nested_dict(object data) -> object: +def from_nested_dict(object data) -> dict: cdef: object new_data = collections.defaultdict(dict) - object index, column, value, nested_dict + object index, column, value, dict_iterator + dict data_dct, nested_dict - for index, nested_dict in data.items(): + data_dct = dict(data) + + for index, dict_iterator in data_dct.items(): + nested_dict = dict(dict_iterator) for column, value in nested_dict.items(): new_data[column][index] = value - return new_data + + return dict(new_data) From bcb25b91f8542b26475ea82482d82810e1b2c353 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Sun, 12 Apr 2020 01:41:36 +0300 Subject: [PATCH 05/12] Avoiding expensive call ref: https://github.com/pandas-dev/pandas/pull/33485#discussion_r407115994 --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e64ced2101c5c..e314235967c0e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1265,7 +1265,8 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra if orient == "index": if len(data) > 0: # TODO speed up Series case - if isinstance(list(data.values())[0], (Series, dict)): + first_val = next(iter((data.values())), None) + if isinstance(first_val, (Series, dict)): data = lib.from_nested_dict(data) else: data, index = list(data.values()), list(data.keys()) From 7dfbca8aa58fbed63d72bdbaeeaf88a6cbb0fb4a Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Mon, 13 Apr 2020 12:29:52 +0300 Subject: [PATCH 06/12] Converting the data to builtin dict, in the python space xref: https://github.com/pandas-dev/pandas/pull/33485#discussion_r407116593 --- pandas/_libs/lib.pyx | 2 +- pandas/core/frame.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d480979f96d44..07baae8787b35 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2531,7 +2531,7 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): @cython.wraparound(False) @cython.boundscheck(False) -def from_nested_dict(object data) -> dict: +def from_nested_dict(dict data) -> dict: cdef: object new_data = collections.defaultdict(dict) object index, column, value, dict_iterator diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e314235967c0e..3add94c891a1e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1267,6 +1267,10 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra # TODO speed up Series case first_val = next(iter((data.values())), None) if isinstance(first_val, (Series, dict)): + # If we are dealing with not a builtin dict, + # `collections.defaultdict` for example, we need to convert it + # to a regular dict so Cython will not raise. + data = dict(data) if not type(data) is dict else data data = lib.from_nested_dict(data) else: data, index = list(data.values()), list(data.keys()) From c8e515f51c66bcf0577ce9156ad51b9c57e3381f Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Mon, 13 Apr 2020 12:33:54 +0300 Subject: [PATCH 07/12] Going less offen to the python space xref: https://github.com/pandas-dev/pandas/pull/33485#discussion_r407116876 --- pandas/_libs/lib.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 07baae8787b35..a8cb6bb4bf807 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,4 +1,3 @@ -import collections from collections import abc from decimal import Decimal from fractions import Fraction @@ -2533,7 +2532,7 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): @cython.boundscheck(False) def from_nested_dict(dict data) -> dict: cdef: - object new_data = collections.defaultdict(dict) + dict new_data = {} object index, column, value, dict_iterator dict data_dct, nested_dict @@ -2542,6 +2541,9 @@ def from_nested_dict(dict data) -> dict: for index, dict_iterator in data_dct.items(): nested_dict = dict(dict_iterator) for column, value in nested_dict.items(): - new_data[column][index] = value + if column in new_data: + new_data[column].update(dict([(index, value)])) + else: + new_data.setdefault(column, dict([(index, value)])) - return dict(new_data) + return new_data From 4829f78068d0186612a7b2e3fe9600d1b0bed3b4 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Mon, 13 Apr 2020 12:34:57 +0300 Subject: [PATCH 08/12] Got rid of unneeded vars --- pandas/_libs/lib.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a8cb6bb4bf807..7b5d974347b68 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2534,11 +2534,9 @@ def from_nested_dict(dict data) -> dict: cdef: dict new_data = {} object index, column, value, dict_iterator - dict data_dct, nested_dict + dict nested_dict - data_dct = dict(data) - - for index, dict_iterator in data_dct.items(): + for index, dict_iterator in data.items(): nested_dict = dict(dict_iterator) for column, value in nested_dict.items(): if column in new_data: From 5faa02b305991ea199977f48587ce99b08d37acb Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Mon, 13 Apr 2020 12:38:01 +0300 Subject: [PATCH 09/12] Better perf if we have nested series xref: https://github.com/pandas-dev/pandas/pull/33485#discussion_r407117156 --- pandas/_libs/lib.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7b5d974347b68..ed7219a4f5cc6 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2533,12 +2533,10 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): def from_nested_dict(dict data) -> dict: cdef: dict new_data = {} - object index, column, value, dict_iterator - dict nested_dict + object index, column, value, dict_or_series - for index, dict_iterator in data.items(): - nested_dict = dict(dict_iterator) - for column, value in nested_dict.items(): + for index, dict_or_series in data.items(): + for column, value in dict_or_series.items(): if column in new_data: new_data[column].update(dict([(index, value)])) else: From d349b4f728f7a6420667d30d1ba7a437e8b870b2 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Mon, 13 Apr 2020 13:09:17 +0300 Subject: [PATCH 10/12] Assigning the new value into a variable --- pandas/_libs/lib.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index ed7219a4f5cc6..7c874a3de3118 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2532,14 +2532,16 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): @cython.boundscheck(False) def from_nested_dict(dict data) -> dict: cdef: - dict new_data = {} object index, column, value, dict_or_series + dict new_data = {} + dict new_value for index, dict_or_series in data.items(): for column, value in dict_or_series.items(): + new_value = dict([(index, value)]) if column in new_data: - new_data[column].update(dict([(index, value)])) + new_data[column].update(new_value) else: - new_data.setdefault(column, dict([(index, value)])) + new_data.setdefault(column, new_value) return new_data From 4f8afeda4c8917195886202491c37788de168fed Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Mon, 13 Apr 2020 14:58:49 +0300 Subject: [PATCH 11/12] Remove the `if` statement --- pandas/_libs/lib.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7c874a3de3118..58fd0116d813c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2539,9 +2539,7 @@ def from_nested_dict(dict data) -> dict: for index, dict_or_series in data.items(): for column, value in dict_or_series.items(): new_value = dict([(index, value)]) - if column in new_data: - new_data[column].update(new_value) - else: - new_data.setdefault(column, new_value) + new_data.setdefault(column, new_value) + new_data[column].update(new_value) return new_data From e22fbda50873d2f75056a355193941b9b2185ca3 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Mon, 13 Apr 2020 19:30:54 +0300 Subject: [PATCH 12/12] Suggestion by @topper-123 xref: https://github.com/pandas-dev/pandas/pull/33485#discussion_r407515079 --- pandas/_libs/lib.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 58fd0116d813c..77caaa87d9b98 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2534,12 +2534,12 @@ def from_nested_dict(dict data) -> dict: cdef: object index, column, value, dict_or_series dict new_data = {} - dict new_value for index, dict_or_series in data.items(): for column, value in dict_or_series.items(): - new_value = dict([(index, value)]) - new_data.setdefault(column, new_value) - new_data[column].update(new_value) + if column in new_data: + new_data[column][index] = value + else: + new_data[column] = {index: value} return new_data