From b5e5b28c95a98eb7c821de6db7e2b9d4940370e5 Mon Sep 17 00:00:00 2001 From: Rohith295 Date: Tue, 23 Jun 2020 11:42:09 +0200 Subject: [PATCH 1/8] Changed the way we are generating tuple of keys/values to increase the performance --- pandas/core/series.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index cab8dd133b579..9497b31231f97 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -354,8 +354,11 @@ def _init_dict(self, data, index=None, dtype=None): # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] # raises KeyError), so we iterate the entire dict, and align if data: - keys, values = zip(*data.items()) - values = list(values) + # GH:34717 + # Using the below way to generate tuple of keys and values increasing the performance by 50%, instead of zip + keys = tuple(data.keys()) + values = tuple([data[key] for key in keys]) + elif index is not None: # fastpath for Series(data=None). Just use broadcasting a scalar # instead of reindexing. 
From c6956af484d25ac1ef5dc9358fb7cd2c8794029b Mon Sep 17 00:00:00 2001 From: Rohith295 Date: Tue, 23 Jun 2020 12:03:00 +0200 Subject: [PATCH 2/8] Changed the way of generating the tuple --- pandas/core/series.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9497b31231f97..8a2dca8665576 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -355,9 +355,12 @@ def _init_dict(self, data, index=None, dtype=None): # raises KeyError), so we iterate the entire dict, and align if data: # GH:34717 - # Using the below way to generate tuple of keys and values increasing the performance by 50%, instead of zip + # Using the below way to generate tuple of keys and values + # increasing the performance by 50%, instead of zip keys = tuple(data.keys()) - values = tuple([data[key] for key in keys]) + values = ( + *[data[key] for key in keys], + ) # Generating tuple of values- faster way elif index is not None: # fastpath for Series(data=None). 
Just use broadcasting a scalar From 510bce602f639e2bab8f71b97f45a11aebd09e39 Mon Sep 17 00:00:00 2001 From: Rohith295 Date: Tue, 23 Jun 2020 13:15:27 +0200 Subject: [PATCH 3/8] Fixing the failing type annotation checks and also the code comments --- pandas/core/series.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8a2dca8665576..bebe3907dd6d8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -355,12 +355,12 @@ def _init_dict(self, data, index=None, dtype=None): # raises KeyError), so we iterate the entire dict, and align if data: # GH:34717 - # Using the below way to generate tuple of keys and values + # Using the below way for generating keys and values # increasing the performance by 50%, instead of zip keys = tuple(data.keys()) - values = ( - *[data[key] for key in keys], - ) # Generating tuple of values- faster way + values = [ + data[key] for key in keys + ] # Generating list of values- faster way elif index is not None: # fastpath for Series(data=None). Just use broadcasting a scalar From 4655d7b03f9c87cd32deb5a7d0191b0c1abb0caf Mon Sep 17 00:00:00 2001 From: Rohith295 Date: Tue, 23 Jun 2020 15:38:49 +0200 Subject: [PATCH 4/8] Fixed mypy static type analysis issue --- pandas/core/series.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index bebe3907dd6d8..a83a2e3c98431 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -358,17 +358,15 @@ def _init_dict(self, data, index=None, dtype=None): # Using the below way for generating keys and values # increasing the performance by 50%, instead of zip keys = tuple(data.keys()) - values = [ - data[key] for key in keys - ] # Generating list of values- faster way - + values = list(data.values()) # Generating list of values- faster way + elif index is not None: # fastpath for Series(data=None). 
Just use broadcasting a scalar # instead of reindexing. values = na_value_for_dtype(dtype) keys = index else: - keys, values = [], [] + keys, values = tuple([]), [] # Input is now list-like, so rely on "standard" construction: From 6f7c242e945e961dbd7bcf7b164b0a4b7c87e45a Mon Sep 17 00:00:00 2001 From: Rohith295 Date: Tue, 23 Jun 2020 16:07:07 +0200 Subject: [PATCH 5/8] Fixed linting issues --- pandas/core/series.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index a83a2e3c98431..83c7b5f7e3e94 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -359,7 +359,6 @@ def _init_dict(self, data, index=None, dtype=None): # increasing the performance by 50%, instead of zip keys = tuple(data.keys()) values = list(data.values()) # Generating list of values- faster way - elif index is not None: # fastpath for Series(data=None). Just use broadcasting a scalar # instead of reindexing. From e523480e43f4c4eae1b3a9b7c86b8ed14621d2fc Mon Sep 17 00:00:00 2001 From: Rohith295 Date: Fri, 26 Jun 2020 18:33:25 +0200 Subject: [PATCH 6/8] removed unnecessary comments --- pandas/core/series.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 83c7b5f7e3e94..fd695b35850b9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -355,8 +355,6 @@ def _init_dict(self, data, index=None, dtype=None): # raises KeyError), so we iterate the entire dict, and align if data: # GH:34717 - # Using the below way for generating keys and values - # increasing the performance by 50%, instead of zip keys = tuple(data.keys()) values = list(data.values()) # Generating list of values- faster way elif index is not None: From 242aacbc8a69d82c70a6572c0805e3cee8d62f18 Mon Sep 17 00:00:00 2001 From: Rohith295 Date: Sun, 13 Sep 2020 11:42:29 +0200 Subject: [PATCH 7/8] Added comments to explain more about the performance issue, Also added a note for the release v1.2 --- 
doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/series.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e577a8f26bd12..ace549050124a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -207,7 +207,7 @@ Performance improvements - Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) -- +- Performance improvements when creating `pd.Series.map` from a huge dictionary (:issue:`34717`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/series.py b/pandas/core/series.py index cf1ac61e9e9af..747aabb26ca84 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -362,7 +362,10 @@ def _init_dict(self, data, index=None, dtype=None): # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] # raises KeyError), so we iterate the entire dict, and align if data: - # GH:34717 + # GH:34717, issue was using zip to extract key and values from data. + # using generators affects the performance. 
+ # Below is the new way of extracting the keys and values + keys = tuple(data.keys()) values = list(data.values()) # Generating list of values- faster way elif index is not None: From adebd8ce2ef4c2e075e72ba85d5aaa2f72ca499d Mon Sep 17 00:00:00 2001 From: Rohith295 Date: Sun, 13 Sep 2020 17:32:06 +0200 Subject: [PATCH 8/8] Fixed as per review comments --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f1223863f5ac4..dbc88d0b371e8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -207,7 +207,7 @@ Performance improvements - Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) -- Performance improvements when creating `pd.Series.map` from a huge dictionary (:issue:`34717`) +- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) .. ---------------------------------------------------------------------------