From 22c7edb930cf26234bd3d7cdc60013a20de3f8c0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Apr 2024 14:42:26 -0700 Subject: [PATCH 1/6] Use better data structures --- pandas/core/frame.py | 23 ++++++++++++----------- pandas/core/generic.py | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 76df5c82e6239..db6ece019e191 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3700,7 +3700,7 @@ def transpose( nv.validate_transpose(args, {}) # construct the args - dtypes = list(self.dtypes) + first_dtype = next(self.dtypes, None) if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. @@ -3718,11 +3718,11 @@ def transpose( elif ( self._is_homogeneous_type - and dtypes - and isinstance(dtypes[0], ExtensionDtype) + and first_dtype is not None + and isinstance(first_dtype, ExtensionDtype) ): new_values: list - if isinstance(dtypes[0], BaseMaskedDtype): + if isinstance(first_dtype, BaseMaskedDtype): # We have masked arrays with the same dtype. We can transpose faster. from pandas.core.arrays.masked import ( transpose_homogeneous_masked_arrays, @@ -3731,7 +3731,7 @@ def transpose( new_values = transpose_homogeneous_masked_arrays( cast(Sequence[BaseMaskedArray], self._iter_column_arrays()) ) - elif isinstance(dtypes[0], ArrowDtype): + elif isinstance(first_dtype, ArrowDtype): # We have arrow EAs with the same dtype. We can transpose faster. from pandas.core.arrays.arrow.array import ( ArrowExtensionArray, @@ -3743,10 +3743,11 @@ def transpose( ) else: # We have other EAs with the same dtype. We preserve dtype in transpose. 
- dtyp = dtypes[0] - arr_typ = dtyp.construct_array_type() + arr_typ = first_dtype.construct_array_type() values = self.values - new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values] + new_values = [ + arr_typ._from_sequence(row, dtype=first_dtype) for row in values + ] result = type(self)._from_arrays( new_values, @@ -5875,7 +5876,7 @@ def set_index( else: arrays.append(self.index) - to_remove: list[Hashable] = [] + to_remove: set[Hashable] = set() for col in keys: if isinstance(col, MultiIndex): arrays.extend(col._get_level_values(n) for n in range(col.nlevels)) @@ -5902,7 +5903,7 @@ def set_index( arrays.append(frame[col]) names.append(col) if drop: - to_remove.append(col) + to_remove.add(col) if len(arrays[-1]) != len(self): # check newest element against length of calling frame, since @@ -5919,7 +5920,7 @@ def set_index( raise ValueError(f"Index has duplicate keys: {duplicates}") # use set to handle duplicate column names gracefully in case of drop - for c in set(to_remove): + for c in to_remove: del frame[c] # clear up memory usage diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8af9503a3691d..398adebd1e278 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2045,7 +2045,7 @@ def __setstate__(self, state) -> None: # e.g. 
say fill_value needing _mgr to be # defined meta = set(self._internal_names + self._metadata) - for k in list(meta): + for k in meta: if k in state and k != "_flags": v = state[k] object.__setattr__(self, k, v) From 0e7fa5efd93471b3f8982f385c0782f33ec3bc88 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Apr 2024 16:03:42 -0700 Subject: [PATCH 2/6] Use generator and set --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index db6ece019e191..c6ebe183df37e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2296,8 +2296,8 @@ def maybe_reorder( exclude.update(index) if any(exclude): - arr_exclude = [x for x in exclude if x in arr_columns] - to_remove = [arr_columns.get_loc(col) for col in arr_exclude] + arr_exclude = (x for x in exclude if x in arr_columns) + to_remove = {arr_columns.get_loc(col) for col in arr_exclude} arrays = [v for i, v in enumerate(arrays) if i not in to_remove] columns = columns.drop(exclude) From fa086fc4bfe4a9bb6b3e42cd076690ef9f304d1a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Apr 2024 16:18:44 -0700 Subject: [PATCH 3/6] Move sorted to exception block, use set instead of list --- pandas/core/internals/construction.py | 15 +++++++-------- pandas/core/tools/datetimes.py | 8 ++++---- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 73b93110c9018..cea52bf8c91b2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -567,7 +567,7 @@ def _extract_index(data) -> Index: if len(data) == 0: return default_index(0) - raw_lengths = [] + raw_lengths = set() indexes: list[list[Hashable] | Index] = [] have_raw_arrays = False @@ -583,7 +583,7 @@ def _extract_index(data) -> Index: 
indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True - raw_lengths.append(len(val)) + raw_lengths.add(len(val)) elif isinstance(val, np.ndarray) and val.ndim > 1: raise ValueError("Per-column arrays must each be 1-dimensional") @@ -596,24 +596,23 @@ def _extract_index(data) -> Index: index = union_indexes(indexes, sort=False) if have_raw_arrays: - lengths = list(set(raw_lengths)) - if len(lengths) > 1: + if len(raw_lengths) > 1: raise ValueError("All arrays must be of the same length") if have_dicts: raise ValueError( "Mixing dicts with non-Series may lead to ambiguous ordering." ) - + raw_length = raw_lengths.pop() if have_series: - if lengths[0] != len(index): + if raw_length != len(index): msg = ( - f"array length {lengths[0]} does not match index " + f"array length {raw_length} does not match index " f"length {len(index)}" ) raise ValueError(msg) else: - index = default_index(lengths[0]) + index = default_index(raw_length) return ensure_index(index) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 2aeb1aff07a54..df7a6cdb1ea52 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1124,18 +1124,18 @@ def f(value): # we require at least Ymd required = ["year", "month", "day"] - req = sorted(set(required) - set(unit_rev.keys())) + req = set(required) - set(unit_rev.keys()) if len(req): - _required = ",".join(req) + _required = ",".join(sorted(req)) raise ValueError( "to assemble mappings requires at least that " f"[year, month, day] be specified: [{_required}] is missing" ) # keys we don't recognize - excess = sorted(set(unit_rev.keys()) - set(_unit_map.values())) + excess = set(unit_rev.keys()) - set(_unit_map.values()) if len(excess): - _excess = ",".join(excess) + _excess = ",".join(sorted(excess)) raise ValueError( f"extra keys have been passed to the datetime assemblage: [{_excess}]" ) From c2142d3339d7a88d0213dc3d66fd2d933bb72ebe Mon 
Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Apr 2024 11:28:53 -0700 Subject: [PATCH 4/6] Another iterator, use iter --- pandas/_libs/tslibs/offsets.pyx | 10 ++++------ pandas/core/frame.py | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index e36abdf0ad971..00daeff21c425 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -219,8 +219,7 @@ cdef _get_calendar(weekmask, holidays, calendar): holidays = holidays + calendar.holidays().tolist() except AttributeError: pass - holidays = [_to_dt64D(dt) for dt in holidays] - holidays = tuple(sorted(holidays)) + holidays = tuple(sorted(_to_dt64D(dt) for dt in holidays)) kwargs = {"weekmask": weekmask} if holidays: @@ -420,10 +419,9 @@ cdef class BaseOffset: if "holidays" in all_paras and not all_paras["holidays"]: all_paras.pop("holidays") exclude = ["kwds", "name", "calendar"] - attrs = [(k, v) for k, v in all_paras.items() - if (k not in exclude) and (k[0] != "_")] - attrs = sorted(set(attrs)) - params = tuple([str(type(self))] + attrs) + attrs = {(k, v) for k, v in all_paras.items() + if (k not in exclude) and (k[0] != "_")} + params = tuple([str(type(self))] + sorted(attrs)) return params @property diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 50b3e62e9256e..562927967a536 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3705,7 +3705,7 @@ def transpose( nv.validate_transpose(args, {}) # construct the args - first_dtype = next(self.dtypes, None) + first_dtype = next(iter(self.dtypes), None) if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. 
From a61895bd5dd88cc5ea56e4816690b57eb1d63846 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Apr 2024 11:35:34 -0700 Subject: [PATCH 5/6] another set --- pandas/_libs/tslibs/offsets.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 00daeff21c425..107608ec9f606 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -418,7 +418,7 @@ cdef class BaseOffset: if "holidays" in all_paras and not all_paras["holidays"]: all_paras.pop("holidays") - exclude = ["kwds", "name", "calendar"] + exclude = {"kwds", "name", "calendar"} attrs = {(k, v) for k, v in all_paras.items() if (k not in exclude) and (k[0] != "_")} params = tuple([str(type(self))] + sorted(attrs)) From 77e8b11054726ccec0e07919cfa9612b3d1471d1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 16 Apr 2024 10:41:11 -0700 Subject: [PATCH 6/6] Don't use iterator protocol --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 61f6d6b23f4bc..b65a00db7d7df 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3705,7 +3705,7 @@ def transpose( nv.validate_transpose(args, {}) # construct the args - first_dtype = next(iter(self.dtypes), None) + first_dtype = self.dtypes.iloc[0] if len(self.columns) else None if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit.