REF: Convert list comprehensions into lazy iterators #58798

Merged · 5 commits · May 21, 2024

4 changes: 2 additions & 2 deletions pandas/core/arraylike.py
@@ -329,8 +329,8 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any)
reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))

if self.ndim == 1:
-names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
-name = names[0] if len(set(names)) == 1 else None
+names = {getattr(x, "name") for x in inputs if hasattr(x, "name")}
+name = names.pop() if len(names) == 1 else None
reconstruct_kwargs = {"name": name}
else:
reconstruct_kwargs = {}
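
A note on this change: building the set directly avoids materializing a throwaway list just to deduplicate it, and popping the sole element of a one-element set matches the old `names[0]` whenever the uniqueness check passes. A minimal sketch with hypothetical stand-in objects:

    class Named:
        def __init__(self, name):
            self.name = name

    inputs = [Named("x"), Named("x"), object()]  # object() has no .name

    # Old: a list plus a separate set() call, two containers in memory.
    names_list = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
    old = names_list[0] if len(set(names_list)) == 1 else None

    # New: one set comprehension; pop() the sole element when unambiguous.
    names_set = {getattr(x, "name") for x in inputs if hasattr(x, "name")}
    new = names_set.pop() if len(names_set) == 1 else None

    assert old == new == "x"
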
7 changes: 4 additions & 3 deletions pandas/core/common.py
@@ -335,11 +335,12 @@ def is_empty_slice(obj) -> bool:
)


-def is_true_slices(line) -> list[bool]:
+def is_true_slices(line: abc.Iterable) -> abc.Generator[bool, None, None]:
"""
-Find non-trivial slices in "line": return a list of booleans with same length.
+Find non-trivial slices in "line": yields a bool.
"""
-return [isinstance(k, slice) and not is_null_slice(k) for k in line]
+for k in line:
+    yield isinstance(k, slice) and not is_null_slice(k)


# TODO: used only once in indexing; belongs elsewhere?
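
Worth flagging for reviewers: a generator can be consumed only once, so any caller that needs these flags twice must now materialize them first. A self-contained sketch of the rewritten helper, with a stand-in for `is_null_slice`:

    from collections.abc import Generator, Iterable

    def is_null_slice(obj) -> bool:
        # Stand-in for pandas.core.common.is_null_slice: slice(None, None, None).
        return (
            isinstance(obj, slice)
            and obj.start is None
            and obj.stop is None
            and obj.step is None
        )

    def is_true_slices(line: Iterable) -> Generator[bool, None, None]:
        # Lazily yield True for each non-trivial slice in `line`.
        for k in line:
            yield isinstance(k, slice) and not is_null_slice(k)

    flags = is_true_slices([slice(None), slice(1, 3), "a"])
    assert list(flags) == [False, True, False]
    assert list(flags) == []  # exhausted: a generator is single-use
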
5 changes: 3 additions & 2 deletions pandas/core/dtypes/dtypes.py
@@ -680,10 +680,11 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
return None

# categorical is aware of Sparse -> extract sparse subdtypes
-dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
+subtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes)
# extract the categories' dtype
non_cat_dtypes = [
-    x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
+    x.categories.dtype if isinstance(x, CategoricalDtype) else x
+    for x in subtypes
]
# TODO should categorical always give an answer?
from pandas.core.dtypes.cast import find_common_type
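
The intermediate `subtypes` generator never materializes; the final list comprehension pulls items through it one at a time. A sketch of the chaining pattern, with hypothetical tagged tuples standing in for SparseDtype and CategoricalDtype:

    data = [("sparse", "int64"), ("cat", "object"), "float64"]

    # Stage 1: lazily unwrap "sparse" values; nothing runs yet.
    subtypes = (
        v[1] if isinstance(v, tuple) and v[0] == "sparse" else v for v in data
    )
    # Stage 2: this comprehension drives both stages in a single pass.
    resolved = [
        v[1] if isinstance(v, tuple) and v[0] == "cat" else v for v in subtypes
    ]

    assert resolved == ["int64", "object", "float64"]
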
16 changes: 8 additions & 8 deletions pandas/core/frame.py
@@ -6999,19 +6999,19 @@ def sort_values(
f" != length of by ({len(by)})"
)
if len(by) > 1:
-keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
+keys = (self._get_label_or_level_values(x, axis=axis) for x in by)

# need to rewrap columns in Series to apply key function
if key is not None:
-    # error: List comprehension has incompatible type List[Series];
-    # expected List[ndarray]
-    keys = [
-        Series(k, name=name)  # type: ignore[misc]
-        for (k, name) in zip(keys, by)
-    ]
+    keys_data = [Series(k, name=name) for (k, name) in zip(keys, by)]
+else:
+    # error: Argument 1 to "list" has incompatible type
+    # "Generator[ExtensionArray | ndarray[Any, Any], None, None]";
+    # expected "Iterable[Series]"
+    keys_data = list(keys)  # type: ignore[arg-type]

indexer = lexsort_indexer(
-    keys, orders=ascending, na_position=na_position, key=key
+    keys_data, orders=ascending, na_position=na_position, key=key
)
elif len(by):
# len(by) == 1
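
The rebinding to `keys_data` is the key move: `keys` stays lazy, and each branch drains it exactly once, either through the `Series`-wrapping comprehension or through `list()`. A simplified sketch of the branch pattern (names are illustrative, not the pandas API):

    def build_sort_keys(table, by, key=None):
        keys = (table[col] for col in by)  # lazy; nothing fetched yet
        if key is not None:
            keys_data = [key(k) for k in keys]  # consumed once, transformed
        else:
            keys_data = list(keys)  # consumed once, materialized as-is
        return keys_data

    table = {"a": [3, 1], "b": [2, 4]}
    assert build_sort_keys(table, ["a", "b"]) == [[3, 1], [2, 4]]
    assert build_sort_keys(table, ["a", "b"], key=tuple) == [(3, 1), (2, 4)]
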
6 changes: 3 additions & 3 deletions pandas/core/groupby/generic.py
@@ -387,7 +387,7 @@ def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
raise SpecificationError("nested renamer is not supported")

if any(isinstance(x, (tuple, list)) for x in arg):
-arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
+arg = ((x, x) if not isinstance(x, (tuple, list)) else x for x in arg)
else:
# list of functions / function names
columns = (com.get_callable_name(f) or f for f in arg)
@@ -2077,7 +2077,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:

obj = self._obj_with_exclusions
columns = obj.columns
-sgbs = [
+sgbs = (
SeriesGroupBy(
obj.iloc[:, i],
selection=colname,
@@ -2086,7 +2086,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
observed=self.observed,
)
for i, colname in enumerate(obj.columns)
-]
+)
results = [func(sgb) for sgb in sgbs]

if not len(results):
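
Both changes in this file rely on the generator having a single downstream consumer; `sgbs`, for instance, is drained exactly once by the `results` comprehension directly below it. The payoff is that the container itself stays constant-size however many columns there are, e.g.:

    import sys

    # A generator's footprint is fixed regardless of how many items it yields.
    lst = [i for i in range(10_000)]
    gen = (i for i in range(10_000))
    assert sys.getsizeof(gen) < sys.getsizeof(lst)
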
9 changes: 5 additions & 4 deletions pandas/core/groupby/groupby.py
@@ -11,6 +11,7 @@ class providing the base-class of operations.

from collections.abc import (
Hashable,
+Iterable,
Iterator,
Mapping,
Sequence,
@@ -758,7 +759,7 @@ def get_converter(s):
)
raise ValueError(msg) from err

-converters = [get_converter(s) for s in index_sample]
+converters = (get_converter(s) for s in index_sample)
names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

else:
@@ -2645,7 +2646,7 @@ def _value_counts(
}
if isinstance(obj, Series):
_name = obj.name
-keys = [] if _name in in_axis_names else [obj]
+keys: Iterable[Series] = [] if _name in in_axis_names else [obj]
else:
unique_cols = set(obj.columns)
if subset is not None:
@@ -2665,12 +2666,12 @@
else:
subsetted = unique_cols

-keys = [
+keys = (
# Can't use .values because the column label needs to be preserved
obj.iloc[:, idx]
for idx, _name in enumerate(obj.columns)
if _name not in in_axis_names and _name in subsetted
-]
+)

groupings = list(self._grouper.groupings)
for key in keys:
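
The new `Iterable` import exists for the `keys` annotation: one branch binds a list, the other a generator, and annotating with their common supertype satisfies mypy without forcing either branch to materialize. A sketch of the annotation trick with hypothetical names:

    from collections.abc import Iterable

    def pick_keys(name, in_axis_names, obj, columns):
        # list on one path, generator on the other; both are Iterable[str]
        keys: Iterable[str] = [] if name in in_axis_names else [obj]
        if name is None:
            keys = (c for c in columns if c not in in_axis_names)
        return list(keys)

    assert pick_keys("a", {"a"}, "x", []) == []
    assert pick_keys("b", {"a"}, "x", []) == ["x"]
    assert pick_keys(None, {"a"}, "x", ["a", "b"]) == ["b"]
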
15 changes: 10 additions & 5 deletions pandas/core/indexes/api.py
@@ -212,20 +212,25 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
if kind == "special":
result = indexes[0]

-dtis = [x for x in indexes if isinstance(x, DatetimeIndex)]
-dti_tzs = [x for x in dtis if x.tz is not None]
-if len(dti_tzs) not in [0, len(dtis)]:
+num_dtis = 0
+num_dti_tzs = 0
+for idx in indexes:
+    if isinstance(idx, DatetimeIndex):
+        num_dtis += 1
+        if idx.tz is not None:
+            num_dti_tzs += 1
+if num_dti_tzs not in [0, num_dtis]:
# TODO: this behavior is not tested (so may not be desired),
# but is kept in order to keep behavior the same when
# deprecating union_many
# test_frame_from_dict_with_mixed_indexes
raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")

-if len(dtis) == len(indexes):
+if num_dtis == len(indexes):
sort = True
result = indexes[0]

-elif len(dtis) > 1:
+elif num_dtis > 1:
# If we have mixed timezones, our casting behavior may depend on
# the order of indexes, which we don't want.
sort = False
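
This hunk goes a step further than swapping brackets: only the counts were ever used, so the two intermediate lists collapse into one counting pass over `indexes`. A sketch with hypothetical (kind, tz) pairs:

    items = [("dti", "UTC"), ("dti", None), ("range", None)]

    num_dtis = 0
    num_dti_tzs = 0
    for kind, tz in items:  # one pass, no intermediate lists
        if kind == "dti":
            num_dtis += 1
            if tz is not None:
                num_dti_tzs += 1

    assert (num_dtis, num_dti_tzs) == (2, 1)
    # 1 is neither 0 nor num_dtis: a tz-naive/tz-aware mix, which raises above.
    assert num_dti_tzs not in (0, num_dtis)
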
6 changes: 3 additions & 3 deletions pandas/core/indexes/base.py
@@ -3140,7 +3140,7 @@ def _union(self, other: Index, sort: bool | None):

# worth making this faster? a very unusual case
value_set = set(lvals)
-value_list.extend([x for x in rvals if x not in value_set])
+value_list.extend(x for x in rvals if x not in value_set)
# If objects are unorderable, we must have object dtype.
return np.array(value_list, dtype=object)

@@ -7620,8 +7620,8 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]:
list
A list representing the unanimous 'names' found.
"""
-name_tups = [tuple(i.names) for i in indexes]
-name_sets = [{*ns} for ns in zip_longest(*name_tups)]
+name_tups = (tuple(i.names) for i in indexes)
+name_sets = ({*ns} for ns in zip_longest(*name_tups))
names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets)
return names

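
`list.extend` accepts any iterable, so the brackets were pure overhead; the generator expression feeds `extend` directly without building a temporary list. For instance:

    value_set = {1, 2, 3}
    value_list = [1, 2, 3]
    rvals = [2, 3, 4, 5]

    # extend() consumes the generator directly; no throwaway list is built.
    value_list.extend(x for x in rvals if x not in value_set)
    assert value_list == [1, 2, 3, 4, 5]
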
4 changes: 2 additions & 2 deletions pandas/core/indexes/multi.py
@@ -1387,7 +1387,7 @@ def _formatter_func(self, tup):
"""
Formats each item in tup according to its level's formatter function.
"""
-formatter_funcs = [level._formatter_func for level in self.levels]
+formatter_funcs = (level._formatter_func for level in self.levels)
return tuple(func(val) for func, val in zip(formatter_funcs, tup))

def _get_values_for_csv(
@@ -1537,7 +1537,7 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
if level is None:
level = range(self.nlevels)
else:
-level = [self._get_level_number(lev) for lev in level]
+level = (self._get_level_number(lev) for lev in level)

# set the name
for lev, name in zip(level, names):
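
Both sites feed straight into `zip`, which pulls from its arguments lazily, so a generator is a drop-in replacement for the list here. For example:

    funcs = (f for f in (str.upper, str.lower))  # lazy, consumed once by zip
    vals = ("ab", "CD")
    assert tuple(f(v) for f, v in zip(funcs, vals)) == ("AB", "cd")
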
4 changes: 2 additions & 2 deletions pandas/core/reshape/concat.py
@@ -560,7 +560,7 @@ def get_result(self):

# combine as columns in a frame
else:
-data = dict(zip(range(len(self.objs)), self.objs))
+data = dict(enumerate(self.objs))

# GH28330 Preserves subclassed objects through concat
cons = sample._constructor_expanddim
@@ -874,7 +874,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde

if isinstance(new_index, MultiIndex):
new_levels.extend(new_index.levels)
-new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
+new_codes.extend(np.tile(lab, kpieces) for lab in new_index.codes)
else:
new_levels.append(new_index.unique())
single_codes = new_index.unique().get_indexer(new_index)
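
`enumerate` already pairs indices with items, so `dict(enumerate(objs))` replaces the `zip(range(len(...)))` construction, and `extend` once again takes the generator expression directly. Equivalence check:

    objs = ["s1", "s2", "s3"]
    assert dict(enumerate(objs)) == dict(zip(range(len(objs)), objs))
    # both give {0: "s1", 1: "s2", 2: "s3"}
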
40 changes: 20 additions & 20 deletions pandas/core/reshape/reshape.py
@@ -137,24 +137,24 @@ def __init__(
self.removed_level = self.removed_level.take(unique_codes)
self.removed_level_full = self.removed_level_full.take(unique_codes)

-# Bug fix GH 20601
-# If the data frame is too big, the number of unique index combination
-# will cause int32 overflow on windows environments.
-# We want to check and raise an warning before this happens
-num_rows = np.max([index_level.size for index_level in self.new_index_levels])
-num_columns = self.removed_level.size
-
-# GH20601: This forces an overflow if the number of cells is too high.
-num_cells = num_rows * num_columns
-
-# GH 26314: Previous ValueError raised was too restrictive for many users.
-if get_option("performance_warnings") and num_cells > np.iinfo(np.int32).max:
-    warnings.warn(
-        f"The following operation may generate {num_cells} cells "
-        f"in the resulting pandas object.",
-        PerformanceWarning,
-        stacklevel=find_stack_level(),
-    )
+if get_option("performance_warnings"):
+    # Bug fix GH 20601
+    # If the data frame is too big, the number of unique index combination
+    # will cause int32 overflow on windows environments.
+    # We want to check and raise an warning before this happens
+    num_rows = max(index_level.size for index_level in self.new_index_levels)
+    num_columns = self.removed_level.size
+
+    # GH20601: This forces an overflow if the number of cells is too high.
+    # GH 26314: Previous ValueError raised was too restrictive for many users.
+    num_cells = num_rows * num_columns
+    if num_cells > np.iinfo(np.int32).max:
+        warnings.warn(
+            f"The following operation may generate {num_cells} cells "
+            f"in the resulting pandas object.",
+            PerformanceWarning,
+            stacklevel=find_stack_level(),
+        )

self._make_selectors()
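
Two things happen in the hunk above: the whole overflow estimate moves under the `performance_warnings` gate, so none of it runs when the option is off, and the builtin `max` over a generator expression replaces `np.max` over a temporary list. A sketch of the gated shape, with stand-ins for the pandas option machinery:

    import warnings

    PERFORMANCE_WARNINGS = True  # stand-in for get_option("performance_warnings")
    INT32_MAX = 2**31 - 1  # stand-in for np.iinfo(np.int32).max

    def warn_if_huge(level_sizes, num_columns):
        if PERFORMANCE_WARNINGS:  # guard first: skip all the work when off
            num_rows = max(level_sizes)  # builtin max, no temporary list
            num_cells = num_rows * num_columns
            if num_cells > INT32_MAX:
                warnings.warn(f"operation may generate {num_cells} cells")

    warn_if_huge(iter([2**20, 2**21]), 2**12)  # 2**33 cells, so this warns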

@@ -731,10 +731,10 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index:
if len(columns.levels) <= 2:
return columns.levels[0]._rename(name=columns.names[0])

-levs = [
+levs = (
[lev[c] if c >= 0 else None for c in codes]
for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
-]
+)

# Remove duplicate tuples in the MultiIndex.
tuples = zip(*levs)
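
The parentheses are safe here because `zip(*levs)` star-unpacks the generator, consuming it once and passing each inner list to `zip` as a separate argument:

    levs = (row for row in ([1, 2], ["a", "b"]))  # generator of lists
    assert list(zip(*levs)) == [(1, "a"), (2, "b")]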