Commit d51847e

Merge branch 'pandas-dev:main' into Fix#58748
2 parents: 3bfa2c2 + 2aa155a

File tree

18 files changed: +95 -67 lines changed


ci/code_checks.sh (-1)

@@ -441,7 +441,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.errors.UnsupportedFunctionCall SA01" \
         -i "pandas.errors.ValueLabelTypeMismatch SA01" \
         -i "pandas.infer_freq SA01" \
-        -i "pandas.interval_range RT03" \
         -i "pandas.io.formats.style.Styler.apply RT03" \
         -i "pandas.io.formats.style.Styler.apply_index RT03" \
         -i "pandas.io.formats.style.Styler.background_gradient RT03" \

doc/source/user_guide/style.ipynb (+1 -1)

@@ -211,7 +211,7 @@
    "source": [
     "## Styler Object and HTML \n",
     "\n",
-    "The [Styler][styler] was originally constructed to support the wide array of HTML formatting options. Its HTML output creates an HTML `<table>` and leverages CSS styling language to manipulate many parameters including colors, fonts, borders, background, etc. See [here][w3schools] for more information on styling HTML tables. This allows a lot of flexibility out of the box, and even enables web developers to integrate DataFrames into their exiting user interface designs.\n",
+    "The [Styler][styler] was originally constructed to support the wide array of HTML formatting options. Its HTML output creates an HTML `<table>` and leverages CSS styling language to manipulate many parameters including colors, fonts, borders, background, etc. See [here][w3schools] for more information on styling HTML tables. This allows a lot of flexibility out of the box, and even enables web developers to integrate DataFrames into their existing user interface designs.\n",
     "\n",
     "Below we demonstrate the default output, which looks very similar to the standard DataFrame HTML representation. But the HTML here has already attached some CSS classes to each cell, even if we haven't yet created any styles. We can view these by calling the [.to_html()][tohtml] method, which returns the raw HTML as string, which is useful for further processing or adding to a file - read on in [More about CSS and HTML](#More-About-CSS-and-HTML). This section will also provide a walkthrough for how to convert this default output to represent a DataFrame output that is more communicative. For example how we can build `s`:\n",
     "\n",

doc/source/user_guide/timeseries.rst (+1 -1)

@@ -326,7 +326,7 @@ which can be specified. These are computed from the starting point specified by
 .. note::

    The ``unit`` parameter does not use the same strings as the ``format`` parameter
-   that was discussed :ref:`above<timeseries.converting.format>`). The
+   that was discussed :ref:`above<timeseries.converting.format>`. The
    available units are listed on the documentation for :func:`pandas.to_datetime`.

 Constructing a :class:`Timestamp` or :class:`DatetimeIndex` with an epoch timestamp

doc/source/whatsnew/v3.0.0.rst (+1 -1)

@@ -477,7 +477,7 @@ Groupby/resample/rolling
 Reshaping
 ^^^^^^^^^
 - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
--
+- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)

 Sparse
 ^^^^^^

pandas/core/arraylike.py (+2 -2)

@@ -329,8 +329,8 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any)
     reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))

     if self.ndim == 1:
-        names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
-        name = names[0] if len(set(names)) == 1 else None
+        names = {getattr(x, "name") for x in inputs if hasattr(x, "name")}
+        name = names.pop() if len(names) == 1 else None
         reconstruct_kwargs = {"name": name}
     else:
         reconstruct_kwargs = {}
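
A brief sketch of the user-facing behavior this block controls: when a NumPy ufunc is applied to Series inputs, the result keeps a name only if every named input agrees on it. This uses only the public API and assumes a recent pandas version.

import numpy as np
import pandas as pd

a = pd.Series([1, 2, 3], name="x")
b = pd.Series([4, 5, 6], name="x")
c = pd.Series([7, 8, 9], name="y")

# A single unique name among the inputs is propagated to the result.
print(np.add(a, b).name)  # x
# Conflicting names collapse to None.
print(np.add(a, c).name)  # None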

pandas/core/common.py (+4 -3)

@@ -335,11 +335,12 @@ def is_empty_slice(obj) -> bool:
     )


-def is_true_slices(line) -> list[bool]:
+def is_true_slices(line: abc.Iterable) -> abc.Generator[bool, None, None]:
     """
-    Find non-trivial slices in "line": return a list of booleans with same length.
+    Find non-trivial slices in "line": yields a bool.
     """
-    return [isinstance(k, slice) and not is_null_slice(k) for k in line]
+    for k in line:
+        yield isinstance(k, slice) and not is_null_slice(k)


 # TODO: used only once in indexing; belongs elsewhere?
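
With this change the helper is lazy, so callers that need a list must materialize it. A minimal sketch of the private helper's behavior, assuming the internal import path stays `pandas.core.common`:

from pandas.core.common import is_true_slices

keys = [slice(None), slice(1, 3), "a"]

# slice(None) is the trivial null slice and "a" is not a slice at all;
# only slice(1, 3) counts as a "true" slice.
print(list(is_true_slices(keys)))  # [False, True, False]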

pandas/core/dtypes/dtypes.py (+3 -2)

@@ -680,10 +680,11 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
             return None

         # categorical is aware of Sparse -> extract sparse subdtypes
-        dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
+        subtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes)
         # extract the categories' dtype
         non_cat_dtypes = [
-            x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
+            x.categories.dtype if isinstance(x, CategoricalDtype) else x
+            for x in subtypes
         ]
         # TODO should categorical always give an answer?
         from pandas.core.dtypes.cast import find_common_type

pandas/core/frame.py (+8 -8)

@@ -6999,19 +6999,19 @@ def sort_values(
                     f" != length of by ({len(by)})"
                 )
             if len(by) > 1:
-                keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
+                keys = (self._get_label_or_level_values(x, axis=axis) for x in by)

                 # need to rewrap columns in Series to apply key function
                 if key is not None:
-                    # error: List comprehension has incompatible type List[Series];
-                    # expected List[ndarray]
-                    keys = [
-                        Series(k, name=name)  # type: ignore[misc]
-                        for (k, name) in zip(keys, by)
-                    ]
+                    keys_data = [Series(k, name=name) for (k, name) in zip(keys, by)]
+                else:
+                    # error: Argument 1 to "list" has incompatible type
+                    # "Generator[ExtensionArray | ndarray[Any, Any], None, None]";
+                    # expected "Iterable[Series]"
+                    keys_data = list(keys)  # type: ignore[arg-type]

                 indexer = lexsort_indexer(
-                    keys, orders=ascending, na_position=na_position, key=key
+                    keys_data, orders=ascending, na_position=na_position, key=key
                 )
             elif len(by):
                 # len(by) == 1
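
The rewrap into Series is what lets a `key` callable inspect the column name of each sort key. A hedged usage sketch of the public `DataFrame.sort_values` path that exercises this code:

import pandas as pd

df = pd.DataFrame({"a": ["B", "a", "C"], "b": [3, 1, 2]})

# The key function receives each sort column as a named Series, so it can
# treat columns differently (case-insensitive sort for "a" only).
out = df.sort_values(
    by=["a", "b"],
    key=lambda s: s.str.lower() if s.name == "a" else s,
)
print(out["a"].tolist())  # ['a', 'B', 'C']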

pandas/core/groupby/generic.py (+3 -3)

@@ -387,7 +387,7 @@ def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
             raise SpecificationError("nested renamer is not supported")

         if any(isinstance(x, (tuple, list)) for x in arg):
-            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
+            arg = ((x, x) if not isinstance(x, (tuple, list)) else x for x in arg)
         else:
             # list of functions / function names
             columns = (com.get_callable_name(f) or f for f in arg)

@@ -2077,7 +2077,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:

         obj = self._obj_with_exclusions
         columns = obj.columns
-        sgbs = [
+        sgbs = (
             SeriesGroupBy(
                 obj.iloc[:, i],
                 selection=colname,

@@ -2086,7 +2086,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
                 observed=self.observed,
             )
             for i, colname in enumerate(obj.columns)
-        ]
+        )
         results = [func(sgb) for sgb in sgbs]

         if not len(results):
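
These comprehensions-turned-generators sit behind multi-function aggregation. A short sketch of the public behavior they serve:

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})

# Each group is aggregated once per function, producing one output column
# per function name.
print(df.groupby("key")["val"].agg(["min", "max"]))
#      min  max
# key
# a      1    2
# b      3    3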

pandas/core/groupby/groupby.py (+5 -4)

@@ -11,6 +11,7 @@ class providing the base-class of operations.

 from collections.abc import (
     Hashable,
+    Iterable,
     Iterator,
     Mapping,
     Sequence,

@@ -758,7 +759,7 @@ def get_converter(s):
                 )
                 raise ValueError(msg) from err

-            converters = [get_converter(s) for s in index_sample]
+            converters = (get_converter(s) for s in index_sample)
             names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

         else:

@@ -2645,7 +2646,7 @@ def _value_counts(
         }
         if isinstance(obj, Series):
             _name = obj.name
-            keys = [] if _name in in_axis_names else [obj]
+            keys: Iterable[Series] = [] if _name in in_axis_names else [obj]
         else:
             unique_cols = set(obj.columns)
             if subset is not None:

@@ -2665,12 +2666,12 @@ def _value_counts(
             else:
                 subsetted = unique_cols

-            keys = [
+            keys = (
                 # Can't use .values because the column label needs to be preserved
                 obj.iloc[:, idx]
                 for idx, _name in enumerate(obj.columns)
                 if _name not in in_axis_names and _name in subsetted
-            ]
+            )

             groupings = list(self._grouper.groupings)
             for key in keys:
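
The `keys` generator feeds the extra groupings used by `DataFrameGroupBy.value_counts`. A minimal sketch of that public entry point (the exact result name may vary slightly across pandas versions):

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": ["x", "x", "y"]})

# Non-grouping columns become additional keys for counting combinations.
print(df.groupby("key").value_counts())
# key  val
# a    x      2
# b    y      1
# Name: count, dtype: int64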

pandas/core/indexes/api.py (+10 -5)

@@ -212,20 +212,25 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
     if kind == "special":
         result = indexes[0]

-        dtis = [x for x in indexes if isinstance(x, DatetimeIndex)]
-        dti_tzs = [x for x in dtis if x.tz is not None]
-        if len(dti_tzs) not in [0, len(dtis)]:
+        num_dtis = 0
+        num_dti_tzs = 0
+        for idx in indexes:
+            if isinstance(idx, DatetimeIndex):
+                num_dtis += 1
+                if idx.tz is not None:
+                    num_dti_tzs += 1
+        if num_dti_tzs not in [0, num_dtis]:
             # TODO: this behavior is not tested (so may not be desired),
             # but is kept in order to keep behavior the same when
             # deprecating union_many
             # test_frame_from_dict_with_mixed_indexes
             raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")

-        if len(dtis) == len(indexes):
+        if num_dtis == len(indexes):
             sort = True
             result = indexes[0]

-        elif len(dtis) > 1:
+        elif num_dtis > 1:
             # If we have mixed timezones, our casting behavior may depend on
             # the order of indexes, which we don't want.
             sort = False
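
The rewrite replaces two intermediate lists with a single pass that only counts DatetimeIndex members and how many of them are tz-aware. A standalone sketch of that counting logic (not the pandas-internal call path):

from pandas import DatetimeIndex, Index, date_range

indexes = [
    date_range("2024-01-01", periods=2),            # tz-naive DatetimeIndex
    date_range("2024-01-01", periods=2, tz="UTC"),  # tz-aware DatetimeIndex
    Index([1, 2]),                                  # not a DatetimeIndex
]

num_dtis = 0
num_dti_tzs = 0
for idx in indexes:
    if isinstance(idx, DatetimeIndex):
        num_dtis += 1
        if idx.tz is not None:
            num_dti_tzs += 1

# A mix (some but not all tz-aware) is the case union_indexes rejects.
print(num_dtis, num_dti_tzs)  # 2 1 -> would raise TypeError in union_indexes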

pandas/core/indexes/base.py (+3 -3)

@@ -3140,7 +3140,7 @@ def _union(self, other: Index, sort: bool | None):

             # worth making this faster? a very unusual case
             value_set = set(lvals)
-            value_list.extend([x for x in rvals if x not in value_set])
+            value_list.extend(x for x in rvals if x not in value_set)
             # If objects are unorderable, we must have object dtype.
             return np.array(value_list, dtype=object)

@@ -7620,8 +7620,8 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]:
     list
         A list representing the unanimous 'names' found.
     """
-    name_tups = [tuple(i.names) for i in indexes]
-    name_sets = [{*ns} for ns in zip_longest(*name_tups)]
+    name_tups = (tuple(i.names) for i in indexes)
+    name_sets = ({*ns} for ns in zip_longest(*name_tups))
     names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets)
     return names
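
`get_unanimous_names` backs the name handling of set operations such as `Index.union`. A small sketch of the observable behavior:

import pandas as pd

a = pd.Index([1, 2, 3], name="key")
b = pd.Index([3, 4], name="key")
c = pd.Index([3, 4], name="other")

# A unanimous name survives the union; disagreement collapses to None.
print(a.union(b).name)  # key
print(a.union(c).name)  # None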

pandas/core/indexes/interval.py (+1)

@@ -1138,6 +1138,7 @@ def interval_range(
     Returns
     -------
     IntervalIndex
+        Object with a fixed frequency.

     See Also
     --------
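
The new return description documents that `interval_range` produces an IntervalIndex with a fixed frequency. For reference, a minimal usage example:

import pandas as pd

idx = pd.interval_range(start=0, end=4)
print(idx)
# IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4]], dtype='interval[int64, right]')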

pandas/core/indexes/multi.py (+2 -2)

@@ -1387,7 +1387,7 @@ def _formatter_func(self, tup):
         """
         Formats each item in tup according to its level's formatter function.
         """
-        formatter_funcs = [level._formatter_func for level in self.levels]
+        formatter_funcs = (level._formatter_func for level in self.levels)
         return tuple(func(val) for func, val in zip(formatter_funcs, tup))

     def _get_values_for_csv(

@@ -1537,7 +1537,7 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
         if level is None:
             level = range(self.nlevels)
         else:
-            level = [self._get_level_number(lev) for lev in level]
+            level = (self._get_level_number(lev) for lev in level)

         # set the name
         for lev, name in zip(level, names):
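
`_set_names` with an explicit `level` now resolves level numbers lazily. A hedged sketch of the public path that reaches it:

import pandas as pd

mi = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["letter", "number"])

# set_names with level= goes through _set_names, which maps the level label
# "number" to its position before assigning the new name.
print(list(mi.set_names("num", level="number").names))  # ['letter', 'num']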

pandas/core/reshape/concat.py (+2 -2)

@@ -560,7 +560,7 @@ def get_result(self):

             # combine as columns in a frame
             else:
-                data = dict(zip(range(len(self.objs)), self.objs))
+                data = dict(enumerate(self.objs))

                 # GH28330 Preserves subclassed objects through concat
                 cons = sample._constructor_expanddim

@@ -874,7 +874,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde

         if isinstance(new_index, MultiIndex):
             new_levels.extend(new_index.levels)
-            new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
+            new_codes.extend(np.tile(lab, kpieces) for lab in new_index.codes)
         else:
             new_levels.append(new_index.unique())
             single_codes = new_index.unique().get_indexer(new_index)
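
The first change is a pure simplification: `dict(enumerate(objs))` builds the same positional mapping as `dict(zip(range(len(objs)), objs))`. A one-line illustration:

objs = ["s0", "s1", "s2"]
assert dict(enumerate(objs)) == dict(zip(range(len(objs)), objs))  # {0: 's0', 1: 's1', 2: 's2'}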

pandas/core/reshape/reshape.py (+33 -27)

@@ -137,24 +137,24 @@ def __init__(
         self.removed_level = self.removed_level.take(unique_codes)
         self.removed_level_full = self.removed_level_full.take(unique_codes)

-        # Bug fix GH 20601
-        # If the data frame is too big, the number of unique index combination
-        # will cause int32 overflow on windows environments.
-        # We want to check and raise an warning before this happens
-        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
-        num_columns = self.removed_level.size
-
-        # GH20601: This forces an overflow if the number of cells is too high.
-        num_cells = num_rows * num_columns
-
-        # GH 26314: Previous ValueError raised was too restrictive for many users.
-        if get_option("performance_warnings") and num_cells > np.iinfo(np.int32).max:
-            warnings.warn(
-                f"The following operation may generate {num_cells} cells "
-                f"in the resulting pandas object.",
-                PerformanceWarning,
-                stacklevel=find_stack_level(),
-            )
+        if get_option("performance_warnings"):
+            # Bug fix GH 20601
+            # If the data frame is too big, the number of unique index combination
+            # will cause int32 overflow on windows environments.
+            # We want to check and raise an warning before this happens
+            num_rows = max(index_level.size for index_level in self.new_index_levels)
+            num_columns = self.removed_level.size
+
+            # GH20601: This forces an overflow if the number of cells is too high.
+            # GH 26314: Previous ValueError raised was too restrictive for many users.
+            num_cells = num_rows * num_columns
+            if num_cells > np.iinfo(np.int32).max:
+                warnings.warn(
+                    f"The following operation may generate {num_cells} cells "
+                    f"in the resulting pandas object.",
+                    PerformanceWarning,
+                    stacklevel=find_stack_level(),
+                )

         self._make_selectors()

@@ -168,6 +168,9 @@ def _indexer_and_to_sort(
         v = self.level

         codes = list(self.index.codes)
+        if not self.sort:
+            # Create new codes considering that labels are already sorted
+            codes = [factorize(code)[0] for code in codes]
         levs = list(self.index.levels)
         to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
         sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])

@@ -186,12 +189,9 @@ def sorted_labels(self) -> list[np.ndarray]:
         return to_sort

     def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
-        if self.sort:
-            indexer, _ = self._indexer_and_to_sort
-
-            sorted_values = algos.take_nd(values, indexer, axis=0)
-            return sorted_values
-        return values
+        indexer, _ = self._indexer_and_to_sort
+        sorted_values = algos.take_nd(values, indexer, axis=0)
+        return sorted_values

     def _make_selectors(self) -> None:
         new_levels = self.new_index_levels

@@ -394,7 +394,13 @@ def _repeater(self) -> np.ndarray:
     @cache_readonly
     def new_index(self) -> MultiIndex | Index:
         # Does not depend on values or value_columns
-        result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
+        if self.sort:
+            labels = self.sorted_labels[:-1]
+        else:
+            v = self.level
+            codes = list(self.index.codes)
+            labels = codes[:v] + codes[v + 1 :]
+        result_codes = [lab.take(self.compressor) for lab in labels]

         # construct the new index
         if len(self.new_index_levels) == 1:

@@ -731,10 +737,10 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index:
     if len(columns.levels) <= 2:
         return columns.levels[0]._rename(name=columns.names[0])

-    levs = [
+    levs = (
         [lev[c] if c >= 0 else None for c in codes]
         for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
-    ]
+    )

     # Remove duplicate tuples in the MultiIndex.
     tuples = zip(*levs)
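
The net effect of the `sort` branches above is that `unstack(sort=False)` now keeps first-appearance order instead of producing misplaced values. A hedged sketch, adapted from the new test below, showing the fixed behavior:

import numpy as np
import pandas as pd

index = pd.MultiIndex.from_tuples(
    [("two", "z"), ("two", "y"), ("one", "z"), ("one", "y")]
)
ser = pd.Series(np.arange(1.0, 5.0), index=index)

# With sort=False, rows and columns follow first appearance ("z" before "y",
# "two" before "one") and each value stays aligned with its original label.
print(ser.unstack(level=0, sort=False))
#    two  one
# z  1.0  3.0
# y  2.0  4.0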

pandas/tests/frame/test_stack_unstack.py (+15)

@@ -1321,6 +1321,21 @@ def test_unstack_sort_false(frame_or_series, dtype):
         [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")]
     )
     obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype)
+
+    result = obj.unstack(level=0, sort=False)
+
+    if frame_or_series is DataFrame:
+        expected_columns = MultiIndex.from_tuples([(0, "two"), (0, "one")])
+    else:
+        expected_columns = ["two", "one"]
+    expected = DataFrame(
+        [[1.0, 3.0], [2.0, 4.0]],
+        index=MultiIndex.from_tuples([("z", "b"), ("y", "a")]),
+        columns=expected_columns,
+        dtype=dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
     result = obj.unstack(level=-1, sort=False)

     if frame_or_series is DataFrame:
