Skip to content

Commit 19a07c3

Browse files
Merge branch 'pandas-dev:main' into Fix#58748
2 parents 5ad5d89 + 695b170 commit 19a07c3

File tree

9 files changed

+247
-166
lines changed

9 files changed

+247
-166
lines changed

ci/code_checks.sh

-1
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
440440
-i "pandas.errors.UnsortedIndexError SA01" \
441441
-i "pandas.errors.UnsupportedFunctionCall SA01" \
442442
-i "pandas.errors.ValueLabelTypeMismatch SA01" \
443-
-i "pandas.get_option SA01" \
444443
-i "pandas.infer_freq SA01" \
445444
-i "pandas.interval_range RT03" \
446445
-i "pandas.io.formats.style.Styler.apply RT03" \

doc/source/user_guide/merging.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -763,7 +763,7 @@ Joining a single Index to a MultiIndex
763763
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
764764

765765
You can join a :class:`DataFrame` with a :class:`Index` to a :class:`DataFrame` with a :class:`MultiIndex` on a level.
766-
The ``name`` of the :class:`Index` with match the level name of the :class:`MultiIndex`.
766+
The ``name`` of the :class:`Index` will match the level name of the :class:`MultiIndex`.
767767

768768
.. ipython:: python
769769

pandas/_config/config.py

+6
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,12 @@ def get_option(pat: str) -> Any:
157157
------
158158
OptionError : if no such option exists
159159
160+
See Also
161+
--------
162+
set_option : Set the value of the specified option or options.
163+
reset_option : Reset one or more options to their default value.
164+
describe_option : Print the description for one or more registered options.
165+
160166
Notes
161167
-----
162168
For all available options, please view the :ref:`User Guide <options.available>`

pandas/core/reshape/reshape.py

+78-51
Original file line numberDiff line numberDiff line change
@@ -925,27 +925,99 @@ def _reorder_for_extension_array_stack(
925925
def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
926926
if frame.columns.nunique() != len(frame.columns):
927927
raise ValueError("Columns with duplicate values are not supported in stack")
928-
929-
# If we need to drop `level` from columns, it needs to be in descending order
930928
set_levels = set(level)
931-
drop_levnums = sorted(level, reverse=True)
932929
stack_cols = frame.columns._drop_level_numbers(
933930
[k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels]
934931
)
932+
933+
result = stack_reshape(frame, level, set_levels, stack_cols)
934+
935+
# Construct the correct MultiIndex by combining the frame's index and
936+
# stacked columns.
937+
ratio = 0 if frame.empty else len(result) // len(frame)
938+
939+
index_levels: list | FrozenList
940+
if isinstance(frame.index, MultiIndex):
941+
index_levels = frame.index.levels
942+
index_codes = list(np.tile(frame.index.codes, (1, ratio)))
943+
else:
944+
codes, uniques = factorize(frame.index, use_na_sentinel=False)
945+
index_levels = [uniques]
946+
index_codes = list(np.tile(codes, (1, ratio)))
947+
935948
if len(level) > 1:
936949
# Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
937950
sorter = np.argsort(level)
938951
assert isinstance(stack_cols, MultiIndex)
939952
ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
940953
else:
941954
ordered_stack_cols = stack_cols
942-
943-
stack_cols_unique = stack_cols.unique()
944955
ordered_stack_cols_unique = ordered_stack_cols.unique()
956+
if isinstance(ordered_stack_cols, MultiIndex):
957+
column_levels = ordered_stack_cols.levels
958+
column_codes = ordered_stack_cols.drop_duplicates().codes
959+
else:
960+
column_levels = [ordered_stack_cols_unique]
961+
column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
962+
963+
# error: Incompatible types in assignment (expression has type "list[ndarray[Any,
964+
# dtype[Any]]]", variable has type "FrozenList")
965+
column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment]
966+
result.index = MultiIndex(
967+
levels=index_levels + column_levels,
968+
codes=index_codes + column_codes,
969+
names=frame.index.names + list(ordered_stack_cols.names),
970+
verify_integrity=False,
971+
)
972+
973+
# sort result, but faster than calling sort_index since we know the order we need
974+
len_df = len(frame)
975+
n_uniques = len(ordered_stack_cols_unique)
976+
indexer = np.arange(n_uniques)
977+
idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
978+
result = result.take(idxs)
979+
980+
# Reshape/rename if needed and dropna
981+
if result.ndim == 2 and frame.columns.nlevels == len(level):
982+
if len(result.columns) == 0:
983+
result = Series(index=result.index)
984+
else:
985+
result = result.iloc[:, 0]
986+
if result.ndim == 1:
987+
result.name = None
988+
989+
return result
990+
991+
992+
def stack_reshape(
993+
frame: DataFrame, level: list[int], set_levels: set[int], stack_cols: Index
994+
) -> Series | DataFrame:
995+
"""Reshape the data of a frame for stack.
996+
997+
This function takes care of most of the work that stack needs to do. Caller
998+
will sort the result once the appropriate index is set.
999+
1000+
Parameters
1001+
----------
1002+
frame : DataFrame
1003+
DataFrame that is to be stacked.
1004+
level : list of ints
1005+
Levels of the columns to stack.
1006+
set_levels : set of ints
1007+
Same as level, but as a set.
1008+
stack_cols : Index
1009+
Columns of the result when the DataFrame is stacked.
1010+
1011+
Returns
1012+
-------
1013+
The data behind the stacked DataFrame.
1014+
"""
1015+
# If we need to drop `level` from columns, it needs to be in descending order
1016+
drop_levnums = sorted(level, reverse=True)
9451017

9461018
# Grab data for each unique index to be stacked
9471019
buf = []
948-
for idx in stack_cols_unique:
1020+
for idx in stack_cols.unique():
9491021
if len(frame.columns) == 1:
9501022
data = frame.copy()
9511023
else:
@@ -972,10 +1044,8 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
9721044
data.columns = RangeIndex(len(data.columns))
9731045
buf.append(data)
9741046

975-
result: Series | DataFrame
9761047
if len(buf) > 0 and not frame.empty:
9771048
result = concat(buf, ignore_index=True)
978-
ratio = len(result) // len(frame)
9791049
else:
9801050
# input is empty
9811051
if len(level) < frame.columns.nlevels:
@@ -984,54 +1054,11 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
9841054
else:
9851055
new_columns = [0]
9861056
result = DataFrame(columns=new_columns, dtype=frame._values.dtype)
987-
ratio = 0
9881057

9891058
if len(level) < frame.columns.nlevels:
9901059
# concat column order may be different from dropping the levels
9911060
desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
9921061
if not result.columns.equals(desired_columns):
9931062
result = result[desired_columns]
9941063

995-
# Construct the correct MultiIndex by combining the frame's index and
996-
# stacked columns.
997-
index_levels: list | FrozenList
998-
if isinstance(frame.index, MultiIndex):
999-
index_levels = frame.index.levels
1000-
index_codes = list(np.tile(frame.index.codes, (1, ratio)))
1001-
else:
1002-
codes, uniques = factorize(frame.index, use_na_sentinel=False)
1003-
index_levels = [uniques]
1004-
index_codes = list(np.tile(codes, (1, ratio)))
1005-
if isinstance(ordered_stack_cols, MultiIndex):
1006-
column_levels = ordered_stack_cols.levels
1007-
column_codes = ordered_stack_cols.drop_duplicates().codes
1008-
else:
1009-
column_levels = [ordered_stack_cols.unique()]
1010-
column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
1011-
# error: Incompatible types in assignment (expression has type "list[ndarray[Any,
1012-
# dtype[Any]]]", variable has type "FrozenList")
1013-
column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment]
1014-
result.index = MultiIndex(
1015-
levels=index_levels + column_levels,
1016-
codes=index_codes + column_codes,
1017-
names=frame.index.names + list(ordered_stack_cols.names),
1018-
verify_integrity=False,
1019-
)
1020-
1021-
# sort result, but faster than calling sort_index since we know the order we need
1022-
len_df = len(frame)
1023-
n_uniques = len(ordered_stack_cols_unique)
1024-
indexer = np.arange(n_uniques)
1025-
idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
1026-
result = result.take(idxs)
1027-
1028-
# Reshape/rename if needed and dropna
1029-
if result.ndim == 2 and frame.columns.nlevels == len(level):
1030-
if len(result.columns) == 0:
1031-
result = Series(index=result.index)
1032-
else:
1033-
result = result.iloc[:, 0]
1034-
if result.ndim == 1:
1035-
result.name = None
1036-
10371064
return result

pandas/tests/groupby/test_apply.py

+34-21
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,8 @@ def test_groupby_as_index_apply():
322322
tm.assert_index_equal(res_as_apply, exp_as_apply)
323323
tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
324324

325+
326+
def test_groupby_as_index_apply_str():
325327
ind = Index(list("abcde"))
326328
df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
327329
msg = "DataFrameGroupBy.apply operated on the grouping columns"
@@ -379,8 +381,8 @@ def f(piece):
379381
{"value": piece, "demeaned": piece - piece.mean(), "logged": logged}
380382
)
381383

382-
dr = bdate_range("1/1/2000", periods=100)
383-
ts = Series(np.random.default_rng(2).standard_normal(100), index=dr)
384+
dr = bdate_range("1/1/2000", periods=10)
385+
ts = Series(np.random.default_rng(2).standard_normal(10), index=dr)
384386

385387
grouped = ts.groupby(lambda x: x.month, group_keys=False)
386388
result = grouped.apply(f)
@@ -639,13 +641,13 @@ def reindex_helper(x):
639641
def test_apply_corner_cases():
640642
# #535, can't use sliding iterator
641643

642-
N = 1000
644+
N = 10
643645
labels = np.random.default_rng(2).integers(0, 100, size=N)
644646
df = DataFrame(
645647
{
646648
"key": labels,
647649
"value1": np.random.default_rng(2).standard_normal(N),
648-
"value2": ["foo", "bar", "baz", "qux"] * (N // 4),
650+
"value2": ["foo", "bar", "baz", "qux", "a"] * (N // 5),
649651
}
650652
)
651653

@@ -680,6 +682,8 @@ def test_apply_numeric_coercion_when_datetime():
680682
result = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
681683
tm.assert_series_equal(result["Str"], expected["Str"])
682684

685+
686+
def test_apply_numeric_coercion_when_datetime_getitem():
683687
# GH 15421
684688
df = DataFrame(
685689
{"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3}
@@ -695,6 +699,8 @@ def get_B(g):
695699
expected.index = df.A
696700
tm.assert_series_equal(result, expected)
697701

702+
703+
def test_apply_numeric_coercion_when_datetime_with_nat():
698704
# GH 14423
699705
def predictions(tool):
700706
out = Series(index=["p1", "p2", "useTime"], dtype=object)
@@ -843,10 +849,24 @@ def test_func(x):
843849
tm.assert_frame_equal(result, expected)
844850

845851

846-
def test_groupby_apply_none_first():
852+
@pytest.mark.parametrize(
853+
"in_data, out_idx, out_data",
854+
[
855+
[
856+
{"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]},
857+
[[1, 1], [0, 2]],
858+
{"groups": [1, 1], "vars": [0, 2]},
859+
],
860+
[
861+
{"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]},
862+
[[2, 2], [1, 3]],
863+
{"groups": [2, 2], "vars": [1, 3]},
864+
],
865+
],
866+
)
867+
def test_groupby_apply_none_first(in_data, out_idx, out_data):
847868
# GH 12824. Tests if apply returns None first.
848-
test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
849-
test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})
869+
test_df1 = DataFrame(in_data)
850870

851871
def test_func(x):
852872
if x.shape[0] < 2:
@@ -856,14 +876,9 @@ def test_func(x):
856876
msg = "DataFrameGroupBy.apply operated on the grouping columns"
857877
with tm.assert_produces_warning(DeprecationWarning, match=msg):
858878
result1 = test_df1.groupby("groups").apply(test_func)
859-
with tm.assert_produces_warning(DeprecationWarning, match=msg):
860-
result2 = test_df2.groupby("groups").apply(test_func)
861-
index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None])
862-
index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None])
863-
expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
864-
expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2)
879+
index1 = MultiIndex.from_arrays(out_idx, names=["groups", None])
880+
expected1 = DataFrame(out_data, index=index1)
865881
tm.assert_frame_equal(result1, expected1)
866-
tm.assert_frame_equal(result2, expected2)
867882

868883

869884
def test_groupby_apply_return_empty_chunk():
@@ -883,18 +898,16 @@ def test_groupby_apply_return_empty_chunk():
883898
tm.assert_series_equal(result, expected)
884899

885900

886-
def test_apply_with_mixed_types():
901+
@pytest.mark.parametrize("meth", ["apply", "transform"])
902+
def test_apply_with_mixed_types(meth):
887903
# gh-20949
888904
df = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
889905
g = df.groupby("A", group_keys=False)
890906

891-
result = g.transform(lambda x: x / x.sum())
907+
result = getattr(g, meth)(lambda x: x / x.sum())
892908
expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]})
893909
tm.assert_frame_equal(result, expected)
894910

895-
result = g.apply(lambda x: x / x.sum())
896-
tm.assert_frame_equal(result, expected)
897-
898911

899912
def test_func_returns_object():
900913
# GH 28652
@@ -1106,7 +1119,7 @@ def test_apply_function_with_indexing_return_column():
11061119

11071120
@pytest.mark.parametrize(
11081121
"udf",
1109-
[(lambda x: x.copy()), (lambda x: x.copy().rename(lambda y: y + 1))],
1122+
[lambda x: x.copy(), lambda x: x.copy().rename(lambda y: y + 1)],
11101123
)
11111124
@pytest.mark.parametrize("group_keys", [True, False])
11121125
def test_apply_result_type(group_keys, udf):
@@ -1214,7 +1227,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
12141227
expected = df.iloc[[0, 2, 3]]
12151228
expected = expected.reset_index()
12161229
expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]])
1217-
expected = expected.drop(columns="idx")
1230+
expected = expected.drop(columns=["idx"])
12181231

12191232
tm.assert_frame_equal(result, expected)
12201233
for val in result.index.levels[1]:

0 commit comments

Comments
 (0)