Skip to content

Commit 181a1d1

Browse files
Merge branch 'pandas-dev:main' into Fix#58748
2 parents 520fafe + b162331 commit 181a1d1

File tree

12 files changed

+94
-50
lines changed

12 files changed

+94
-50
lines changed

doc/source/development/maintaining.rst

+8-2
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ Here's a typical workflow for triaging a newly opened issue.
8484
example. See https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports
8585
for a good explanation. If the example is not reproducible, or if it's
8686
*clearly* not minimal, feel free to ask the reporter if they can provide
87-
and example or simplify the provided one. Do acknowledge that writing
87+
an example or simplify the provided one. Do acknowledge that writing
8888
minimal reproducible examples is hard work. If the reporter is struggling,
8989
you can try to write one yourself and we'll edit the original post to include it.
9090

@@ -93,6 +93,9 @@ Here's a typical workflow for triaging a newly opened issue.
9393
If a reproducible example is provided, but you see a simplification,
9494
edit the original post with your simpler reproducible example.
9595

96+
If this is a regression report, post the result of a ``git bisect`` run.
97+
More info on this can be found in the :ref:`maintaining.regressions` section.
98+
9699
Ensure the issue exists on the main branch and that it has the "Needs Triage" tag
97100
until all steps have been completed. Add a comment to the issue once you have
98101
verified it exists on the main branch, so others know it has been confirmed.
@@ -125,7 +128,10 @@ Here's a typical workflow for triaging a newly opened issue.
125128
If the issue is clearly defined and the fix seems relatively straightforward,
126129
label the issue as "Good first issue".
127130

128-
Once you have completed the above, make sure to remove the "needs triage" label.
131+
If the issue is a regression report, add the "Regression" label and the next patch
132+
release milestone.
133+
134+
Once you have completed the above, make sure to remove the "Needs Triage" label.
129135

130136
.. _maintaining.regressions:
131137

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,7 @@ Missing
440440
MultiIndex
441441
^^^^^^^^^^
442442
- :func:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`)
443+
- :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
443444
-
444445

445446
I/O

pandas/core/apply.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -664,7 +664,7 @@ def _apply_str(self, obj, func: str, *args, **kwargs):
664664
# people may aggregate on a non-callable attribute
665665
# but don't let them think they can pass args to it
666666
assert len(args) == 0
667-
assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
667+
assert not any(kwarg == "axis" for kwarg in kwargs)
668668
return f
669669
elif hasattr(np, func) and hasattr(obj, "__array__"):
670670
# in particular exclude Window

pandas/core/computation/eval.py

+2
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,8 @@ def eval(
193193
corresponding bitwise operators. :class:`~pandas.Series` and
194194
:class:`~pandas.DataFrame` objects are supported and behave as they would
195195
with plain ol' Python evaluation.
196+
`eval` can run arbitrary code which can make you vulnerable to code
197+
injection if you pass user input to this function.
196198
197199
Parameters
198200
----------

pandas/core/frame.py

+3
Original file line numberDiff line numberDiff line change
@@ -4472,6 +4472,9 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
44724472
"""
44734473
Query the columns of a DataFrame with a boolean expression.
44744474
4475+
This method can run arbitrary code which can make you vulnerable to code
4476+
injection if you pass user input to this method.
4477+
44754478
Parameters
44764479
----------
44774480
expr : str

pandas/core/generic.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -1750,19 +1750,25 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike
17501750
if `key` matches multiple labels
17511751
"""
17521752
axis = self._get_axis_number(axis)
1753-
other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
1753+
first_other_axes = next(
1754+
(ax for ax in range(self._AXIS_LEN) if ax != axis), None
1755+
)
17541756

17551757
if self._is_label_reference(key, axis=axis):
17561758
self._check_label_or_level_ambiguity(key, axis=axis)
1757-
values = self.xs(key, axis=other_axes[0])._values
1759+
if first_other_axes is None:
1760+
raise ValueError("axis matched all axes")
1761+
values = self.xs(key, axis=first_other_axes)._values
17581762
elif self._is_level_reference(key, axis=axis):
17591763
values = self.axes[axis].get_level_values(key)._values
17601764
else:
17611765
raise KeyError(key)
17621766

17631767
# Check for duplicates
17641768
if values.ndim > 1:
1765-
if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
1769+
if first_other_axes is not None and isinstance(
1770+
self._get_axis(first_other_axes), MultiIndex
1771+
):
17661772
multi_message = (
17671773
"\n"
17681774
"For a multi-index, the label must be a "

pandas/core/reshape/melt.py

+17-4
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55

66
import numpy as np
77

8-
from pandas.core.dtypes.common import is_list_like
8+
from pandas.core.dtypes.common import (
9+
is_iterator,
10+
is_list_like,
11+
)
912
from pandas.core.dtypes.concat import concat_compat
1013
from pandas.core.dtypes.missing import notna
1114

@@ -64,9 +67,10 @@ def melt(
6467
value_vars : scalar, tuple, list, or ndarray, optional
6568
Column(s) to unpivot. If not specified, uses all columns that
6669
are not set as `id_vars`.
67-
var_name : scalar, default None
70+
var_name : scalar, tuple, list, or ndarray, optional
6871
Name to use for the 'variable' column. If None it uses
69-
``frame.columns.name`` or 'variable'.
72+
``frame.columns.name`` or 'variable'. Must be a scalar if columns are not a
73+
MultiIndex.
7074
value_name : scalar, default 'value'
7175
Name to use for the 'value' column, can't be an existing column label.
7276
col_level : scalar, optional
@@ -217,7 +221,16 @@ def melt(
217221
frame.columns.name if frame.columns.name is not None else "variable"
218222
]
219223
elif is_list_like(var_name):
220-
raise ValueError(f"{var_name=} must be a scalar.")
224+
if isinstance(frame.columns, MultiIndex):
225+
if is_iterator(var_name):
226+
var_name = list(var_name)
227+
if len(var_name) > len(frame.columns):
228+
raise ValueError(
229+
f"{var_name=} has {len(var_name)} items, "
230+
f"but the dataframe columns only have {len(frame.columns)} levels."
231+
)
232+
else:
233+
raise ValueError(f"{var_name=} must be a scalar.")
221234
else:
222235
var_name = [var_name]
223236

pandas/io/excel/_base.py

+9-10
Original file line numberDiff line numberDiff line change
@@ -857,24 +857,23 @@ def _parse_sheet(
857857
# a row containing just the index name(s)
858858
has_index_names = False
859859
if is_list_header and not is_len_one_list_header and index_col is not None:
860-
index_col_list: Sequence[int]
860+
index_col_set: set[int]
861861
if isinstance(index_col, int):
862-
index_col_list = [index_col]
862+
index_col_set = {index_col}
863863
else:
864864
assert isinstance(index_col, Sequence)
865-
index_col_list = index_col
865+
index_col_set = set(index_col)
866866

867867
# We have to handle a MultiIndex without names. If any of the entries in the data
868868
# columns are not empty, this is a regular row
869869
assert isinstance(header, Sequence)
870870
if len(header) < len(data):
871871
potential_index_names = data[len(header)]
872-
potential_data = [
873-
x
872+
has_index_names = all(
873+
x == "" or x is None
874874
for i, x in enumerate(potential_index_names)
875-
if not control_row[i] and i not in index_col_list
876-
]
877-
has_index_names = all(x == "" or x is None for x in potential_data)
875+
if not control_row[i] and i not in index_col_set
876+
)
878877

879878
if is_list_like(index_col):
880879
# Forward fill values for MultiIndex index.
@@ -1457,9 +1456,9 @@ def inspect_excel_format(
14571456
with zipfile.ZipFile(stream) as zf:
14581457
# Workaround for some third party files that use backslashes and
14591458
# differently-cased names: normalize to forward slashes and lower case.
1460-
component_names = [
1459+
component_names = {
14611460
name.replace("\\", "/").lower() for name in zf.namelist()
1462-
]
1461+
}
14631462

14641463
if "xl/workbook.xml" in component_names:
14651464
return "xlsx"

pandas/io/excel/_odfreader.py

+16-20
Original file line numberDiff line numberDiff line change
@@ -122,29 +122,25 @@ def get_sheet_data(
122122
table: list[list[Scalar | NaTType]] = []
123123

124124
for sheet_row in sheet_rows:
125-
sheet_cells = [
126-
x
127-
for x in sheet_row.childNodes
128-
if hasattr(x, "qname") and x.qname in cell_names
129-
]
130125
empty_cells = 0
131126
table_row: list[Scalar | NaTType] = []
132127

133-
for sheet_cell in sheet_cells:
134-
if sheet_cell.qname == table_cell_name:
135-
value = self._get_cell_value(sheet_cell)
136-
else:
137-
value = self.empty_value
138-
139-
column_repeat = self._get_column_repeat(sheet_cell)
140-
141-
# Queue up empty values, writing only if content succeeds them
142-
if value == self.empty_value:
143-
empty_cells += column_repeat
144-
else:
145-
table_row.extend([self.empty_value] * empty_cells)
146-
empty_cells = 0
147-
table_row.extend([value] * column_repeat)
128+
for sheet_cell in sheet_row.childNodes:
129+
if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names:
130+
if sheet_cell.qname == table_cell_name:
131+
value = self._get_cell_value(sheet_cell)
132+
else:
133+
value = self.empty_value
134+
135+
column_repeat = self._get_column_repeat(sheet_cell)
136+
137+
# Queue up empty values, writing only if content succeeds them
138+
if value == self.empty_value:
139+
empty_cells += column_repeat
140+
else:
141+
table_row.extend([self.empty_value] * empty_cells)
142+
empty_cells = 0
143+
table_row.extend([value] * column_repeat)
148144

149145
if max_row_len < len(table_row):
150146
max_row_len = len(table_row)

pandas/io/excel/_xlrd.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -128,16 +128,13 @@ def _parse_cell(cell_contents, cell_typ):
128128
cell_contents = val
129129
return cell_contents
130130

131-
data = []
132-
133131
nrows = sheet.nrows
134132
if file_rows_needed is not None:
135133
nrows = min(nrows, file_rows_needed)
136-
for i in range(nrows):
137-
row = [
134+
return [
135+
[
138136
_parse_cell(value, typ)
139137
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
140138
]
141-
data.append(row)
142-
143-
return data
139+
for i in range(nrows)
140+
]

pandas/io/sql.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ def _convert_arrays_to_dataframe(
157157
dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
158158
) -> DataFrame:
159159
content = lib.to_object_array_tuples(data)
160+
idx_len = content.shape[0]
160161
arrays = convert_object_array(
161162
list(content.T),
162163
dtype=None,
@@ -177,9 +178,9 @@ def _convert_arrays_to_dataframe(
177178
result_arrays.append(ArrowExtensionArray(pa_array))
178179
arrays = result_arrays # type: ignore[assignment]
179180
if arrays:
180-
df = DataFrame(dict(zip(range(len(columns)), arrays)))
181-
df.columns = columns
182-
return df
181+
return DataFrame._from_arrays(
182+
arrays, columns=columns, index=range(idx_len), verify_integrity=False
183+
)
183184
else:
184185
return DataFrame(columns=columns)
185186

pandas/tests/reshape/test_melt.py

+20
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,26 @@ def test_melt_non_scalar_var_name_raises(self):
533533
with pytest.raises(ValueError, match=r".* must be a scalar."):
534534
df.melt(id_vars=["a"], var_name=[1, 2])
535535

536+
def test_melt_multiindex_columns_var_name(self):
537+
# GH 58033
538+
df = DataFrame({("A", "a"): [1], ("A", "b"): [2]})
539+
540+
expected = DataFrame(
541+
[("A", "a", 1), ("A", "b", 2)], columns=["first", "second", "value"]
542+
)
543+
544+
tm.assert_frame_equal(df.melt(var_name=["first", "second"]), expected)
545+
tm.assert_frame_equal(df.melt(var_name=["first"]), expected[["first", "value"]])
546+
547+
def test_melt_multiindex_columns_var_name_too_many(self):
548+
# GH 58033
549+
df = DataFrame({("A", "a"): [1], ("A", "b"): [2]})
550+
551+
with pytest.raises(
552+
ValueError, match="but the dataframe columns only have 2 levels"
553+
):
554+
df.melt(var_name=["first", "second", "third"])
555+
536556

537557
class TestLreshape:
538558
def test_pairs(self):

0 commit comments

Comments
 (0)