Skip to content

Commit 41a94b0

Browse files
authored
ENH: multi-column explode (#39240) (#40770)
1 parent 1079cd2 commit 41a94b0

File tree

3 files changed

+167
-24
lines changed

3 files changed

+167
-24
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ Other enhancements
275275
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
276276
- :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
277277
- Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`)
278+
- :meth:`DataFrame.explode` now supports exploding multiple columns. Its ``column`` argument now also accepts a list of str or tuples for exploding on multiple columns at the same time (:issue:`39240`)
278279

279280
.. ---------------------------------------------------------------------------
280281

pandas/core/frame.py

+74-23
Original file line numberDiff line numberDiff line change
@@ -8145,16 +8145,27 @@ def stack(self, level: Level = -1, dropna: bool = True):
81458145

81468146
return result.__finalize__(self, method="stack")
81478147

8148-
def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame:
8148+
def explode(
8149+
self,
8150+
column: str | tuple | list[str | tuple],
8151+
ignore_index: bool = False,
8152+
) -> DataFrame:
81498153
"""
81508154
Transform each element of a list-like to a row, replicating index values.
81518155
81528156
.. versionadded:: 0.25.0
81538157
81548158
Parameters
81558159
----------
8156-
column : str or tuple
8157-
Column to explode.
8160+
column : str or tuple or list thereof
8161+
Column(s) to explode.
8162+
For multiple columns, specify a non-empty list in which each element
8163+
is a str or tuple; the list-like data of all specified columns
8164+
on the same row of the frame must have matching lengths.
8165+
8166+
.. versionadded:: 1.3.0
8167+
Multi-column explode
8168+
81588169
ignore_index : bool, default False
81598170
If True, the resulting index will be labeled 0, 1, …, n - 1.
81608171
@@ -8169,7 +8180,10 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame:
81698180
Raises
81708181
------
81718182
ValueError :
8172-
if columns of the frame are not unique.
8183+
* If columns of the frame are not unique.
8184+
* If the specified columns to explode is an empty list.
8185+
* If the specified columns to explode do not have matching counts of
8186+
elements row-wise in the frame.
81738187
81748188
See Also
81758189
--------
@@ -8188,32 +8202,69 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame:
81888202
81898203
Examples
81908204
--------
8191-
>>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
8205+
>>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
8206+
... 'B': 1,
8207+
... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
81928208
>>> df
8193-
A B
8194-
0 [1, 2, 3] 1
8195-
1 foo 1
8196-
2 [] 1
8197-
3 [3, 4] 1
8209+
A B C
8210+
0 [0, 1, 2] 1 [a, b, c]
8211+
1 foo 1 NaN
8212+
2 [] 1 []
8213+
3 [3, 4] 1 [d, e]
8214+
8215+
Single-column explode.
81988216
81998217
>>> df.explode('A')
8200-
A B
8201-
0 1 1
8202-
0 2 1
8203-
0 3 1
8204-
1 foo 1
8205-
2 NaN 1
8206-
3 3 1
8207-
3 4 1
8208-
"""
8209-
if not (is_scalar(column) or isinstance(column, tuple)):
8210-
raise ValueError("column must be a scalar")
8218+
A B C
8219+
0 0 1 [a, b, c]
8220+
0 1 1 [a, b, c]
8221+
0 2 1 [a, b, c]
8222+
1 foo 1 NaN
8223+
2 NaN 1 []
8224+
3 3 1 [d, e]
8225+
3 4 1 [d, e]
8226+
8227+
Multi-column explode.
8228+
8229+
>>> df.explode(list('AC'))
8230+
A B C
8231+
0 0 1 a
8232+
0 1 1 b
8233+
0 2 1 c
8234+
1 foo 1 NaN
8235+
2 NaN 1 NaN
8236+
3 3 1 d
8237+
3 4 1 e
8238+
"""
82118239
if not self.columns.is_unique:
82128240
raise ValueError("columns must be unique")
82138241

8242+
columns: list[str | tuple]
8243+
if is_scalar(column) or isinstance(column, tuple):
8244+
assert isinstance(column, (str, tuple))
8245+
columns = [column]
8246+
elif isinstance(column, list) and all(
8247+
map(lambda c: is_scalar(c) or isinstance(c, tuple), column)
8248+
):
8249+
if not column:
8250+
raise ValueError("column must be nonempty")
8251+
if len(column) > len(set(column)):
8252+
raise ValueError("column must be unique")
8253+
columns = column
8254+
else:
8255+
raise ValueError("column must be a scalar, tuple, or list thereof")
8256+
82148257
df = self.reset_index(drop=True)
8215-
result = df[column].explode()
8216-
result = df.drop([column], axis=1).join(result)
8258+
if len(columns) == 1:
8259+
result = df[columns[0]].explode()
8260+
else:
8261+
mylen = lambda x: len(x) if is_list_like(x) else -1
8262+
counts0 = self[columns[0]].apply(mylen)
8263+
for c in columns[1:]:
8264+
if not all(counts0 == self[c].apply(mylen)):
8265+
raise ValueError("columns must have matching element counts")
8266+
result = DataFrame({c: df[c].explode() for c in columns})
8267+
result = df.drop(columns, axis=1).join(result)
82178268
if ignore_index:
82188269
result.index = ibase.default_index(len(result))
82198270
else:

pandas/tests/frame/methods/test_explode.py

+92-1
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,50 @@ def test_error():
99
df = pd.DataFrame(
1010
{"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
1111
)
12-
with pytest.raises(ValueError, match="column must be a scalar"):
12+
with pytest.raises(
13+
ValueError, match="column must be a scalar, tuple, or list thereof"
14+
):
15+
df.explode([list("AA")])
16+
17+
with pytest.raises(ValueError, match="column must be unique"):
1318
df.explode(list("AA"))
1419

1520
df.columns = list("AA")
1621
with pytest.raises(ValueError, match="columns must be unique"):
1722
df.explode("A")
1823

1924

25+
@pytest.mark.parametrize(
26+
"input_subset, error_message",
27+
[
28+
(
29+
list("AC"),
30+
"columns must have matching element counts",
31+
),
32+
(
33+
[],
34+
"column must be nonempty",
35+
),
36+
(
37+
list("AC"),
38+
"columns must have matching element counts",
39+
),
40+
],
41+
)
42+
def test_error_multi_columns(input_subset, error_message):
43+
# GH 39240
44+
df = pd.DataFrame(
45+
{
46+
"A": [[0, 1, 2], np.nan, [], (3, 4)],
47+
"B": 1,
48+
"C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]],
49+
},
50+
index=list("abcd"),
51+
)
52+
with pytest.raises(ValueError, match=error_message):
53+
df.explode(input_subset)
54+
55+
2056
def test_basic():
2157
df = pd.DataFrame(
2258
{"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
@@ -180,3 +216,58 @@ def test_explode_sets():
180216
result = df.explode(column="a").sort_values(by="a")
181217
expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
182218
tm.assert_frame_equal(result, expected)
219+
220+
221+
@pytest.mark.parametrize(
222+
"input_subset, expected_dict, expected_index",
223+
[
224+
(
225+
list("AC"),
226+
{
227+
"A": pd.Series(
228+
[0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
229+
index=list("aaabcdde"),
230+
dtype=object,
231+
),
232+
"B": 1,
233+
"C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan],
234+
},
235+
list("aaabcdde"),
236+
),
237+
(
238+
list("A"),
239+
{
240+
"A": pd.Series(
241+
[0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
242+
index=list("aaabcdde"),
243+
dtype=object,
244+
),
245+
"B": 1,
246+
"C": [
247+
["a", "b", "c"],
248+
["a", "b", "c"],
249+
["a", "b", "c"],
250+
"foo",
251+
[],
252+
["d", "e"],
253+
["d", "e"],
254+
np.nan,
255+
],
256+
},
257+
list("aaabcdde"),
258+
),
259+
],
260+
)
261+
def test_multi_columns(input_subset, expected_dict, expected_index):
262+
# GH 39240
263+
df = pd.DataFrame(
264+
{
265+
"A": [[0, 1, 2], np.nan, [], (3, 4), np.nan],
266+
"B": 1,
267+
"C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan],
268+
},
269+
index=list("abcde"),
270+
)
271+
result = df.explode(input_subset)
272+
expected = pd.DataFrame(expected_dict, expected_index)
273+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)