multi-column explode

iynehz · iynehz · commit 527a587767c2 · 2021-04-07T22:37:48.000+08:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7910,16 +7910,23 @@ def stack(self, level: Level = -1, dropna: bool = True):
 
         return result.__finalize__(self, method="stack")
 
-    def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame:
+    def explode(
+        self,
+        column: str | tuple | list[str | tuple],
+        ignore_index: bool = False,
+    ) -> DataFrame:
         """
         Transform each element of a list-like to a row, replicating index values.
 
         .. versionadded:: 0.25.0
 
         Parameters
         ----------
-        column : str or tuple
-            Column to explode.
+        column : str or tuple or list thereof
+            Column(s) to explode.
+            For multiple columns, specify a non-empty list in which each
+            element is a str or tuple; the list-like data of all specified
+            columns must have matching lengths on each row of the frame.
         ignore_index : bool, default False
             If True, the resulting index will be labeled 0, 1, …, n - 1.
 
@@ -7934,7 +7941,10 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame:
         Raises
         ------
         ValueError :
-            if columns of the frame are not unique.
+            * If columns of the frame are not unique.
+            * If the specified list of columns to explode is empty.
+            * If the list-like elements of the specified columns do not have
+              matching lengths row-wise in the frame.
 
         See Also
         --------
@@ -7953,32 +7963,67 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame:
 
         Examples
         --------
-        >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
+        >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
+        ...                    'B': 1,
+        ...                    'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
         >>> df
-                   A  B
-        0  [1, 2, 3]  1
-        1        foo  1
-        2         []  1
-        3     [3, 4]  1
+                   A  B          C
+        0  [0, 1, 2]  1  [a, b, c]
+        1        foo  1        NaN
+        2         []  1         []
+        3     [3, 4]  1     [d, e]
 
         >>> df.explode('A')
-             A  B
-        0    1  1
-        0    2  1
-        0    3  1
-        1  foo  1
-        2  NaN  1
-        3    3  1
-        3    4  1
-        """
-        if not (is_scalar(column) or isinstance(column, tuple)):
-            raise ValueError("column must be a scalar")
+             A  B          C
+        0    0  1  [a, b, c]
+        0    1  1  [a, b, c]
+        0    2  1  [a, b, c]
+        1  foo  1        NaN
+        2  NaN  1         []
+        3    3  1     [d, e]
+        3    4  1     [d, e]
+
+        >>> df.explode(list('AC'))
+             A  B    C
+        0    0  1    a
+        0    1  1    b
+        0    2  1    c
+        1  foo  1  NaN
+        2  NaN  1  NaN
+        3    3  1    d
+        3    4  1    e
+        """
         if not self.columns.is_unique:
             raise ValueError("columns must be unique")
 
+        columns: list[str | tuple]
+        if is_scalar(column) or isinstance(column, tuple):
+            # mypy: List item 0 has incompatible type "Union[str, Tuple[Any, ...],
+            # List[Union[str, Tuple[Any, ...]]]]"; expected
+            # "Union[str, Tuple[Any, ...]]"
+            columns = [column]  # type: ignore[list-item]
+        elif isinstance(column, list) and all(
+            map(lambda c: is_scalar(c) or isinstance(c, tuple), column)
+        ):
+            if len(column) == 0:
+                raise ValueError("column must be nonempty")
+            if len(column) > len(set(column)):
+                raise ValueError("column must be unique")
+            columns = column
+        else:
+            raise ValueError("column must be a scalar, tuple, or list thereof")
+
         df = self.reset_index(drop=True)
-        result = df[column].explode()
-        result = df.drop([column], axis=1).join(result)
+        if len(columns) == 1:
+            result = df[columns[0]].explode()  # `column` may be a 1-element list
+        else:
+            mylen = lambda x: len(x) if is_list_like(x) else -1  # -1: not list-like
+            counts0 = self[columns[0]].apply(mylen)
+            for c in columns[1:]:
+                if not all(counts0 == self[c].apply(mylen)):
+                    raise ValueError("columns must have matching element counts")
+            result = DataFrame({c: df[c].explode() for c in columns})
+        result = df.drop(columns, axis=1).join(result)
         if ignore_index:
             result.index = ibase.default_index(len(result))
         else:
diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py
@@ -9,13 +9,34 @@ def test_error():
     df = pd.DataFrame(
         {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
     )
-    with pytest.raises(ValueError, match="column must be a scalar"):
+    with pytest.raises(
+        ValueError, match="column must be a scalar, tuple, or list thereof"
+    ):
+        df.explode([list("AA")])
+
+    with pytest.raises(ValueError, match="column must be unique"):
         df.explode(list("AA"))
 
     df.columns = list("AA")
     with pytest.raises(ValueError, match="columns must be unique"):
         df.explode("A")
 
+    # GH 39240
+    df1 = df.assign(C=[["a", "b", "c"], "foo", [], ["d", "e", "f"]])
+    df1.columns = list("ABC")
+    with pytest.raises(ValueError, match="columns must have matching element counts"):
+        df1.explode(list("AC"))
+
+    # GH 39240
+    with pytest.raises(ValueError, match="column must be nonempty"):
+        df1.explode([])
+
+    # GH 39240
+    df2 = df.assign(C=[["a", "b", "c"], "foo", [], "d"])
+    df2.columns = list("ABC")
+    with pytest.raises(ValueError, match="columns must have matching element counts"):
+        df2.explode(list("AC"))
+
 
 def test_basic():
     df = pd.DataFrame(
@@ -180,3 +201,25 @@ def test_explode_sets():
     result = df.explode(column="a").sort_values(by="a")
     expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
     tm.assert_frame_equal(result, expected)
+
+
+def test_multi_columns():
+    # GH 39240: exploding columns A and C together keeps their elements row-aligned
+    df = pd.DataFrame(
+        {
+            "A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")),
+            "B": 1,
+            "C": [["a", "b", "c"], "foo", [], ["d", "e"]],
+        }
+    )
+    result = df.explode(list("AC"))
+    expected = pd.DataFrame(
+        {
+            "A": pd.Series(
+                [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
+            ),
+            "B": 1,
+            "C": ["a", "b", "c", "foo", np.nan, "d", "e"],
+        }
+    )
+    tm.assert_frame_equal(result, expected)