BUG: Avoid duplicating entire exploded column (#28010)

MarcoGorelli · TomAugspurger · commit 372a9a03608f · 2019-09-17T08:38:13.000-05:00
Closes #28005
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -181,6 +181,7 @@ Indexing
 ^^^^^^^^
 
 - Bug in assignment using a reverse slicer (:issue:`26939`)
+- Bug in :meth:`DataFrame.explode` where the exploded frame would contain duplicated rows when the index had duplicate labels (:issue:`28010`)
 - Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`)
 - Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6304,12 +6304,13 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame":
         if not self.columns.is_unique:
             raise ValueError("columns must be unique")
 
-        result = self[column].explode()
-        return (
-            self.drop([column], axis=1)
-            .join(result)
-            .reindex(columns=self.columns, copy=False)
-        )
+        df = self.reset_index(drop=True)
+        result = df[column].explode()
+        result = df.drop([column], axis=1).join(result)
+        result.index = self.index.take(result.index)
+        result = result.reindex(columns=self.columns, copy=False)
+
+        return result
 
     def unstack(self, level=-1, fill_value=None):
         """
diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py
@@ -118,3 +118,47 @@ def test_usecase():
         index=[0, 0, 1, 1],
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "input_dict, input_index, expected_dict, expected_index",
+    [
+        (  # plain list index with a repeated label
+            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
+            [0, 0],
+            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
+            [0, 0, 0, 0],
+        ),
+        (  # named Index: the name is expected on the result as well
+            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
+            pd.Index([0, 0], name="my_index"),
+            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
+            pd.Index([0, 0, 0, 0], name="my_index"),
+        ),
+        (  # MultiIndex with both levels named
+            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
+            pd.MultiIndex.from_arrays(
+                [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"]
+            ),
+            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
+            pd.MultiIndex.from_arrays(
+                [[0, 0, 0, 0], [1, 1, 1, 1]],
+                names=["my_first_index", "my_second_index"],
+            ),
+        ),
+        (  # MultiIndex where one level name is None
+            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
+            pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]),
+            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
+            pd.MultiIndex.from_arrays(
+                [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None]
+            ),
+        ),
+    ],
+)
+def test_duplicate_index(input_dict, input_index, expected_dict, expected_index):
+    # GH 28005: duplicate index labels must not multiply the exploded rows
+    df = pd.DataFrame(input_dict, index=input_index)
+    result = df.explode("col1")
+    expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object)
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py
@@ -111,3 +111,11 @@ def test_nested_EA():
         pd.date_range("20170101", periods=6, tz="UTC"), index=[0, 0, 0, 1, 1, 1]
     )
     tm.assert_series_equal(result, expected)
+
+
+def test_duplicate_index():
+    # GH 28005: Series.explode keeps duplicate index labels, one row per element
+    s = pd.Series([[1, 2], [3, 4]], index=[0, 0])
+    result = s.explode()
+    expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=object)
+    tm.assert_series_equal(result, expected)