Skip to content

Commit 372a9a0

Browse files
MarcoGorelli authored and TomAugspurger committed
BUG: Avoid duplicating entire exploded column (#28010)
Closes #28005
1 parent 25d71fe commit 372a9a0

File tree

4 files changed

+60
-6
lines changed

4 files changed

+60
-6
lines changed

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ Indexing
181181
^^^^^^^^
182182

183183
- Bug in assignment using a reverse slicer (:issue:`26939`)
184+
- Bug in :meth:`DataFrame.explode` that duplicated rows of the frame when the index contained duplicate values (:issue:`28010`)
184185
- Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`)
185186
- Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`)
186187

pandas/core/frame.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -6304,12 +6304,13 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame":
63046304
if not self.columns.is_unique:
63056305
raise ValueError("columns must be unique")
63066306

6307-
result = self[column].explode()
6308-
return (
6309-
self.drop([column], axis=1)
6310-
.join(result)
6311-
.reindex(columns=self.columns, copy=False)
6312-
)
6307+
df = self.reset_index(drop=True)
6308+
result = df[column].explode()
6309+
result = df.drop([column], axis=1).join(result)
6310+
result.index = self.index.take(result.index)
6311+
result = result.reindex(columns=self.columns, copy=False)
6312+
6313+
return result
63136314

63146315
def unstack(self, level=-1, fill_value=None):
63156316
"""

pandas/tests/frame/test_explode.py

+44
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,47 @@ def test_usecase():
118118
index=[0, 0, 1, 1],
119119
)
120120
tm.assert_frame_equal(result, expected)
121+
122+
123+
@pytest.mark.parametrize(
124+
"input_dict, input_index, expected_dict, expected_index",
125+
[
126+
(
127+
{"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
128+
[0, 0],
129+
{"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
130+
[0, 0, 0, 0],
131+
),
132+
(
133+
{"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
134+
pd.Index([0, 0], name="my_index"),
135+
{"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
136+
pd.Index([0, 0, 0, 0], name="my_index"),
137+
),
138+
(
139+
{"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
140+
pd.MultiIndex.from_arrays(
141+
[[0, 0], [1, 1]], names=["my_first_index", "my_second_index"]
142+
),
143+
{"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
144+
pd.MultiIndex.from_arrays(
145+
[[0, 0, 0, 0], [1, 1, 1, 1]],
146+
names=["my_first_index", "my_second_index"],
147+
),
148+
),
149+
(
150+
{"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
151+
pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]),
152+
{"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
153+
pd.MultiIndex.from_arrays(
154+
[[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None]
155+
),
156+
),
157+
],
158+
)
159+
def test_duplicate_index(input_dict, input_index, expected_dict, expected_index):
160+
# GH 28005
161+
df = pd.DataFrame(input_dict, index=input_index)
162+
result = df.explode("col1")
163+
expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object)
164+
tm.assert_frame_equal(result, expected)

pandas/tests/series/test_explode.py

+8
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,11 @@ def test_nested_EA():
111111
pd.date_range("20170101", periods=6, tz="UTC"), index=[0, 0, 0, 1, 1, 1]
112112
)
113113
tm.assert_series_equal(result, expected)
114+
115+
116+
def test_duplicate_index():
117+
# GH 28005
118+
s = pd.Series([[1, 2], [3, 4]], index=[0, 0])
119+
result = s.explode()
120+
expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=object)
121+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)