Skip to content

Commit fc6a7ac

Browse files
committed
Merge branch 'exploding-frames' of https://github.com/MarcoGorelli/pandas into exploding-frames
2 parents 498f300 + 2af737a commit fc6a7ac

File tree

4 files changed

+98
-7
lines changed

4 files changed

+98
-7
lines changed

doc/source/whatsnew/v1.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ Indexing
143143
^^^^^^^^
144144

145145
- Bug in assignment using a reverse slicer (:issue:`26939`)
146-
-
146+
- Bug in :meth:`DataFrame.explode` where the frame's rows were duplicated when the index contained duplicate values (:issue:`28010`)
147147

148148
Missing
149149
^^^^^^^

pandas/core/frame.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -6259,12 +6259,14 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame":
62596259
if not self.columns.is_unique:
62606260
raise ValueError("columns must be unique")
62616261

6262-
result = self[column].explode()
6263-
return (
6264-
self.drop([column], axis=1)
6265-
.join(result)
6266-
.reindex(columns=self.columns, copy=False)
6267-
)
6262+
result = self.copy()
6263+
exploded_col = result.pop(column).reset_index(drop=True).explode()
6264+
result = result.reset_index().join(exploded_col)
6265+
result.set_index(result.columns[: self.index.nlevels].tolist(), inplace=True)
6266+
result.index.names = self.index.names
6267+
result = result.reindex(columns=self.columns, copy=False)
6268+
6269+
return result
62686270

62696271
def unstack(self, level=-1, fill_value=None):
62706272
"""

pandas/tests/frame/test_explode.py

+81
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,84 @@ def test_usecase():
118118
index=[0, 0, 1, 1],
119119
)
120120
tm.assert_frame_equal(result, expected)
121+
122+
123+
@pytest.mark.parametrize(
    "df, expected",
    [
        # Flat duplicated index, exploded column only.
        (
            pd.DataFrame({"col": [[1, 2], [3, 4]]}, index=[0, 0]),
            pd.DataFrame({"col": [1, 2, 3, 4]}, index=[0, 0, 0, 0], dtype=object),
        ),
        # Flat duplicated index with an additional, non-exploded column.
        (
            pd.DataFrame(
                {"col": [[1, 2], [3, 4]], "other_col": ["a", "b"]}, index=[0, 0]
            ),
            pd.DataFrame(
                {"col": [1, 2, 3, 4], "other_col": ["a", "a", "b", "b"]},
                index=[0, 0, 0, 0],
                dtype=object,
            ),
        ),
        # Named single-level index built via set_index.
        (
            pd.DataFrame(
                {"col": [[1, 2], [3, 4]], "other_col": ["a", "b"], "my_index": [0, 0]}
            ).set_index("my_index"),
            pd.DataFrame(
                {
                    "col": [1, 2, 3, 4],
                    "other_col": ["a", "a", "b", "b"],
                    "my_index": [0, 0, 0, 0],
                },
                dtype=object,
            ).set_index("my_index"),
        ),
        # Named two-level index built via set_index.
        (
            pd.DataFrame(
                {
                    "col": [[1, 2], [3, 4]],
                    "other_col": ["a", "b"],
                    "my_first_index": [0, 0],
                    "my_second_index": [1, 1],
                }
            ).set_index(["my_first_index", "my_second_index"]),
            pd.DataFrame(
                {
                    "col": [1, 2, 3, 4],
                    "other_col": ["a", "a", "b", "b"],
                    "my_first_index": [0, 0, 0, 0],
                    "my_second_index": [1, 1, 1, 1],
                },
                dtype=object,
            ).set_index(["my_first_index", "my_second_index"]),
        ),
        # MultiIndex whose levels carry no names.
        (
            pd.DataFrame(
                {"col": [[1, 2], [3, 4]], "other_col": ["a", "b"]},
                pd.MultiIndex.from_tuples([(0, 1), (0, 1)]),
            ),
            pd.DataFrame(
                {"col": [1, 2, 3, 4], "other_col": ["a", "a", "b", "b"]},
                pd.MultiIndex.from_tuples([(0, 1), (0, 1), (0, 1), (0, 1)]),
                dtype=object,
            ),
        ),
        # MultiIndex with one named and one unnamed level.
        (
            pd.DataFrame(
                {"col": [[1, 2], [3, 4]], "other_col": ["a", "b"]},
                pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["foo", None]),
            ),
            pd.DataFrame(
                {"col": [1, 2, 3, 4], "other_col": ["a", "a", "b", "b"]},
                pd.MultiIndex.from_arrays(
                    [[0, 0, 0, 0], [1, 1, 1, 1]], names=["foo", None]
                ),
                dtype=object,
            ),
        ),
    ],
    ids=[
        "flat_index",
        "flat_index_extra_column",
        "named_index",
        "named_multiindex",
        "unnamed_multiindex",
        "partially_named_multiindex",
    ],
)
def test_duplicate_index(df, expected):
    """Check ``DataFrame.explode`` on frames whose index has duplicate labels.

    Each case explodes the list-valued column ``"col"`` of a two-row frame
    whose rows share the same index label(s). The result must have exactly
    one row per list element (four rows), with the original index labels and
    index names repeated for every element.

    Parameters
    ----------
    df : DataFrame
        Input frame with a duplicated index and a list-valued ``"col"``.
    expected : DataFrame
        The frame ``df.explode("col")`` is required to equal.
    """
    # GH 28005
    result = df.explode("col")
    tm.assert_frame_equal(result, expected)

pandas/tests/series/test_explode.py

+8
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,11 @@ def test_nested_EA():
111111
pd.date_range("20170101", periods=6, tz="UTC"), index=[0, 0, 0, 1, 1, 1]
112112
)
113113
tm.assert_series_equal(result, expected)
114+
115+
116+
def test_duplicate_index():
    """Check ``Series.explode`` on a series whose index has duplicate labels.

    Both list-valued entries share index label ``0``; the exploded result
    must contain one object-dtype entry per list element, each still
    labelled ``0``.
    """
    # GH 28005
    s = pd.Series([[1, 2], [3, 4]], index=[0, 0])
    result = s.explode()
    expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=object)
    tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)