Skip to content

Commit 71516e5

Browse files
committed
[ENH] Add DataFrame method to explode a list-like column (GH pandas-dev#16538)
Sometimes a values column holds list-like values within a single row. Instead, we may want to split each individual value onto its own row, keeping the same mapping to the other key columns. While it is possible to do this by chaining together existing pandas operations (in fact, that is exactly what this implementation does), the sequence of operations is not obvious. By contrast, this is available as a built-in operation in, say, Spark, and it is a fairly common use case.
1 parent 14c33b0 commit 71516e5

File tree

4 files changed

+186
-0
lines changed

4 files changed

+186
-0
lines changed

asv_bench/benchmarks/reshape.py

+19
Original file line numberDiff line numberDiff line change
@@ -184,4 +184,23 @@ def time_qcut_datetime(self, bins):
184184
pd.qcut(self.datetime_series, bins)
185185

186186

187+
class Explode(object):
    """Benchmark ``DataFrame.explode`` on a comma-separated string column."""

    param_names = ['n_rows', 'max_list_length']
    params = [[100, 1000, 10000], [3, 5, 10]]

    def setup(self, n_rows, max_list_length):
        """Build ``self.frame`` with a comma-joined 'key' column.

        Each 'key' cell is a string of random ASCII letters joined by
        commas; the 'value' column holds standard-normal floats.
        """
        import string

        alphabet = list(string.ascii_letters)
        # randint's upper bound is exclusive, so a row receives between
        # 0 and ``max_list_length - 1`` letters.
        letters_per_row = np.random.randint(0, max_list_length, n_rows)
        keys = []
        for count in letters_per_row:
            picks = [np.random.choice(alphabet) for _ in range(count)]
            keys.append(','.join(picks))
        values = np.random.randn(n_rows)
        self.frame = pd.DataFrame({'key': keys,
                                   'value': values})

    def time_explode(self, n_rows, max_list_length):
        """Time splitting the 'key' column on commas and exploding it."""
        self.frame.explode('key', sep=',')
203+
204+
205+
187206
from .pandas_vb_common import setup # noqa: F401

doc/source/reshaping.rst

+42
Original file line numberDiff line numberDiff line change
@@ -801,3 +801,45 @@ Note to subdivide over multiple columns we can pass in a list to the
801801
802802
df.pivot_table(
803803
values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
804+
805+
Exploding a List-like Column
806+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
807+
808+
.. ipython:: python
809+
:suppress:
810+
811+
import pandas as pd
812+
df = pd.DataFrame({'keys': ['panda1', 'panda2', 'panda3'],
813+
'values': [['eats','shoots'],
814+
['shoots','leaves'],
815+
['eats','shoots','leaves']]})
816+
exploded = df.explode('values')
817+
df2 = pd.DataFrame({'keys': ['panda1', 'panda2', 'panda3'],
818+
'values': ['eats,shoots',
819+
'shoots,leaves',
820+
'eats,shoots,leaves']})
821+
822+
Sometimes the value column is list-like:
823+
824+
.. ipython:: python
825+
826+
df
827+
828+
But we actually want to put each value onto its own row:
829+
830+
.. ipython:: python
831+
832+
exploded
833+
834+
For this we can use ``DataFrame.explode``:
835+
836+
.. ipython:: python

   df.explode('values')
837+
838+
For convenience, we can use the optional keyword ``sep`` to automatically
839+
split a string values column before exploding:
840+
841+
.. ipython:: python
842+
843+
df2
844+
845+
df2.explode('values', sep=',')

pandas/core/frame.py

+42
Original file line numberDiff line numberDiff line change
@@ -5980,6 +5980,48 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
59805980
var_name=var_name, value_name=value_name,
59815981
col_level=col_level)
59825982

5983+
def explode(self, col_name, sep=None, dtype=None):
    """
    Create a new DataFrame where each element in each row
    of a list-like column `col_name` is expanded to its own row.

    Missing values and empty lists in `col_name` produce no output rows.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    col_name : str
        Name of the column to be exploded.
    sep : str, default None
        Convenience to split a string `col_name` on this separator
        before exploding.
    dtype : str or dtype, default None
        Optionally coerce the dtype of the exploded column.

    Returns
    -------
    DataFrame
        `col_name` comes first, followed by the remaining columns. Each
        output row carries the index label of the row it was expanded
        from. An empty frame is returned as a copy.

    Raises
    ------
    KeyError
        If `col_name` is not a column of the frame.

    Examples
    --------
    >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]})
    >>> df.explode('k', sep=',')
       k  v
    0  a  0
    0  b  0
    1  c  1
    1  d  1
    """
    col = self[col_name]
    if len(self) == 0:
        return self.copy()

    # Work on row positions instead of index labels. Joining the exploded
    # values back on labels would pair every exploded row with every
    # source row sharing that label whenever the index has duplicates.
    col = col.reset_index(drop=True)
    if sep:
        col_expanded = col.str.split(sep, expand=True)
    else:
        col_expanded = col.apply(Series)
    # Rows shorter than the widest one are padded with missing values by
    # the expansion above; dropping them explicitly keeps the result
    # independent of the default NaN handling of ``stack``.
    col_stacked = (col_expanded
                   .stack()
                   .dropna()
                   .reset_index(level=-1, drop=True)
                   .rename(col_name))
    # Compare against None rather than relying on truthiness: a dtype
    # object is not guaranteed to be truthy (NumPy dtypes define __len__,
    # which is 0 for non-structured dtypes).
    if dtype is not None:
        col_stacked = col_stacked.astype(dtype)

    others = self.drop(col_name, axis=1).reset_index(drop=True)
    result = col_stacked.to_frame().join(others)
    # Swap the positional index for the original labels (this also
    # restores a MultiIndex).
    result.index = self.index.take(result.index)
    return result
6024+
59836025
# ----------------------------------------------------------------------
59846026
# Time series-related
59856027

pandas/tests/frame/test_reshape.py

+83
Original file line numberDiff line numberDiff line change
@@ -917,6 +917,89 @@ def test_unstack_swaplevel_sortlevel(self, level):
917917
result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
918918
tm.assert_frame_equal(result, expected)
919919

920+
def test_explode():
    """Exercise ``DataFrame.explode`` across its supported inputs (GH 16538)."""
    columns = ['a', 'b', 'c']

    # A string column is split on ``sep`` before being exploded.
    frame = pd.DataFrame([['foo,bar', 'x', 42],
                          ['fizz,buzz', 'y', 43]],
                         columns=columns)
    result = frame.explode('a', sep=',')
    expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                             'b': ['x', 'x', 'y', 'y'],
                             'c': [42, 42, 43, 43]},
                            index=[0, 0, 1, 1])
    tm.assert_frame_equal(result, expected)

    # The exploded column can be coerced to a requested dtype.
    frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                          [[2, 3], 'y', 43]],
                         columns=columns)
    result = frame.explode('a', dtype='int')
    expected = pd.DataFrame({'a': [0, 1, 4, 2, 3],
                             'b': ['x', 'x', 'x', 'y', 'y'],
                             'c': [42, 42, 42, 43, 43]},
                            index=[0, 0, 0, 1, 1])
    tm.assert_frame_equal(result, expected)

    # NaN's and empty lists are omitted
    # TODO: option to preserve explicit NAs instead
    frame = pd.DataFrame([[[], 'x', 42],
                          [[2.0, np.nan], 'y', 43]],
                         columns=columns)
    result = frame.explode('a')
    expected = pd.DataFrame({'a': [2.0],
                             'b': ['y'],
                             'c': [43]},
                            index=[1])
    tm.assert_frame_equal(result, expected)

    # A mix of list and scalar cells.
    frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                          [3, 'y', 43]],
                         columns=columns)
    result = frame.explode('a', dtype='int')
    expected = pd.DataFrame({'a': [0, 1, 4, 3],
                             'b': ['x', 'x', 'x', 'y'],
                             'c': [42, 42, 42, 43]},
                            index=[0, 0, 0, 1])
    tm.assert_frame_equal(result, expected)

    # Only scalar cells: the frame comes back unchanged.
    frame = pd.DataFrame([[0, 'x', 42],
                          [3, 'y', 43]],
                         columns=columns)
    result = frame.explode('a')
    expected = pd.DataFrame({'a': [0, 3],
                             'b': ['x', 'y'],
                             'c': [42, 43]},
                            index=[0, 1])
    tm.assert_frame_equal(result, expected)

    # Empty frame
    result = pd.DataFrame(columns=['a', 'b']).explode('a')
    expected = pd.DataFrame(columns=['a', 'b'])
    tm.assert_frame_equal(result, expected)

    # An unknown column name raises (checked on the scalar-only frame).
    with pytest.raises(KeyError):
        frame.explode('badcolumnname')

    # A MultiIndex on the rows is carried through to the exploded rows.
    columns = ['a', 'b', 'c']
    row_index = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')])
    frame = pd.DataFrame([['foo,bar', 'x', 42],
                          ['fizz,buzz', 'y', 43]],
                         columns=columns,
                         index=row_index)
    result = frame.explode('a', sep=',')
    expected_index = pd.MultiIndex.from_tuples(
        [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')])
    expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                             'b': ['x', 'x', 'y', 'y'],
                             'c': [42, 42, 43, 43]},
                            index=expected_index)
    tm.assert_frame_equal(result, expected)
1002+
9201003

9211004
def test_unstack_fill_frame_object():
9221005
# GH12815 Test unstacking with object.

0 commit comments

Comments
 (0)