Skip to content

Commit 9e76b75

Browse files
committed
[ENH] Add DataFrame method to explode a list-like column (GH pandas-dev#16538)
Sometimes a values column is presented with list-like values on one row. Instead we may want to split each individual value onto its own row, keeping the same mapping to the other key columns. While it's possible to do this by chaining together existing pandas operations (in fact, that's exactly what this implementation does), the sequence of operations is not obvious. By contrast, this is available as a built-in operation in, say, Spark, and is a fairly common use case.
1 parent 14c33b0 commit 9e76b75

File tree

5 files changed

+183
-0
lines changed

5 files changed

+183
-0
lines changed

asv_bench/benchmarks/reshape.py

+18
Original file line numberDiff line numberDiff line change
@@ -184,4 +184,22 @@ def time_qcut_datetime(self, bins):
184184
pd.qcut(self.datetime_series, bins)
185185

186186

187+
class Explode(object):
    """
    Benchmark ``DataFrame.explode`` on a comma-separated string column.

    The benchmark is parametrised over the number of rows in the frame
    (``n_rows``) and the largest number of comma-separated items a single
    key may hold (``max_list_length``).
    """

    param_names = ['n_rows', 'max_list_length']
    params = [[100, 1000, 10000], [3, 5, 10]]

    def setup(self, n_rows, max_list_length):
        """Build a frame whose 'key' column holds 0..max_list_length letters."""
        import string

        # Build the alphabet once instead of once per generated letter.
        letters = list(string.ascii_letters)
        # ``randint`` excludes its upper bound, so add one to make
        # ``max_list_length`` itself reachable.
        num_letters = np.random.randint(0, max_list_length + 1, n_rows)
        # A count of zero yields an empty string for that row.
        key_column = [','.join(np.random.choice(letters, size=k))
                      for k in num_letters]
        value_column = np.random.randn(n_rows)
        self.frame = pd.DataFrame({'key': key_column,
                                   'value': value_column})

    def time_explode(self, n_rows, max_list_length):
        """Time splitting 'key' on commas and exploding it onto rows."""
        self.frame.explode('key', sep=',')
204+
187205
from .pandas_vb_common import setup # noqa: F401

doc/source/reshaping.rst

+37
Original file line numberDiff line numberDiff line change
@@ -801,3 +801,40 @@ Note to subdivide over multiple columns we can pass in a list to the
801801
802802
df.pivot_table(
803803
values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
804+
805+
.. _reshaping.explode:
806+
807+
Exploding a List-like Column
808+
----------------------------
809+
810+
.. ipython:: python
811+
:suppress:
812+
813+
keys = ['panda1', 'panda2', 'panda3']
814+
values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
815+
df = pd.DataFrame({'keys': keys, 'values': values})
816+
exploded = df.explode('values')
817+
values = ['eats,shoots', 'shoots,leaves', 'eats,shoots,leaves']
818+
df2 = pd.DataFrame({'keys': keys, 'values': values})
819+
820+
Sometimes the values in a column are list-like:
821+
822+
.. ipython:: python
823+
824+
df
825+
826+
But we actually want to put each value onto its own row.
827+
For this purpose we can use ``DataFrame.explode``:
828+
829+
.. ipython:: python
830+
831+
df.explode('values')
832+
833+
For convenience, we can use the optional keyword ``sep`` to automatically
834+
split a string column before exploding:
835+
836+
.. ipython:: python
837+
838+
df2
839+
840+
df2.explode('values', sep=',')

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ New features
3131
- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`)
3232
- :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame.
3333
See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`)
34+
- :func:`DataFrame.explode` has been added to split list-like values onto individual rows. See the :ref:`section on exploding a list-like column <reshaping.explode>` in the docs for more information. (:issue:`16538`)
3435

3536
.. _whatsnew_0240.values_api:
3637

pandas/core/frame.py

+43
Original file line numberDiff line numberDiff line change
@@ -5980,6 +5980,49 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
59805980
var_name=var_name, value_name=value_name,
59815981
col_level=col_level)
59825982

5983+
def explode(self, col_name, sep=None, dtype=None):
    """
    Create a new DataFrame where each element in each row
    of a list-like column `col_name` is expanded to its own row.

    Every output row keeps the index label of the row it was expanded
    from, so the resulting index generally contains repeated labels.
    Missing values and empty lists contribute no rows to the result.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    col_name : str
        Name of the column to be exploded
    sep : str, default None
        Convenience to split a string `col_name` before exploding
    dtype : str or dtype, default None
        Optionally coerce the dtype of exploded column

    Returns
    -------
    DataFrame
        A new frame with the same columns, in the same order, as the
        caller. An empty caller is returned as a copy.

    Raises
    ------
    KeyError
        If `col_name` is not a column of the frame.

    Examples
    --------
    >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]})
    >>> df.explode('k', sep=',')
       k  v
    0  a  0
    0  b  0
    1  c  1
    1  d  1
    """
    # Look the column up first so an unknown name raises KeyError even
    # for an empty frame.
    col = self[col_name]
    if len(self) == 0:
        return self.copy()

    # Work on row positions rather than index labels: joining on labels
    # would produce a cartesian product whenever ``self.index`` contains
    # duplicates. The original labels are restored at the end.
    col = col.reset_index(drop=True)
    if sep:
        col_expanded = col.str.split(sep, expand=True)
    else:
        col_expanded = col.apply(Series)

    # Ragged rows are padded with missing values by the expansion above;
    # drop them explicitly instead of relying on ``stack`` doing so.
    col_stacked = (col_expanded
                   .stack()
                   .dropna()
                   .reset_index(level=-1, drop=True)
                   .rename(col_name))
    if dtype:
        col_stacked = col_stacked.astype(dtype)

    # The right-hand side has a unique positional index, so the left join
    # yields exactly one output row per exploded element.
    others = self.drop(col_name, axis=1).reset_index(drop=True)
    result = (col_stacked.to_frame()
              .join(others)
              .reindex(self.columns, axis=1))
    # Map row positions back to the caller's index labels.
    result.index = self.index.take(result.index.values)
    return result
6025+
59836026
# ----------------------------------------------------------------------
59846027
# Time series-related
59856028

pandas/tests/frame/test_reshape.py

+84
Original file line numberDiff line numberDiff line change
@@ -918,6 +918,90 @@ def test_unstack_swaplevel_sortlevel(self, level):
918918
tm.assert_frame_equal(result, expected)
919919

920920

921+
def test_explode():
    """Exercise ``DataFrame.explode`` across its supported inputs (GH 16538)."""
    cols = ['a', 'b', 'c']

    # A string column is split on ``sep`` before being exploded
    frame = pd.DataFrame([['foo,bar', 'x', 42],
                          ['fizz,buzz', 'y', 43]],
                         columns=cols)
    expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                             'b': ['x', 'x', 'y', 'y'],
                             'c': [42, 42, 43, 43]},
                            index=[0, 0, 1, 1])
    result = frame.explode('a', sep=',')
    tm.assert_frame_equal(result, expected)

    # The exploded column can be coerced to a requested dtype
    frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                          [[2, 3], 'y', 43]],
                         columns=cols)
    expected = pd.DataFrame({'a': np.array([0, 1, 4, 2, 3], dtype='int'),
                             'b': ['x', 'x', 'x', 'y', 'y'],
                             'c': [42, 42, 42, 43, 43]},
                            index=[0, 0, 0, 1, 1])
    result = frame.explode('a', dtype='int')
    tm.assert_frame_equal(result, expected)

    # NaN entries and empty lists produce no output rows
    # TODO: option to preserve explicit NAs instead
    frame = pd.DataFrame([[[], 'x', 42],
                          [[2.0, np.nan], 'y', 43]],
                         columns=cols)
    expected = pd.DataFrame({'a': [2.0],
                             'b': ['y'],
                             'c': [43]},
                            index=[1])
    result = frame.explode('a')
    tm.assert_frame_equal(result, expected)

    # A mix of list and scalar entries
    frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                          [3, 'y', 43]],
                         columns=cols)
    expected = pd.DataFrame({'a': np.array([0, 1, 4, 3], dtype='int'),
                             'b': ['x', 'x', 'x', 'y'],
                             'c': [42, 42, 42, 43]},
                            index=[0, 0, 0, 1])
    result = frame.explode('a', dtype='int')
    tm.assert_frame_equal(result, expected)

    # Only scalar entries: the frame comes back unchanged
    frame = pd.DataFrame([[0, 'x', 42],
                          [3, 'y', 43]],
                         columns=cols)
    expected = pd.DataFrame({'a': [0, 3],
                             'b': ['x', 'y'],
                             'c': [42, 43]},
                            index=[0, 1])
    result = frame.explode('a')
    tm.assert_frame_equal(result, expected)

    # A frame without rows
    expected_empty = pd.DataFrame(columns=['a', 'b'])
    result_empty = pd.DataFrame(columns=['a', 'b']).explode('a')
    tm.assert_frame_equal(result_empty, expected_empty)

    # An unknown column name raises (checked on the all-scalar frame above)
    with pytest.raises(KeyError):
        frame.explode('badcolumnname')

    # MultiIndex row labels are repeated for every exploded element
    cols = ['a', 'b', 'c']
    source_index = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')])
    frame = pd.DataFrame([['foo,bar', 'x', 42],
                          ['fizz,buzz', 'y', 43]],
                         columns=cols,
                         index=source_index)
    result = frame.explode('a', sep=',')
    expected_index = pd.MultiIndex.from_tuples(
        [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')])
    expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                             'b': ['x', 'x', 'y', 'y'],
                             'c': [42, 42, 43, 43]},
                            index=expected_index)
    tm.assert_frame_equal(result, expected)
1004+
9211005
def test_unstack_fill_frame_object():
9221006
# GH12815 Test unstacking with object.
9231007
data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')

0 commit comments

Comments
 (0)