Skip to content

Commit bd49629

Browse files
committed
[ENH] Add DataFrame method to explode a list-like column (GH #16538)
Sometimes a values column holds list-like values in a single row. Instead, we may want to split each individual value onto its own row, keeping the same mapping to the other key columns. While it is possible to do this by chaining together existing pandas operations (in fact, that is exactly what this implementation does), the sequence of operations is not obvious. By contrast, this is available as a built-in operation in, say, Spark, and is a fairly common use case.
1 parent 14c33b0 commit bd49629

File tree

5 files changed

+196
-0
lines changed

5 files changed

+196
-0
lines changed

asv_bench/benchmarks/reshape.py

+18
Original file line numberDiff line numberDiff line change
@@ -184,4 +184,22 @@ def time_qcut_datetime(self, bins):
184184
pd.qcut(self.datetime_series, bins)
185185

186186

187+
class Explode(object):
    """
    Benchmark ``DataFrame.explode`` on a comma-delimited string column.

    Parametrized over the number of rows in the frame and the upper bound
    on how many letters are joined into each delimited key string.
    """

    param_names = ['n_rows', 'max_list_length']
    params = [[100, 1000, 10000], [3, 5, 10]]

    def setup(self, n_rows, max_list_length):
        """
        Build ``self.frame`` with a delimited 'key' column and a random
        float 'value' column, each of length `n_rows`.
        """
        import string

        # The alphabet is loop-invariant: build it once rather than once
        # per generated letter.
        letters = list(string.ascii_letters)
        # randint's upper bound is exclusive, so every row holds between
        # 0 and max_list_length - 1 letters (0 letters gives an empty key).
        num_letters = np.random.randint(0, max_list_length, n_rows)
        # Draw all letters of a row in a single call.
        key_column = [','.join(np.random.choice(letters, size=k))
                      for k in num_letters]
        value_column = np.random.randn(n_rows)
        self.frame = pd.DataFrame({'key': key_column,
                                   'value': value_column})

    def time_explode(self, n_rows, max_list_length):
        """Time splitting 'key' on commas and exploding it onto rows."""
        self.frame.explode('key', sep=',')
203+
204+
187205
from .pandas_vb_common import setup # noqa: F401

doc/source/reshaping.rst

+31
Original file line numberDiff line numberDiff line change
@@ -801,3 +801,34 @@ Note to subdivide over multiple columns we can pass in a list to the
801801
802802
df.pivot_table(
803803
values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
804+
805+
.. _reshaping.explode:
806+
807+
Exploding a List-like Column
808+
----------------------------
809+
810+
Sometimes the value column is list-like:
811+
812+
.. ipython:: python
813+
814+
keys = ['panda1', 'panda2', 'panda3']
815+
values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
816+
df = pd.DataFrame({'keys': keys, 'values': values})
817+
df
818+
819+
But we actually want to put each value onto its own row.
820+
For this purpose we can use ``DataFrame.explode``:
821+
822+
.. ipython:: python
823+
824+
df.explode('values')
825+
826+
For convenience, we can use the optional keyword ``sep`` to automatically
827+
split a string column before exploding:
828+
829+
.. ipython:: python
830+
831+
values = ['eats,shoots', 'shoots,leaves', 'eats,shoots,leaves']
832+
df2 = pd.DataFrame({'keys': keys, 'values': values})
833+
df2
834+
df2.explode('values', sep=',')

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ New features
3131
- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`)
3232
- :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame.
3333
See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`)
34+
- :meth:`DataFrame.explode` splits list-like values onto individual rows. See the :ref:`section on exploding a list-like column <reshaping.explode>` in the docs for more information (:issue:`16538`)
3435

3536
.. _whatsnew_0240.values_api:
3637

pandas/core/frame.py

+51
Original file line numberDiff line numberDiff line change
@@ -5980,6 +5980,57 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
59805980
var_name=var_name, value_name=value_name,
59815981
col_level=col_level)
59825982

5983+
def explode(self, col_name, sep=None, dtype=None):
    """
    Create new DataFrame expanding a list-like column.

    Every element of a list-like value in `col_name` is placed on its
    own row. The values of the other columns and the index label of the
    originating row are repeated for each element. Rows whose value is
    missing or an empty list-like contribute no rows to the result.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    col_name : str
        Name of the column to be exploded.
    sep : str, default None
        Convenience to split a string `col_name` before exploding.
    dtype : str or dtype, default None
        Optionally coerce the dtype of exploded column.

    Returns
    -------
    exploded : DataFrame

    Raises
    ------
    KeyError
        If `col_name` is not a column of the frame.

    See Also
    --------
    Series.str.split : Split string values on specified separator.
    Series.str.extract : Extract groups from the first regex match.

    Examples
    --------
    >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]})
    >>> df.explode('k', sep=',')
       k  v
    0  a  0
    0  b  0
    1  c  1
    1  d  1
    """
    # Look the column up first so a bad name raises KeyError even when
    # the frame is empty.
    col = self[col_name]
    if len(self) == 0:
        return self.copy()
    if sep:
        col_expanded = col.str.split(sep, expand=True)
    else:
        col_expanded = col.apply(Series)
    # Key everything on row position instead of index label: joining on
    # labels would multiply rows whenever the index contains duplicates.
    col_expanded.index = range(len(self))
    # stack() moves each element onto its own row and discards the NaN
    # padding; dropping the inner level leaves the source row position.
    col_stacked = (col_expanded
                   .stack()
                   .reset_index(level=-1, drop=True)
                   .rename(col_name))
    # Compare against None: a dtype object may itself be falsy.
    if dtype is not None:
        col_stacked = col_stacked.astype(dtype)
    others = self.drop(col_name, axis=1)
    others.index = range(len(self))
    exploded = (col_stacked.to_frame()
                .join(others)
                .reindex(self.columns, axis=1))
    # Restore the original labels, repeated once per exploded element.
    exploded.index = self.index.take(col_stacked.index.values)
    return exploded
6033+
59836034
# ----------------------------------------------------------------------
59846035
# Time series-related
59856036

pandas/tests/frame/test_reshape.py

+95
Original file line numberDiff line numberDiff line change
@@ -918,6 +918,101 @@ def test_unstack_swaplevel_sortlevel(self, level):
918918
tm.assert_frame_equal(result, expected)
919919

920920

921+
class TestDataFrameExplode(object):
    # GH 16538
    # Column labels shared by the input frames built in these tests.
    columns = ['a', 'b', 'c']

    def test_sep(self):
        # A delimited string column is split on ``sep`` before exploding.
        frame = pd.DataFrame([['foo,bar', 'x', 42],
                              ['fizz,buzz', 'y', 43]],
                             columns=self.columns)
        expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                                 'b': ['x', 'x', 'y', 'y'],
                                 'c': [42, 42, 43, 43]},
                                index=[0, 0, 1, 1])
        result = frame.explode('a', sep=',')
        tm.assert_frame_equal(result, expected)

    def test_dtype(self):
        # The exploded column is coerced to the requested dtype.
        frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                              [[2, 3], 'y', 43]],
                             columns=self.columns)
        expected = pd.DataFrame({'a': np.array([0, 1, 4, 2, 3], dtype='int'),
                                 'b': ['x', 'x', 'x', 'y', 'y'],
                                 'c': [42, 42, 42, 43, 43]},
                                index=[0, 0, 0, 1, 1])
        result = frame.explode('a', dtype='int')
        tm.assert_frame_equal(result, expected)

    def test_na(self):
        # NaN's and empty lists are omitted
        # TODO: option to preserve explicit NAs instead
        frame = pd.DataFrame([[[], 'x', 42],
                              [[2.0, np.nan], 'y', 43]],
                             columns=self.columns)
        expected = pd.DataFrame({'a': [2.0],
                                 'b': ['y'],
                                 'c': [43]},
                                index=[1])
        result = frame.explode('a')
        tm.assert_frame_equal(result, expected)

    def test_nonuniform_type(self):
        # Scalars mixed in with lists are passed through as one element.
        frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                              [3, 'y', 43]],
                             columns=self.columns)
        expected = pd.DataFrame({'a': np.array([0, 1, 4, 3], dtype='int'),
                                 'b': ['x', 'x', 'x', 'y'],
                                 'c': [42, 42, 42, 43]},
                                index=[0, 0, 0, 1])
        result = frame.explode('a', dtype='int')
        tm.assert_frame_equal(result, expected)

    def test_all_scalars(self):
        # A column without any list-like values comes back unchanged.
        frame = pd.DataFrame([[0, 'x', 42],
                              [3, 'y', 43]],
                             columns=self.columns)
        expected = pd.DataFrame({'a': [0, 3],
                                 'b': ['x', 'y'],
                                 'c': [42, 43]},
                                index=[0, 1])
        result = frame.explode('a')
        tm.assert_frame_equal(result, expected)

    def test_empty(self):
        # Exploding an empty frame yields an equal empty frame.
        expected = pd.DataFrame(columns=['a', 'b'])
        result = pd.DataFrame(columns=['a', 'b']).explode('a')
        tm.assert_frame_equal(result, expected)

    def test_missing_column(self):
        # An unknown column name raises KeyError.
        frame = pd.DataFrame([[0, 'x', 42],
                              [3, 'y', 43]],
                             columns=self.columns)
        with pytest.raises(KeyError):
            frame.explode('badcolumnname')

    def test_multi_index(self):
        # MultiIndex labels are repeated for every exploded element.
        input_index = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')])
        frame = pd.DataFrame([['foo,bar', 'x', 42],
                              ['fizz,buzz', 'y', 43]],
                             columns=self.columns,
                             index=input_index)
        expected_index = pd.MultiIndex.from_tuples(
            [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')])
        expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                                 'b': ['x', 'x', 'y', 'y'],
                                 'c': [42, 42, 43, 43]},
                                index=expected_index)
        result = frame.explode('a', sep=',')
        tm.assert_frame_equal(result, expected)
1014+
1015+
9211016
def test_unstack_fill_frame_object():
9221017
# GH12815 Test unstacking with object.
9231018
data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')

0 commit comments

Comments
 (0)