
Commit f2886c1

ENH: Add Series method to explode a list-like column (#27267)
* [ENH] Add DataFrame method to explode a list-like column (GH #16538)

  Sometimes a values column is presented with list-like values on one row. Instead we may want to split each individual value onto its own row, keeping the same mapping to the other key columns. While it's possible to chain together existing pandas operations to do this (in fact that's exactly what this implementation is), the sequence of operations is not obvious. By contrast this is available as a built-in operation in, say, Spark, and is a fairly common use case.

* move to Series
* handle generic list-like
* lint on asv
* move is_list_like to cython and share impl
* moar docs
* test larger sides to avoid a segfault
* fix ref
* typos
* benchmarks wrong
* add inversion
* add usecase
* cimport is_list_like
* use cimports
* doc-string
* docs & lint
* isort
* clean object check & update doc-strings
* lint
* test for nested
* better test
* try adding frame
* test for nested EA
* lint
* remove multi subset support
* update docs
* doc-string
* add test for MI
* lint and docs
* ordering
* moar lint
* multi-index column support
* 32-bit compat
* moar 32-bit compat
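To make the described operation concrete, here is a minimal sketch of what the new methods do. The column names and data are hypothetical, and the printed frames are written out by hand, assuming the 0.25 behavior documented in the changes below.

    import pandas as pd

    # a key column plus a column whose cells hold list-likes (hypothetical data)
    df = pd.DataFrame({"key": ["a", "b"], "values": [[1, 2], [3]]})

    # Series.explode: each element of a list-like gets its own row,
    # and the original index label is repeated for each element
    print(df["values"].explode())
    # 0    1
    # 0    2
    # 1    3
    # Name: values, dtype: object

    # DataFrame.explode keeps the other columns aligned with the exploded rows
    print(df.explode("values"))
    #   key values
    # 0   a      1
    # 0   a      2
    # 1   b      3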
1 parent a4c19e7 commit f2886c1

20 files changed: +581 -68 lines changed

Makefile (+2)

@@ -1,5 +1,7 @@
 .PHONY : develop build clean clean_pyc doc lint-diff black
 
+all: develop
+
 clean:
 	-python setup.py clean
 

asv_bench/benchmarks/io/parsers.py (+2 -2)

@@ -10,7 +10,7 @@
     pass
 
 
-class DoesStringLookLikeDatetime(object):
+class DoesStringLookLikeDatetime:
 
     params = (["2Q2005", "0.0", "10000"],)
     param_names = ["value"]
@@ -23,7 +23,7 @@ def time_check_datetimes(self, value):
         _does_string_look_like_datetime(obj)
 
 
-class ConcatDateCols(object):
+class ConcatDateCols:
 
     params = ([1234567890, "AAAA"], [1, 2])
     param_names = ["value", "dim"]

asv_bench/benchmarks/reshape.py (+13)

@@ -240,4 +240,17 @@ def time_qcut_datetime(self, bins):
         pd.qcut(self.datetime_series, bins)
 
 
+class Explode:
+    param_names = ["n_rows", "max_list_length"]
+    params = [[100, 1000, 10000], [3, 5, 10]]
+
+    def setup(self, n_rows, max_list_length):
+
+        data = [np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)]
+        self.series = pd.Series(data)
+
+    def time_explode(self, n_rows, max_list_length):
+        self.series.explode()
+
+
 from .pandas_vb_common import setup  # noqa: F401
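For a rough standalone sense of what this benchmark exercises, the sketch below builds the same kind of data outside of asv and times the call directly; the numbers it prints are machine-dependent and purely illustrative.

    import numpy as np
    import pandas as pd
    from timeit import timeit

    # mirror the benchmark's setup for one parameter combination
    n_rows, max_list_length = 1000, 5
    data = [np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)]
    series = pd.Series(data)

    # time the same call that time_explode above measures
    print(timeit(series.explode, number=100), "seconds for 100 calls")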

asv_bench/benchmarks/series_methods.py (+3 -3)

@@ -219,7 +219,7 @@ def time_series_datetimeindex_repr(self):
         getattr(self.s, "a", None)
 
 
-class All(object):
+class All:
 
     params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
     param_names = ["N", "case"]
@@ -232,7 +232,7 @@ def time_all(self, N, case):
         self.s.all()
 
 
-class Any(object):
+class Any:
 
     params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
     param_names = ["N", "case"]
@@ -245,7 +245,7 @@ def time_any(self, N, case):
         self.s.any()
 
 
-class NanOps(object):
+class NanOps:
 
     params = [
         [

asv_bench/benchmarks/timeseries.py (+1 -1)

@@ -293,7 +293,7 @@ def time_format_YYYYMMDD(self):
         to_datetime(self.stringsD, format="%Y%m%d")
 
 
-class ToDatetimeCacheSmallCount(object):
+class ToDatetimeCacheSmallCount:
 
     params = ([True, False], [50, 500, 5000, 100000])
     param_names = ["cache", "count"]

ci/code_checks.sh (+1 -1)

@@ -156,7 +156,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     MSG='Check for python2 new-style classes and for empty parentheses' ; echo $MSG
-    invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas scripts
+    invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas asv_bench/benchmarks scripts
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG
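As an aside, the pattern can be exercised directly with Python's re module to see what the check flags; invgrep is an "inverted grep" helper used by these checks that fails when the pattern is found, so this is only an approximation of the CI behavior.

    import re

    # the same regex the check uses for python2-style class definitions
    pattern = re.compile(r"class\s\S*\((object)?\):")

    print(bool(pattern.search("class ConcatDateCols(object):")))  # True  -> flagged
    print(bool(pattern.search("class ConcatDateCols():")))        # True  -> flagged (empty parens)
    print(bool(pattern.search("class ConcatDateCols:")))          # False -> allowed
    print(bool(pattern.search("class Child(Base):")))             # False -> allowed (real base class)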

doc/source/reference/frame.rst (+1)

@@ -239,6 +239,7 @@ Reshaping, sorting, transposing
    DataFrame.unstack
    DataFrame.swapaxes
    DataFrame.melt
+   DataFrame.explode
    DataFrame.squeeze
    DataFrame.to_xarray
    DataFrame.T

doc/source/reference/series.rst (+1 -1)

@@ -245,6 +245,7 @@ Reshaping, sorting
    Series.sort_index
    Series.swaplevel
    Series.unstack
+   Series.explode
    Series.searchsorted
    Series.ravel
    Series.repeat
@@ -590,4 +591,3 @@ Sparse
 
    SparseSeries.to_coo
    SparseSeries.from_coo
-

doc/source/user_guide/reshaping.rst (+50)

@@ -801,3 +801,53 @@ Note to subdivide over multiple columns we can pass in a list to the
 
     df.pivot_table(
         values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
+
+.. _reshaping.explode:
+
+Exploding a list-like column
+----------------------------
+
+.. versionadded:: 0.25.0
+
+Sometimes the values in a column are list-like.
+
+.. ipython:: python
+
+    keys = ['panda1', 'panda2', 'panda3']
+    values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
+    df = pd.DataFrame({'keys': keys, 'values': values})
+    df
+
+We can 'explode' the ``values`` column, transforming each list-like to a separate row, by using :meth:`~Series.explode`. This will replicate the index values from the original row:
+
+.. ipython:: python
+
+    df['values'].explode()
+
+You can also explode the column in the ``DataFrame``.
+
+.. ipython:: python
+
+    df.explode('values')
+
+:meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``.
+
+.. ipython:: python
+
+    s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']])
+    s
+    s.explode()
+
+Here is a typical use case: you have comma-separated strings in a column and want to expand them.
+
+.. ipython:: python
+
+    df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
+                       {'var1': 'd,e,f', 'var2': 2}])
+    df
+
+Creating a long-form ``DataFrame`` is now straightforward using ``explode`` and chained operations:
+
+.. ipython:: python
+
+    df.assign(var1=df.var1.str.split(',')).explode('var1')
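For reference, the long-form result that final chained example should produce looks like the following sketch; the output is written out by hand rather than taken from the rendered docs.

    import pandas as pd

    df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
                       {'var1': 'd,e,f', 'var2': 2}])

    # split the comma-separated strings into lists, then explode to one row per item
    print(df.assign(var1=df.var1.str.split(',')).explode('var1'))
    #   var1  var2
    # 0    a     1
    # 0    b     1
    # 0    c     1
    # 1    d     2
    # 1    e     2
    # 1    f     2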

doc/source/whatsnew/v0.25.0.rst (+22)

@@ -182,6 +182,28 @@ The repr now looks like this:
 
     json_normalize(data, max_level=1)
 
 
+.. _whatsnew_0250.enhancements.explode:
+
+Series.explode to split list-like values to rows
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`Series` and :class:`DataFrame` have gained the :meth:`Series.explode` and :meth:`DataFrame.explode` methods to transform list-likes into individual rows. See the :ref:`section on exploding a list-like column <reshaping.explode>` in the docs for more information (:issue:`16538`, :issue:`10511`).
+
+
+Here is a typical use case: you have comma-separated strings in a column.
+
+.. ipython:: python
+
+    df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
+                       {'var1': 'd,e,f', 'var2': 2}])
+    df
+
+Creating a long-form ``DataFrame`` is now straightforward using chained operations:
+
+.. ipython:: python
+
+    df.assign(var1=df.var1.str.split(',')).explode('var1')
+
 .. _whatsnew_0250.enhancements.other:
 
 Other enhancements

pandas/_libs/lib.pxd (+1)

@@ -0,0 +1 @@
+cdef bint c_is_list_like(object, bint)

pandas/_libs/lib.pyx (+55)

@@ -1,3 +1,4 @@
+from collections import abc
 from decimal import Decimal
 from fractions import Fraction
 from numbers import Number
@@ -886,6 +887,60 @@ def is_period(val: object) -> bool:
     return util.is_period_object(val)
 
 
+def is_list_like(obj: object, allow_sets: bool = True):
+    """
+    Check if the object is list-like.
+
+    Objects that are considered list-like are for example Python
+    lists, tuples, sets, NumPy arrays, and Pandas Series.
+
+    Strings and datetime objects, however, are not considered list-like.
+
+    Parameters
+    ----------
+    obj : The object to check
+    allow_sets : boolean, default True
+        If this parameter is False, sets will not be considered list-like
+
+        .. versionadded:: 0.24.0
+
+    Returns
+    -------
+    is_list_like : bool
+        Whether `obj` has list-like properties.
+
+    Examples
+    --------
+    >>> is_list_like([1, 2, 3])
+    True
+    >>> is_list_like({1, 2, 3})
+    True
+    >>> is_list_like(datetime(2017, 1, 1))
+    False
+    >>> is_list_like("foo")
+    False
+    >>> is_list_like(1)
+    False
+    >>> is_list_like(np.array([2]))
+    True
+    >>> is_list_like(np.array(2))
+    False
+    """
+    return c_is_list_like(obj, allow_sets)
+
+
+cdef inline bint c_is_list_like(object obj, bint allow_sets):
+    return (
+        isinstance(obj, abc.Iterable)
+        # we do not count strings/unicode/bytes as list-like
+        and not isinstance(obj, (str, bytes))
+        # exclude zero-dimensional numpy arrays, effectively scalars
+        and not (util.is_array(obj) and obj.ndim == 0)
+        # exclude sets if allow_sets is False
+        and not (allow_sets is False and isinstance(obj, abc.Set))
+    )
+
+
 _TYPE_MAP = {
     'categorical': 'categorical',
     'category': 'categorical',
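The same check is what backs the public helper exposed as pandas.api.types.is_list_like; a short sketch exercising the behavior documented in the docstring above:

    import numpy as np
    from pandas.api.types import is_list_like

    print(is_list_like([1, 2, 3]))                 # True
    print(is_list_like("foo"))                     # False: strings are not list-like
    print(is_list_like(np.array(2)))               # False: zero-dimensional array acts as a scalar
    print(is_list_like({1, 2}, allow_sets=False))  # False: sets excluded when allow_sets=False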

pandas/_libs/reshape.pyx (+61 -2)

@@ -2,8 +2,11 @@ import cython
 from cython import Py_ssize_t
 
 from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
-                    uint32_t, uint64_t, float32_t, float64_t)
-
+                    uint32_t, uint64_t, float32_t, float64_t, ndarray)
+cimport numpy as cnp
+import numpy as np
+from pandas._libs.lib cimport c_is_list_like
+cnp.import_array()
 
 ctypedef fused reshape_t:
     uint8_t
@@ -91,3 +94,59 @@ unstack_int64 = unstack["int64_t"]
 unstack_float32 = unstack["float32_t"]
 unstack_float64 = unstack["float64_t"]
 unstack_object = unstack["object"]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def explode(ndarray[object] values):
+    """
+    transform array list-likes to long form
+    preserve non-list entries
+
+    Parameters
+    ----------
+    values : object ndarray
+
+    Returns
+    -------
+    tuple(values, counts)
+    """
+    cdef:
+        Py_ssize_t i, j, count, n
+        object v
+        ndarray[object] result
+        ndarray[int64_t] counts
+
+    # find the resulting len
+    n = len(values)
+    counts = np.zeros(n, dtype='int64')
+    for i in range(n):
+        v = values[i]
+        if c_is_list_like(v, False):
+            if len(v):
+                counts[i] += len(v)
+            else:
+                # empty list-like, use a nan marker
+                counts[i] += 1
+        else:
+            counts[i] += 1
+
+    result = np.empty(counts.sum(), dtype='object')
+    count = 0
+    for i in range(n):
+        v = values[i]
+
+        if c_is_list_like(v, False):
+            if len(v):
+                for j in range(len(v)):
+                    result[count] = v[j]
+                    count += 1
+            else:
+                # empty list-like, use a nan marker
+                result[count] = np.nan
+                count += 1
+        else:
+            # replace with the existing scalar
+            result[count] = v
+            count += 1
+    return result, counts
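For readers who do not follow Cython, here is a pure-Python sketch of the same two-pass algorithm (count output sizes first, then fill). It is illustrative only and substitutes the public is_list_like for the cimported c_is_list_like.

    import numpy as np
    from pandas.api.types import is_list_like

    def explode_py(values):
        """Mimic explode: list-likes spread over rows, empty list-likes become NaN."""
        # pass 1: how many output rows does each input row produce?
        counts = [len(v) if is_list_like(v, allow_sets=False) and len(v) else 1
                  for v in values]

        # pass 2: fill a flat object array with the individual elements
        result = np.empty(sum(counts), dtype=object)
        count = 0
        for v in values:
            if is_list_like(v, allow_sets=False):
                if len(v):
                    for item in v:
                        result[count] = item
                        count += 1
                else:
                    result[count] = np.nan   # empty list-like -> NaN marker
                    count += 1
            else:
                result[count] = v            # scalars pass through unchanged
                count += 1
        return result, np.asarray(counts, dtype="int64")

    values = np.array([[1, 2], [], "foo", [3]], dtype=object)
    print(explode_py(values))
    # result: [1, 2, nan, 'foo', 3], counts: [2, 1, 1, 1]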
